# LightGBM PE Prediction Analysis - November 2025

This notebook trains a LightGBM model on October 2025 data and evaluates its PE ratio predictions against November 2025 data.

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import r2_score

# Plot styling: seaborn "whitegrid" style plus a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

## Load and Prepare Data

In [None]:
# Read the dataset from parquet and parse the 'month' column into datetimes
df = pd.read_parquet('data3.parquet')
df = df.assign(month=pd.to_datetime(df['month']))

# Quick sanity summary: frame shape and the span of months covered
first_month, last_month = df['month'].min(), df['month'].max()
print(f"Total data shape: {df.shape}")
print(f"Date range: {first_month} to {last_month}")

In [None]:
# Feature columns used by the model, grouped by how they are treated
categorical_features = ['sector', 'industry', 'size']
numeric_features = [
    # metrics without the `_5y_growth` suffix
    'roe',
    'roa',
    'grossmargin',
    'netmargin',
    'assetturnover',
    'equity_multiplier',
    'payoutratio',
    'gp_to_assets',
    # metrics carrying the `_5y_growth` suffix
    'revenue_5y_growth',
    'netinc_5y_growth',
    'eps_5y_growth',
    'ebitda_5y_growth',
    'assets_5y_growth',
    'equity_5y_growth',
    'debt_5y_growth',
    'cashneq_5y_growth',
    'ncfo_5y_growth',
    'fcf_5y_growth',
    'dps_5y_growth',
    'payoutratio_5y_growth',
    'assetturnover_5y_growth',
    'equity_multiplier_5y_growth',
    'gp_to_assets_5y_growth',
]
# Categorical columns come first, followed by the numeric ones
all_features = [*categorical_features, *numeric_features]

n_categorical, n_numeric = len(categorical_features), len(numeric_features)
print(f"Total features: {len(all_features)}")
print(f"Categorical: {n_categorical}, Numeric: {n_numeric}")

In [None]:
# Keep only rows whose PE is present and strictly positive,
# then drop any row with a missing value in one of the model features
pe_present = df['pe'].notna()
pe_positive = df['pe'] > 0
df = df.loc[pe_present & pe_positive].dropna(subset=all_features)

print(f"Data shape after cleaning: {df.shape}")

## Train on October 2025, Predict November 2025

In [None]:
# Split data: fit on one monthly snapshot, evaluate on the following one
train_month = pd.Timestamp('2025-10-01')
test_month = pd.Timestamp('2025-11-01')

# Rows are selected by exact timestamp equality on the 'month' column
train_data = df[df['month'] == train_month].copy()
test_data = df[df['month'] == test_month].copy()

print(f"Training data: {len(train_data)} observations")
print(f"Test data: {len(test_data)} observations")

# Fail fast with a clear message: the equality filter silently returns an empty
# frame when a month is absent from the data (or stored under a different day),
# which would otherwise only surface later as a fit/predict failure or NaN metrics.
for split_name, split_month, split_frame in (
    ('training', train_month, train_data),
    ('test', test_month, test_data),
):
    if split_frame.empty:
        raise ValueError(
            f"No {split_name} rows found for month {split_month.date()}; "
            f"check the values stored in df['month']"
        )

In [None]:
# Build model inputs and targets; the categorical columns are cast to pandas
# 'category' dtype in the same step that selects the feature columns
category_dtypes = dict.fromkeys(categorical_features, 'category')

X_train = train_data[all_features].astype(category_dtypes)
y_train = train_data['pe']

X_test = test_data[all_features].astype(category_dtypes)
y_test = test_data['pe']

In [None]:
# Hyperparameters handed to the LightGBM regressor
params = dict(
    objective='regression',
    metric='mape',
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_data_in_leaf=20,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0.1,
    lambda_l2=0.1,
    verbose=-1,
    n_jobs=-1,
    random_state=42,
)

# Fit a regressor with 500 boosting rounds on the training month
print("Training LightGBM model...")
model = lgb.LGBMRegressor(n_estimators=500, **params)
model.fit(
    X_train,
    y_train,
    categorical_feature=categorical_features,
)
print("Training completed!")

## Make Predictions and Calculate Errors

In [None]:
# Score the test month
y_pred = model.predict(X_test)

# Signed error of each prediction relative to the actual PE, in percent
pct_error = (y_pred - y_test) / y_test * 100
abs_error = pct_error.abs()

# Headline accuracy figures
print("\nPrediction Error Statistics:")
print(f"Mean Absolute Percentage Error: {abs_error.mean():.2f}%")
print(f"Median Absolute Percentage Error: {abs_error.median():.2f}%")
print(f"RMSE (percentage): {np.sqrt(pct_error.pow(2).mean()):.2f}%")

# Full descriptive summary of the signed errors
print(f"\nPercentage Error Distribution:")
print(pct_error.describe())

## Distribution of Percentage Errors

In [None]:
# Two-panel view of the signed percentage errors: histogram and box plot
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes
mean_error = pct_error.mean()
median_error = pct_error.median()

# Left panel: histogram with mean / median reference lines
hist_ax.hist(pct_error, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(mean_error, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_error:.2f}%')
hist_ax.axvline(median_error, color='green', linestyle='--', linewidth=2, label=f'Median: {median_error:.2f}%')
hist_ax.set_xlabel('Percentage Error (%)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Distribution of Prediction Errors\n(Predicted PE - Actual PE) / Actual PE', fontsize=14)
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Right panel: box plot of the same errors
box_ax.boxplot(pct_error, vert=True)
box_ax.set_ylabel('Percentage Error (%)', fontsize=12)
box_ax.set_title('Box Plot of Prediction Errors', fontsize=14)
box_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Tabulate selected percentiles of the signed error
print("\nPercentiles of Percentage Error:")
percentile_levels = [1, 5, 10, 25, 50, 75, 90, 95, 99]
for p, level_value in zip(percentile_levels, np.percentile(pct_error, percentile_levels)):
    print(f"{p:2d}th percentile: {level_value:7.2f}%")

## Feature Importance

In [None]:
# Feature importance measured by total split gain.
# `model.feature_importances_` uses the estimator's default importance type
# (split counts), which would not match the "Importance (Gain)" axis label below,
# so gain is requested explicitly from the underlying booster.
fitted_booster = model.booster_
importance_df = pd.DataFrame({
    'feature': fitted_booster.feature_name(),
    'importance': fitted_booster.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

# Plot the top features: at most 20, fewer if the model has fewer features.
# Bar positions are derived from the selected rows so the two always match in length.
top_features = importance_df.head(20)
bar_positions = range(len(top_features))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Importance (Gain)', fontsize=12)
plt.title(f'Top {len(top_features)} Feature Importances - LightGBM Model', fontsize=14)
plt.gca().invert_yaxis()  # most important feature at the top
plt.tight_layout()
plt.show()

print("\nTop 10 Most Important Features:")
print(importance_df.head(10).to_string(index=False))

## Visualize Tree Structure

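We'll visualize two trees from the ensemble: Tree 0 (the first tree) and Tree 50 (an early tree; the model is trained for 500 boosting rounds). Note that `lgb.plot_tree` draws each tree in full rather than only its first three levels.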

In [None]:
# Plot Tree 0 (first tree).
# lgb.plot_tree renders the complete tree, so the progress message says so
# instead of claiming only the first levels are shown.
print("Plotting Tree 0 (first tree, full tree)...")
fig, ax = plt.subplots(figsize=(20, 10))
# The figure size is set on the figure above; plot_tree's own `figsize`
# argument is not used when an existing `ax` is supplied.
lgb.plot_tree(model, tree_index=0, show_info=['split_gain'], ax=ax)
plt.title('Tree 0 - First Tree in Ensemble', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Plot Tree 50.
# With 500 boosting rounds this is an early tree rather than the middle of the
# ensemble, and lgb.plot_tree renders the complete tree, so the message and
# title describe it that way.
n_trees = model.booster_.num_trees()
print(f"Plotting Tree 50 (tree index 50 of {n_trees}, full tree)...")
fig, ax = plt.subplots(figsize=(20, 10))
# The figure size is set on the figure above; plot_tree's own `figsize`
# argument is not used when an existing `ax` is supplied.
lgb.plot_tree(model, tree_index=50, show_info=['split_gain'], ax=ax)
plt.title(f'Tree 50 - Tree Index 50 of {n_trees} in Ensemble', fontsize=16)
plt.tight_layout()
plt.show()

## Feature Importance by Tree

Compare which features the two trees split on, using the number of splits per feature in each tree.

In [None]:
# Get the booster object
booster = model.booster_

# Build the node-level table once for the whole ensemble and reuse it for
# every tree inspected below, instead of rebuilding it per tree.
all_trees_df = booster.trees_to_dataframe()


def _top_split_features(tree_info, top_n=10):
    """Count how often each feature is used as a split in one tree.

    Parameters
    ----------
    tree_info : pandas.DataFrame
        Rows of the booster's tree table belonging to a single tree.
    top_n : int, default 10
        Number of most frequently used features to return.

    Returns
    -------
    pandas.Series or None
        Split counts per feature, most frequent first, or None when the
        table has no 'split_feature' column.
    """
    if 'split_feature' not in tree_info.columns:
        return None
    return tree_info['split_feature'].value_counts().head(top_n)


# Split counts for the two trees that are visualized in this notebook
print("\nFeature splits in Tree 0:")
tree_0_info = all_trees_df.query('tree_index == 0')
tree_0_splits = _top_split_features(tree_0_info)
print("No split information available" if tree_0_splits is None else tree_0_splits)

print("\nFeature splits in Tree 50:")
tree_50_info = all_trees_df.query('tree_index == 50')
tree_50_splits = _top_split_features(tree_50_info)
print("No split information available" if tree_50_splits is None else tree_50_splits)

## Actual vs Predicted PE Ratios

In [None]:
# Scatter plot of actual vs predicted PE, with a y = x reference line
# spanning the range of actual values
pe_bounds = [y_test.min(), y_test.max()]

plt.figure(figsize=(10, 10))
plt.scatter(y_test, y_pred, alpha=0.3, s=20)
plt.plot(pe_bounds, pe_bounds, 'r--', linewidth=2, label='Perfect Prediction')
plt.xlabel('Actual PE', fontsize=12)
plt.ylabel('Predicted PE', fontsize=12)
plt.title('Actual vs Predicted PE Ratios - November 2025', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Calculate R-squared (r2_score comes from the notebook's top-level imports,
# so all dependencies are declared in one place)
r2 = r2_score(y_test, y_pred)
print(f"\nR-squared: {r2:.4f}")

## Summary Results

In [None]:
# Assemble one row per test observation: ticker, actual PE, predicted PE, signed % error
results_df = pd.DataFrame({
    'ticker': test_data['ticker'].values,
    'actual_pe': y_test.values,
    'predicted_pe': y_pred,
    'pct_error': pct_error.values,
})
# Magnitude of the error, used only for ranking the rows below
results_df['abs_pct_error'] = results_df['pct_error'].abs()

# Columns shown in the printed tables (the ranking column itself is left out)
report_columns = ['ticker', 'actual_pe', 'predicted_pe', 'pct_error']

print("\nSample of predictions (sorted by absolute percentage error):")

print("\nBest predictions (lowest error):")
best_rows = results_df.nsmallest(10, 'abs_pct_error')
print(best_rows[report_columns])

print("\nWorst predictions (highest error):")
worst_rows = results_df.nlargest(10, 'abs_pct_error')
print(worst_rows[report_columns])