
# Model Evaluation 


In [6]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)

# Directory scanned by available_files() for the per-model CSV artefacts
# (predictions_*, metrics_*, feature_importances_*).
OUT_DIR = Path('../data/predictions')
# NOTE(review): not referenced by any cell visible here — presumably the project
# database used elsewhere; confirm before relying on it.
DB_PATH = Path('../data/data.db')
# Model identifiers looked up for the 'regression' task.
MODELS_REG = ['linreg','ridge','rf','xgb']
# Model identifiers looked up for the 'classification' task.
MODELS_CLS = ['logit','rf','xgb']

def available_files(task: str, models: list[str], out_dir: Path) -> dict[str, dict[str, Path]]:
    """Locate the CSV artefacts written for each model of a given task.

    For every model name the function looks in ``out_dir`` for
    ``predictions_{task}_{model}.csv``, ``metrics_{task}_{model}.csv`` and
    ``feature_importances_{task}_{model}.csv``.

    Args:
        task: Task label embedded in the file names (e.g. ``'regression'``).
        models: Model identifiers to look for.
        out_dir: Directory that holds the CSV files.

    Returns:
        A dict keyed by model name, in the order of ``models``. Only models whose
        predictions file exists are included. Each value is a dict with keys
        ``'pred'`` (always a path), ``'met'`` and ``'fi'`` (a path, or ``None``
        when that file is missing).
    """
    found = {}
    for m in models:
        pred = out_dir / f'predictions_{task}_{m}.csv'
        # A model without a predictions file is skipped entirely; the optional
        # files are only probed once we know the model is present.
        if not pred.exists():
            continue
        met = out_dir / f'metrics_{task}_{m}.csv'
        fi = out_dir / f'feature_importances_{task}_{m}.csv'
        found[m] = {
            'pred': pred,
            'met': met if met.exists() else None,
            'fi': fi if fi.exists() else None,
        }
    return found

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred`` as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)


In [7]:

# Discover which models have prediction files on disk for each task.
reg_files = available_files('regression', MODELS_REG, OUT_DIR)
cls_files = available_files('classification', MODELS_CLS, OUT_DIR)

# Iterating a dict yields its keys, i.e. the model names that were found.
print('Found regression models:', list(reg_files))
print('Found classification models:', list(cls_files))


Found regression models: ['linreg', 'ridge', 'rf', 'xgb']
Found classification models: ['logit', 'rf', 'xgb']


In [8]:

def _regression_metrics(frame: pd.DataFrame) -> dict[str, float]:
    """Compute n_samples, MAE, RMSE, R2, Hit and IC for one slice of predictions.

    ``frame`` must hold ``y_true`` and ``y_pred`` columns without missing values.
    An empty slice yields ``n_samples == 0`` and NaN for every metric.
    """
    n = len(frame)
    if n == 0:
        return {'n_samples': 0, 'MAE': np.nan, 'RMSE': np.nan,
                'R2': np.nan, 'Hit': np.nan, 'IC': np.nan}

    y_true, y_pred = frame['y_true'], frame['y_pred']
    # R2 and the correlation are undefined when a series has no variance.
    true_varies = n >= 2 and y_true.var() > 0
    pred_varies = n >= 2 and y_pred.var() > 0
    return {
        'n_samples': n,
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': rmse(y_true, y_pred),
        'R2': r2_score(y_true, y_pred) if true_varies else np.nan,
        # Share of rows where prediction and target have the same sign.
        'Hit': (np.sign(y_true) == np.sign(y_pred)).mean(),
        # np.corrcoef would return NaN (with a RuntimeWarning) for a constant
        # series; return NaN directly instead.
        'IC': np.corrcoef(y_true, y_pred)[0, 1] if true_varies and pred_varies else np.nan,
    }


def summarize_regression_predictions(pred_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return overall and per-ticker summary (MAE, RMSE, R2, Hit, IC).

    Args:
        pred_df: Predictions with ``ticker``, ``y_true`` and ``y_pred`` columns.
            Rows missing either ``y_true`` or ``y_pred`` are ignored.

    Returns:
        ``(overall_df, per_df)``. ``overall_df`` has a single row with columns
        ``n_samples, MAE, RMSE, R2, Hit, IC``. ``per_df`` has one row per ticker
        with a leading ``ticker`` column, sorted by ascending RMSE; it is empty
        (but keeps its columns) when no usable rows remain.
    """
    # Rows without a target or a prediction cannot be scored.
    df = pred_df.dropna(subset=['y_true', 'y_pred'])

    overall_df = pd.DataFrame([_regression_metrics(df)])

    per_rows = [{'ticker': t, **_regression_metrics(g)} for t, g in df.groupby('ticker')]
    # Explicit columns keep sort_values() working when there are no rows.
    per_columns = ['ticker', 'n_samples', 'MAE', 'RMSE', 'R2', 'Hit', 'IC']
    per_df = pd.DataFrame(per_rows, columns=per_columns).sort_values('RMSE')
    return overall_df, per_df


def _classification_metrics(frame: pd.DataFrame, has_prob: bool) -> dict[str, float]:
    """Compute n_samples, Accuracy, Precision, Recall, F1 and AUC for one slice.

    ``frame`` must hold ``y_true_cls`` and ``y_pred_cls`` (and ``y_prob`` when
    ``has_prob`` is true) without missing values. An empty slice yields
    ``n_samples == 0`` and NaN for every metric.
    """
    n = len(frame)
    if n == 0:
        return {'n_samples': 0, 'Accuracy': np.nan, 'Precision': np.nan,
                'Recall': np.nan, 'F1': np.nan, 'AUC': np.nan}

    y_true, y_pred = frame['y_true_cls'], frame['y_pred_cls']
    # AUC needs scores and both classes present in the slice.
    auc_defined = has_prob and y_true.nunique() == 2
    return {
        'n_samples': n,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1': f1_score(y_true, y_pred, zero_division=0),
        'AUC': roc_auc_score(y_true, frame['y_prob']) if auc_defined else np.nan,
    }


def summarize_classification_predictions(pred_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return overall and per-ticker summary (Accuracy, F1, Precision, Recall, AUC if probs).

    Args:
        pred_df: Predictions with ``ticker``, ``y_true_cls`` and ``y_pred_cls``
            columns and, optionally, a ``y_prob`` column of scores.

    Returns:
        ``(overall_df, per_df)``. ``overall_df`` has a single row with columns
        ``n_samples, Accuracy, Precision, Recall, F1, AUC``. ``per_df`` has one
        row per ticker with a leading ``ticker`` column, sorted by descending
        Accuracy; it is empty (but keeps its columns) when no usable rows remain.
        AUC is NaN when no probabilities are available or only one class is
        present.
    """
    # Treat an all-NaN y_prob column like a missing one; otherwise the dropna
    # below would discard every row and no metric could be computed.
    has_prob = 'y_prob' in pred_df.columns and bool(pred_df['y_prob'].notna().any())
    required = ['y_true_cls', 'y_pred_cls']
    if has_prob:
        required.append('y_prob')
    df = pred_df.dropna(subset=required)

    overall_df = pd.DataFrame([_classification_metrics(df, has_prob)])

    per_rows = [{'ticker': t, **_classification_metrics(g, has_prob)}
                for t, g in df.groupby('ticker')]
    # Explicit columns keep sort_values() working when there are no rows.
    per_columns = ['ticker', 'n_samples', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC']
    per_df = pd.DataFrame(per_rows, columns=per_columns).sort_values('Accuracy', ascending=False)
    return overall_df, per_df


## Regression

In [9]:

# One report per regression model: the overall row, then the 15 tickers with the lowest RMSE.
for model, paths in reg_files.items():
    preds = pd.read_csv(paths['pred'], parse_dates=['date'])
    overall, per = summarize_regression_predictions(preds)

    print(f"\n=== Regression: {model} ===")
    # display() renders each positional argument in turn.
    display(overall, per.head(15))



=== Regression: linreg ===


Unnamed: 0,n_samples,MAE,RMSE,R2,Hit,IC
0,972,0.113574,0.169926,-19.773314,0.498971,0.019771


Unnamed: 0,ticker,n_samples,MAE,RMSE,R2,Hit,IC
4,XLK,108,0.078552,0.111228,-18.43729,0.453704,0.029832
3,XLI,108,0.081102,0.114225,-21.507658,0.472222,0.001611
2,XLF,108,0.10275,0.135444,-16.574608,0.444444,-0.075426
8,XLY,108,0.084442,0.13902,-23.802909,0.472222,0.066315
7,XLV,108,0.09785,0.141347,-23.593758,0.518519,-0.021654
0,XLB,108,0.121661,0.160723,-31.514013,0.490741,0.005482
5,XLP,108,0.127399,0.197158,-38.070945,0.546296,0.047057
6,XLU,108,0.143981,0.216176,-23.444409,0.583333,0.065378
1,XLE,108,0.184427,0.255675,-12.419455,0.509259,0.016186



=== Regression: ridge ===


Unnamed: 0,n_samples,MAE,RMSE,R2,Hit,IC
0,972,0.032555,0.044598,-0.430897,0.506173,0.016361


Unnamed: 0,ticker,n_samples,MAE,RMSE,R2,Hit,IC
3,XLI,108,0.02383,0.030456,-0.60015,0.490741,-0.199394
4,XLK,108,0.026038,0.030838,-0.494136,0.416667,-0.041357
8,XLY,108,0.023346,0.03265,-0.36813,0.509259,-0.129701
7,XLV,108,0.025163,0.033511,-0.382391,0.537037,-0.031621
0,XLB,108,0.027803,0.037947,-0.812507,0.555556,-0.086322
5,XLP,108,0.030634,0.041275,-0.712405,0.546296,-0.11352
2,XLF,108,0.032698,0.041619,-0.659427,0.453704,-0.089083
6,XLU,108,0.045699,0.056212,-0.65284,0.518519,-0.002131
1,XLE,108,0.057784,0.07614,-0.190084,0.527778,0.152338



=== Regression: rf ===


Unnamed: 0,n_samples,MAE,RMSE,R2,Hit,IC
0,972,0.028138,0.040103,-0.15702,0.50823,-0.061885


Unnamed: 0,ticker,n_samples,MAE,RMSE,R2,Hit,IC
3,XLI,108,0.02013,0.026139,-0.178674,0.5,-0.151906
4,XLK,108,0.022462,0.027225,-0.164492,0.407407,-0.11678
8,XLY,108,0.021691,0.030265,-0.175513,0.574074,-0.131055
7,XLV,108,0.023362,0.030555,-0.149295,0.490741,-0.103066
0,XLB,108,0.023017,0.030871,-0.199535,0.555556,-0.081398
2,XLF,108,0.025545,0.033943,-0.103741,0.518519,0.028845
5,XLP,108,0.026476,0.034484,-0.195245,0.509259,-0.164037
6,XLU,108,0.036696,0.045171,-0.067308,0.537037,0.033852
1,XLE,108,0.053863,0.076587,-0.204113,0.481481,-0.097509



=== Regression: xgb ===


Unnamed: 0,n_samples,MAE,RMSE,R2,Hit,IC
0,972,0.030991,0.04335,-0.351972,0.493827,-0.047069


Unnamed: 0,ticker,n_samples,MAE,RMSE,R2,Hit,IC
3,XLI,108,0.022817,0.029081,-0.458877,0.435185,-0.119961
4,XLK,108,0.0244,0.030492,-0.460803,0.425926,-0.13466
8,XLY,108,0.023032,0.031455,-0.269786,0.555556,-0.105747
0,XLB,108,0.024879,0.032655,-0.342206,0.537037,-0.036149
7,XLV,108,0.026914,0.03471,-0.483044,0.490741,-0.183785
2,XLF,108,0.028076,0.038058,-0.387581,0.537037,0.013174
5,XLP,108,0.031643,0.040378,-0.638784,0.435185,-0.309937
6,XLU,108,0.038913,0.047994,-0.204847,0.555556,0.131258
1,XLE,108,0.058242,0.080591,-0.333318,0.472222,-0.062312


## Classification 

In [10]:

# One report per classification model: the overall row, then the 15 tickers with the highest accuracy.
for model, paths in cls_files.items():
    preds = pd.read_csv(paths['pred'], parse_dates=['date'])
    overall, per = summarize_classification_predictions(preds)

    print(f"\n=== Classification: {model} ===")
    # display() renders each positional argument in turn.
    display(overall, per.head(15))



=== Classification: logit ===


Unnamed: 0,n_samples,Accuracy,Precision,Recall,F1,AUC
0,972,0.5,0.46696,0.464912,0.465934,0.499197


Unnamed: 0,ticker,n_samples,Accuracy,Precision,Recall,F1,AUC
7,XLV,108,0.583333,0.509091,0.608696,0.554455,0.596424
0,XLB,108,0.537037,0.461538,0.382979,0.418605,0.546216
1,XLE,108,0.527778,0.452381,0.404255,0.426966,0.518661
3,XLI,108,0.509259,0.509804,0.481481,0.495238,0.446845
4,XLK,108,0.5,0.53125,0.586207,0.557377,0.494828
8,XLY,108,0.472222,0.422222,0.38,0.4,0.47
5,XLP,108,0.462963,0.347826,0.363636,0.355556,0.443537
2,XLF,108,0.453704,0.490909,0.465517,0.477876,0.440345
6,XLU,108,0.453704,0.438596,0.480769,0.458716,0.449863



=== Classification: rf ===


Unnamed: 0,n_samples,Accuracy,Precision,Recall,F1,AUC
0,972,0.516461,0.484091,0.467105,0.475446,0.504284


Unnamed: 0,ticker,n_samples,Accuracy,Precision,Recall,F1,AUC
0,XLB,108,0.611111,0.564103,0.468085,0.511628,0.617893
1,XLE,108,0.564815,0.5,0.446809,0.47191,0.506278
8,XLY,108,0.546296,0.510638,0.48,0.494845,0.523276
7,XLV,108,0.527778,0.44186,0.413043,0.426966,0.527349
4,XLK,108,0.518519,0.546875,0.603448,0.57377,0.502414
5,XLP,108,0.518519,0.394737,0.340909,0.365854,0.440518
2,XLF,108,0.481481,0.517241,0.517241,0.517241,0.473793
6,XLU,108,0.444444,0.425926,0.442308,0.433962,0.416896
3,XLI,108,0.435185,0.436364,0.444444,0.440367,0.419067



=== Classification: xgb ===


Unnamed: 0,n_samples,Accuracy,Precision,Recall,F1,AUC
0,972,0.495885,0.461712,0.449561,0.455556,0.505427


Unnamed: 0,ticker,n_samples,Accuracy,Precision,Recall,F1,AUC
0,XLB,108,0.583333,0.525,0.446809,0.482759,0.58842
1,XLE,108,0.509259,0.431818,0.404255,0.417582,0.537496
2,XLF,108,0.509259,0.539683,0.586207,0.561983,0.523103
5,XLP,108,0.509259,0.371429,0.295455,0.329114,0.411222
7,XLV,108,0.509259,0.433962,0.5,0.464646,0.543478
4,XLK,108,0.5,0.53125,0.586207,0.557377,0.512414
8,XLY,108,0.481481,0.44,0.44,0.44,0.493103
6,XLU,108,0.435185,0.4,0.346154,0.371134,0.421703
3,XLI,108,0.425926,0.42,0.388889,0.403846,0.433128
