In [1]:
import pandas as pd
from numerapi import NumerAPI
import json
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GroupKFold, cross_val_score
import xlsxwriter

%load_ext autoreload
%autoreload 2

In [2]:
tournament_round_used = 1112
data_version = 'v5.0'
data_folder = 'data_folder'
report_folder = 'reports'
skip = False
feature_set_chosen = 'small'
year_horizon = [1, 2, 3]
target_column_name = 'target'
target_mode = 'single'
current_date = '10_10_2025'
report_name = f'an_attempt_{current_date}.xlsx'
path_to_save_report = f'{report_folder}/{report_name}'

folders_list = [data_folder, report_folder]
for folder in folders_list:
    import os
    if not os.path.exists(folder):
        os.makedirs(folder)

# Data

In [3]:
api = NumerAPI()
api.download_dataset(
	f"{data_version}/live_example_preds.parquet",
	f"{data_folder}/{data_version}/{tournament_round_used}/live_example_round.parquet",
	tournament_round_used
)
api.download_dataset(
	f"{data_version}/validation_example_preds.parquet",
	f"{data_folder}/{data_version}/{tournament_round_used}/validation_example_round.parquet",
	tournament_round_used
)
api.download_dataset(
    f"{data_version}/features.json", 
    f"{data_folder}/{data_version}/{tournament_round_used}/features.json",
    tournament_round_used
)
api.download_dataset(
	f"{data_version}/train.parquet",
	f"{data_folder}/{data_version}/{tournament_round_used}/train.parquet",
	tournament_round_used
)
api.download_dataset(
	f"{data_version}/live_example_preds.parquet",
	f"{data_folder}/{data_version}/{tournament_round_used}/live_example_preds.parquet",
	tournament_round_used
)
api.download_dataset(
	f"{data_version}/validation.parquet",
	f"{data_folder}/{data_version}/{tournament_round_used}/validation.parquet",
	tournament_round_used
)

2025-10-10 05:03:37,557 INFO numerapi.utils: target file already exists
2025-10-10 05:03:37,557 INFO numerapi.utils: download complete
2025-10-10 05:03:38,356 INFO numerapi.utils: target file already exists
2025-10-10 05:03:38,356 INFO numerapi.utils: download complete
2025-10-10 05:03:39,043 INFO numerapi.utils: target file already exists
2025-10-10 05:03:39,045 INFO numerapi.utils: download complete
2025-10-10 05:03:39,777 INFO numerapi.utils: target file already exists
2025-10-10 05:03:39,779 INFO numerapi.utils: download complete
2025-10-10 05:03:40,991 INFO numerapi.utils: target file already exists
2025-10-10 05:03:40,991 INFO numerapi.utils: download complete
2025-10-10 05:03:41,770 INFO numerapi.utils: target file already exists
2025-10-10 05:03:41,771 INFO numerapi.utils: download complete


'data_folder/v5.0/1112/validation.parquet'

# Feature Engineering

In [4]:
feature_metadata = json.load(open(f"{data_folder}/{data_version}/{tournament_round_used}/features.json"))
for metadata in feature_metadata:
  print(metadata, len(feature_metadata[metadata]))

target_set = feature_metadata['targets'].copy()
feature_set = feature_metadata["feature_sets"][feature_set_chosen]

raw_train_df = pd.read_parquet(
    f"{data_folder}/{data_version}/{tournament_round_used}/train.parquet",
    columns=['era'] + feature_set + target_set
)

raw_validation_df = pd.read_parquet(
    f"{data_folder}/{data_version}/{tournament_round_used}/validation.parquet",
    columns=['era'] + feature_set + target_set
)

feature_sets 17
targets 37


In [5]:
raw_train_df.shape

(2746270, 80)

In [6]:
sliced_df = raw_train_df.copy()
sliced_df['week_no'] = (sliced_df['era'].astype(int) - 1) % 52 + 1
sliced_df['year_horizon'] = (sliced_df['era'].astype(int) - 1) // 52 + 1
sliced_df['era'] = sliced_df['era'].astype(int)

sliced_df = sliced_df[sliced_df['year_horizon'].isin(year_horizon)]
sliced_df['target'] = sliced_df['target'] * 100

# Modelling

In [7]:
results_list = []
for objective_chosen in target_set[-3:]:
# for objective_chosen in target_set:
    print(f"Objective chosen: {objective_chosen}")

    train_df = sliced_df.drop(columns=target_set)
    test_df = sliced_df[objective_chosen]

    X_train, X_test, y_train, y_test = train_test_split(
        train_df, test_df, test_size=0.2, random_state=42, stratify=train_df['year_horizon']
    )

    xgb_model = XGBRegressor(random_state=42, n_jobs=-1)

    try:
        xgb_model.fit(X_train, y_train)
    except Exception as e:
        print(f"Error fitting model for target {target_column_name}: {e}")

    y_pred_xgb = xgb_model.predict(X_test)

    r2_score_value_xgb = r2_score(y_test, y_pred_xgb)
    rmse_value_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
    mae_value_xgb = mean_absolute_error(y_test, y_pred_xgb)
    mse_value_xgb = mean_squared_error(y_test, y_pred_xgb)

    result = pd.Series({
        "target": objective_chosen,
        "r2_score": r2_score_value_xgb,
        "rmse": rmse_value_xgb,
        "mae": mae_value_xgb,
        "mse": mse_value_xgb
    })

    results_list.append(result)

results_df = pd.concat(results_list, axis=1).T
results_df

Objective chosen: target_xerxes_20
Objective chosen: target_xerxes_60
Objective chosen: target


Unnamed: 0,target,r2_score,rmse,mae,mse
0,target_xerxes_20,0.036277,0.219338,0.157442,0.048109
1,target_xerxes_60,0.093359,0.212259,0.154365,0.045054
2,target,0.037523,21.917837,15.726833,480.391571


In [8]:
cv_obj = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    xgb_model, train_df, test_df, cv=cv_obj, groups=train_df['year_horizon'], scoring='neg_root_mean_squared_error', n_jobs=-1,
)
print("Cross-validation RMSE scores:", -cv_scores)
print("Mean CV RMSE:", -cv_scores.mean())



Cross-validation RMSE scores: [21.9135437  21.91455841 21.96635246 21.80443382 21.88041687]
Mean CV RMSE: 21.895861053466795


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df, test_df, test_size=0.2, random_state=42
)

xgb_model = XGBRegressor(random_state=42, n_jobs=-1)

# Train your model
xgb_model.fit(X_train, y_train)

# Get predictions
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Calculate metrics
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

model_fit_summary_df = pd.DataFrame({
    "set": ["train", "test"],
    "mse": [train_mse, test_mse],
    "rmse": [train_rmse, test_rmse],
    "r2": [train_r2, test_r2]
})

model_fit_summary_formatted_df = model_fit_summary_df.style.format({
    "mse": "{:.4f}",
    "rmse": "{:.4f}",
    "r2": "{:.4f}"
})

print("=== REGRESSION PERFORMANCE ===")
print(f"Training MSE:  {train_mse:.4f}")
print(f"Testing MSE:   {test_mse:.4f}")
print(f"Training RMSE: {train_rmse:.4f}")
print(f"Testing RMSE:  {test_rmse:.4f}")
print(f"Training RÂ²:   {train_r2:.4f}")
print(f"Testing RÂ²:    {test_r2:.4f}")

# Interpretation
if train_mse < test_mse * 0.7:  # Train MSE is much lower
    print("ðŸš¨ OVERFITTING - Model performs much better on training data")
elif train_r2 < 0.6 and test_r2 < 0.6:  # Both RÂ² are low
    print("ðŸš¨ UNDERFITTING - Model performs poorly on both sets")
else:
    print("âœ… GOOD FIT - Reasonable performance on both sets")

# Results

In [9]:
results_formatted_df = results_df.copy()

In [10]:
df_compare = pd.concat([y_test.reset_index(), pd.Series(y_pred_xgb, name='predicted')], axis=1)
df_compare = df_compare.merge(sliced_df.reset_index()[['id', 'week_no', 'year_horizon']], on='id', how='left')

df_compare['error'] = df_compare['target'] - df_compare['predicted']

# Calculate percentage error safely (avoid division by zero)
df_compare['percentage_error'] = np.where(
    df_compare['target'] != 0,
    (df_compare['error'] / df_compare['target']) * 100,
    np.nan
)

# Define bins for % error
bins = [-np.inf, -10, -5, 5, 10, np.inf]
labels = ['Extreme Underforecast', 'Underforecast', 'Accurate', 'Overforecast', 'Extreme Overforecast']

# Assign forecast for nonzero targets
df_compare.loc[df_compare['target'] != 0, 'forecast'] = pd.cut(
    df_compare.loc[df_compare['target'] != 0, 'percentage_error'],
    bins=bins,
    labels=labels
)

# Handle zero-target cases explicitly
abs_tol_accurate = 0.01  # adjust based on your data scale
abs_tol_extreme = 0.1

zero_mask = df_compare['target'] == 0

df_compare.loc[zero_mask & (df_compare['predicted'].abs() <= abs_tol_accurate), 'forecast'] = 'Accurate'
df_compare.loc[zero_mask & (df_compare['predicted'] > abs_tol_accurate) & (df_compare['predicted'] <= abs_tol_extreme), 'forecast'] = 'Overforecast'
df_compare.loc[zero_mask & (df_compare['predicted'] > abs_tol_extreme), 'forecast'] = 'Extreme Overforecast'
df_compare.loc[zero_mask & (df_compare['predicted'] < -abs_tol_accurate) & (df_compare['predicted'] >= -abs_tol_extreme), 'forecast'] = 'Underforecast'
df_compare.loc[zero_mask & (df_compare['predicted'] < -abs_tol_extreme), 'forecast'] = 'Extreme Underforecast'


In [11]:
week_comparison = df_compare.groupby(['forecast', 'week_no']).agg({'id': 'nunique'}).reset_index().pivot(index='forecast', columns='week_no', values='id').fillna(0).T
week_comparison_formatted_df = week_comparison.applymap(lambda x: f"{x:,}")
week_comparison_formatted_df.head()

  week_comparison = df_compare.groupby(['forecast', 'week_no']).agg({'id': 'nunique'}).reset_index().pivot(index='forecast', columns='week_no', values='id').fillna(0).T
  week_comparison_formatted_df = week_comparison.applymap(lambda x: f"{x:,}")


forecast,Extreme Underforecast,Underforecast,Accurate,Overforecast,Extreme Overforecast
week_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,490,147,562,137,687
2,485,158,542,149,716
3,466,150,569,168,740
4,516,143,643,164,725
5,493,143,675,152,739


In [12]:
forecast_summary_df = df_compare.groupby('forecast').agg(**{'count': ('id', 'nunique')})
forecast_summary_formatted_df = forecast_summary_df.copy()
forecast_summary_formatted_df['population_share'] = forecast_summary_formatted_df['count'] / forecast_summary_formatted_df['count'].sum() * 100
forecast_summary_formatted_df['count'] = forecast_summary_formatted_df['count'].apply(lambda x: f"{x:,}")
forecast_summary_formatted_df['population_share'] = round(forecast_summary_formatted_df['population_share'], 2)
forecast_summary_formatted_df

  forecast_summary_df = df_compare.groupby('forecast').agg(**{'count': ('id', 'nunique')})


Unnamed: 0_level_0,count,population_share
forecast,Unnamed: 1_level_1,Unnamed: 2_level_1
Extreme Underforecast,27760,22.76
Underforecast,7827,6.42
Accurate,39069,32.03
Overforecast,7800,6.39
Extreme Overforecast,39531,32.41


# Report

In [13]:
# Collect all variables ending with '_summary_df'
summary_vars = [var for var in globals() if var.endswith('_formatted_df')]

# Create a new Excel file
with xlsxwriter.Workbook(path_to_save_report) as workbook:
    for var in summary_vars:
        df = globals()[var]
        worksheet = workbook.add_worksheet(var)
        # Write column headers
        for col_num, col_name in enumerate(df.columns.insert(0, df.index.name or 'index')):
            worksheet.write(0, col_num, col_name)
        # Write data rows
        for row_num, (idx, row) in enumerate(df.iterrows(), start=1):
            worksheet.write(row_num, 0, idx)
            for col_num, value in enumerate(row, start=1):
                worksheet.write(row_num, col_num, value)