In [None]:
import pandas as pd
from numerapi import NumerAPI
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GroupKFold, cross_val_score

%load_ext autoreload
%autoreload 2

In [None]:
# --- Data location -----------------------------------------------------------
data_version = "v5.0"        # dataset version, used as remote prefix and local sub-folder
data_folder = "data_folder"  # local root directory for downloaded files

# --- Run switches ------------------------------------------------------------
skip = True              # when True, the features.json download is skipped
use_eras = True          # NOTE(review): presumably toggles era-aware splitting - confirm
naive_split_bool = True  # NOTE(review): presumably toggles a plain random split - confirm

# --- Modelling choices -------------------------------------------------------
target_column_name = "target"  # name of the target column to model
feature_set_chosen = "all"     # key into feature_metadata["feature_sets"]

# 1 - Data

In [None]:
# Download the dataset files used by this notebook and load the feature metadata.
api = NumerAPI()

# Every file is stored under "<data_folder>/<data_version>/<file>" so that the
# later read of f"{data_folder}/{data_version}/train.parquet" finds it.
dataset_files = [
    "train.parquet",
    "live_example_preds.parquet",
    "validation_example_preds.parquet",
]
for dataset_file in dataset_files:
    api.download_dataset(
        f"{data_version}/{dataset_file}",
        f"{data_folder}/{data_version}/{dataset_file}",
    )

# features.json is only fetched when `skip` is False; with `skip = True` the
# file must already exist locally, otherwise the open() below raises
# FileNotFoundError.
features_json_path = f"{data_folder}/{data_version}/features.json"
if not skip:
    # First argument is the remote dataset name, second the local destination.
    api.download_dataset(f"{data_version}/features.json", features_json_path)

# Use a context manager so the file handle is closed after parsing.
with open(features_json_path) as features_file:
    feature_metadata = json.load(features_file)

# Print each top-level metadata key together with the size of its value.
for metadata in feature_metadata:
    print(metadata, len(feature_metadata[metadata]))

In [None]:
# Quick overview of the metadata, displayed as one tuple.
(
    feature_metadata.keys(),                         # top-level keys
    len(feature_metadata['feature_sets']),           # number of feature sets
    len(feature_metadata['targets']),                # number of targets
    len(feature_metadata['feature_sets']['small']),  # size of the 'small' set
    len(feature_metadata['feature_sets']['medium']), # size of the 'medium' set
    len(feature_metadata['feature_sets']['all']),    # size of the 'all' set
)

In [None]:
# Show the names of the available feature sets (valid values for feature_set_chosen).
feature_metadata['feature_sets'].keys()

In [None]:
# Show the top-level keys of the feature metadata.
feature_metadata.keys()

## 2. Feature Set Step - `raw_df` (e.g. the `small` feature set)

## 1. Cloud Step

In [None]:
# Pick the feature columns for the configured feature set and all target columns.
feature_set = feature_metadata["feature_sets"][feature_set_chosen]
target_set = feature_metadata["targets"]

# Read only the columns we need: the era label, the features and the targets.
raw_df = pd.read_parquet(
    f"{data_folder}/{data_version}/train.parquet",
    columns=["era", *feature_set, *target_set],
)

raw_df.head()

In [None]:
# Count index labels that repeat an earlier one; 0 means the index is unique.
raw_df.index.duplicated().sum()

In [None]:
# Number of rows per era, most frequent first.
raw_df["era"].value_counts()

In [None]:
# # Compute feature correlations with the target
# correlations = raw_df.corr(numeric_only=True)['target'].sort_values(ascending=False)
# print(correlations)

In [None]:
# Count how many entries of the metadata target list contain the substring 'target'.
print(sum(1 for target_name in feature_metadata['targets'] if 'target' in target_name))

In [None]:
# List the columns of raw_df whose name contains 'target'.
raw_df.columns[raw_df.columns.str.contains('target')]

# 2 - Objective setting - Splitting into train and test

Feature additions

In [None]:
# Era-derived calendar columns. They live on `sliced_df` for group-aware
# splitting and are deliberately kept out of the feature matrix.
group_split_list = ['week_no', 'year_horizon']

# Cast the era label to an integer once, then derive its position within a
# 52-era block: era 1 -> week 1 / year 1, era 53 -> week 1 / year 2.
era_number = raw_df['era'].astype(int)
sliced_df = raw_df.assign(
    era=era_number,
    week_no=(era_number - 1) % 52 + 1,
    year_horizon=(era_number - 1) // 52 + 1,
)

# Build the model inputs from `sliced_df` (not `raw_df`) so the work above is
# actually used. Every target column plus the era/grouping identifiers are
# removed, leaving only feature columns as predictors.
# NOTE: despite their names, `train_df` is the feature matrix (X) and `test_df`
# is the target series (y); the names are kept because later cells use them.
non_feature_columns = list(target_set) + ['era'] + group_split_list
train_df = sliced_df.drop(columns=non_feature_columns)
test_df = sliced_df[target_column_name]

train_df.shape, test_df.shape

# 3 - Modelling

## Baseline

Simple

In [None]:
# relevant_target_columns_without_target = target_set[target_set != 'target']

# train_df = raw_df.drop(columns=target_set)
# test_df = raw_df.drop(columns=target_column_name).drop(columns=relevant_target_columns_without_target)

# train_df.shape, test_df.shape

In [None]:
# target_set.remove('target')

In [None]:
# Cross-validated linear-regression baseline.
# `train_df` holds the feature columns and `test_df` the target series.
lr_model = LinearRegression()

# The era label is an identifier, not a predictor: drop it if it is present.
cv_features = train_df.drop(columns='era') if 'era' in train_df.columns else train_df

if use_eras:
    # Keep all rows of an era in the same fold so that no era is split between
    # the training and validation part of a fold.
    cv_splitter = GroupKFold(n_splits=3)
    cv_groups = raw_df['era']  # same row order as train_df / test_df
else:
    cv_splitter, cv_groups = 3, None

cv_scores = cross_val_score(
    lr_model,
    cv_features,
    test_df,
    groups=cv_groups,
    cv=cv_splitter,
    scoring='neg_mean_squared_error',
)

# Convert negative MSE to positive and calculate RMSE
mse_scores = -cv_scores
rmse_scores = np.sqrt(mse_scores)

print(f"Cross-Validation MSE scores: {mse_scores}")
print(f"Mean MSE: {mse_scores.mean():.4f} (+/- {mse_scores.std() * 2:.4f})")
print(f"Cross-Validation RMSE scores: {rmse_scores}")
print(f"Mean RMSE: {rmse_scores.mean():.4f} (+/- {rmse_scores.std() * 2:.4f})")

In [None]:
# Hold-out evaluation of a linear-regression baseline on the configured target.
# The split is built directly from `raw_df`: features are the chosen feature
# set, the label is `target_column_name`.
baseline_features = raw_df[feature_set]
baseline_target = raw_df[target_column_name]

if use_eras:
    # Split by era so that rows of one era never land on both sides.
    era_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(
        era_splitter.split(baseline_features, baseline_target, groups=raw_df['era'])
    )
else:
    # Plain random row split.
    train_idx, test_idx = train_test_split(
        np.arange(len(raw_df)), test_size=0.2, random_state=42
    )

# Feature train target train, feature test target test
X_train, y_train = baseline_features.iloc[train_idx], baseline_target.iloc[train_idx]
X_test, y_test = baseline_features.iloc[test_idx], baseline_target.iloc[test_idx]

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict
y_pred_lr = lr_model.predict(X_test)

# Metrics (RMSE is derived from the MSE instead of recomputing it)
r2_score_value_lr = r2_score(y_test, y_pred_lr)
mse_value_lr = mean_squared_error(y_test, y_pred_lr)
rmse_value_lr = np.sqrt(mse_value_lr)
mae_value_lr = mean_absolute_error(y_test, y_pred_lr)

print(f"R2 Score: {r2_score_value_lr:.4f}")
print(f"RMSE: {rmse_value_lr:.2f}")
print(f"MAE: {mae_value_lr:.2f}")
print(f"MSE: {mse_value_lr:.2f}")

Iterating

In [None]:
# results = []

# for temp_target_column_name in all_columns_contiaining_target:
#     print(f"Processing target column: {temp_target_column_name}")

#     temp_target_column_name_to_filter_list = relevant_columns_without_target[relevant_columns_without_target != temp_target_column_name]

#     temp_X_train = train_df.drop(columns=temp_target_column_name_to_filter_list).drop(columns='target')
#     temp_y_train = train_df[temp_target_column_name]
#     temp_X_test = test_df.drop(columns=temp_target_column_name_to_filter_list).drop(columns='target')
#     temp_y_test = test_df[temp_target_column_name]

#     lr_model = LinearRegression()
#     try:
#         lr_model.fit(temp_X_train, temp_y_train)
#     except Exception as e:
#         print(f"Error fitting model for target {temp_target_column_name}: {e}")
#         continue
#     y_pred_lr = lr_model.predict(temp_X_test)

#     r2_score_value_lr = r2_score(temp_y_test, y_pred_lr)
#     rmse_value_lr = np.sqrt(mean_squared_error(temp_y_test, y_pred_lr))
#     mae_value_lr = mean_absolute_error(temp_y_test, y_pred_lr)
#     mse_value_lr = mean_squared_error(temp_y_test, y_pred_lr)

#     result = pd.Series({
#         "target": temp_target_column_name,
#         "r2_score": r2_score_value_lr,
#         "rmse": rmse_value_lr,
#         "mae": mae_value_lr,
#         "mse": mse_value_lr
#     })
#     results.append(result)

# linear_results_df = pd.DataFrame(results)
# linear_results_df.head()

## Experiments

Random Forest

In [None]:
# # Train Random Forest
# rf_model = RandomForestRegressor()
# rf_model.fit(X_train, y_train)

# # Predict
# y_pred_rf = rf_model.predict(X_test)

# # Metrics
# r2_score_value_rf = r2_score(y_test, y_pred_rf)
# rmse_value_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
# mae_value_rf = mean_absolute_error(y_test, y_pred_rf)
# mse_value_rf = mean_squared_error(y_test, y_pred_rf)

# print(f"Random Forest R2 Score: {r2_score_value_rf:.4f}")
# print(f"Random Forest RMSE: {rmse_value_rf:.2f}")
# print(f"Random Forest MAE: {mae_value_rf:.2f}")
# print(f"Random Forest MSE: {mse_value_rf:.2f}")

Neural Network

In [None]:
# target_column_name = 'target'

# # Feature train target train, feature test target test
# X_train = train_df.drop(columns=[target_column_name]).drop(columns=relevant_columns_without_target)
# y_train = train_df[target_column_name]
# X_test = test_df.drop(columns=[target_column_name]).drop(columns=relevant_columns_without_target)
# y_test = test_df[target_column_name]

# # Train Neural Network
# nn_model = MLPRegressor(hidden_layer_sizes=(64, 32), random_state=42, max_iter=500)
# nn_model.fit(train_df.drop(columns=['target']), y_train)

# # Predict
# y_pred_nn = nn_model.predict(test_df.drop(columns=['target']))

# # Metrics
# r2_score_value_nn = r2_score(y_test, y_pred_nn)
# rmse_value_nn = np.sqrt(mean_squared_error(y_test, y_pred_nn))
# mae_value_nn = mean_absolute_error(y_test, y_pred_nn)
# mse_value_nn = mean_squared_error(y_test, y_pred_nn)

# print(f"Neural Network R2 Score: {r2_score_value_nn:.4f}")
# print(f"Neural Network RMSE: {rmse_value_nn:.2f}")
# print(f"Neural Network MAE: {mae_value_nn:.2f}")
# print(f"Neural Network MSE: {mse_value_nn:.2f}")

LGBM

In [None]:
# target_column_name = 'target'

# # Feature train target train, feature test target test
# X_train = train_df.drop(columns=[target_column_name]).drop(columns=relevant_columns_without_target)
# y_train = train_df[target_column_name]
# X_test = test_df.drop(columns=[target_column_name]).drop(columns=relevant_columns_without_target)
# y_test = test_df[target_column_name]

# # Train LGBMRegressor
# lgbm_model = LGBMRegressor(random_state=42)
# lgbm_model.fit(train_df.drop(columns=['target']), y_train)

# # Predict
# y_pred_lgbm = lgbm_model.predict(test_df.drop(columns=['target']))

# # Metrics
# r2_score_value_lgbm = r2_score(y_test, y_pred_lgbm)
# rmse_value_lgbm = np.sqrt(mean_squared_error(y_test, y_pred_lgbm))
# mae_value_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
# mse_value_lgbm = mean_squared_error(y_test, y_pred_lgbm)

# print(f"LGBMRegressor R2 Score: {r2_score_value_lgbm:.4f}")
# print(f"LGBMRegressor RMSE: {rmse_value_lgbm:.2f}")
# print(f"LGBMRegressor MAE: {mae_value_lgbm:.2f}")
# print(f"LGBMRegressor MSE: {mse_value_lgbm:.2f}")

In [None]:
# results_lgbm = []

# for temp_target_column_name in all_columns_contiaining_target:
#     print(f"Processing target column: {temp_target_column_name}")

#     temp_target_column_name_to_filter_list = relevant_columns_without_target[relevant_columns_without_target != temp_target_column_name]

#     temp_X_train = train_df.drop(columns=temp_target_column_name_to_filter_list).drop(columns='target')
#     temp_y_train = train_df[temp_target_column_name]
#     temp_X_test = test_df.drop(columns=temp_target_column_name_to_filter_list).drop(columns='target')
#     temp_y_test = test_df[temp_target_column_name]

#     lgbm_model = LGBMRegressor(random_state=42)
#     try:
#         lgbm_model.fit(temp_X_train, temp_y_train)
#     except Exception as e:
#         print(f"Error fitting model for target {temp_target_column_name}: {e}")
#         continue
#     y_pred_lgbm = lgbm_model.predict(temp_X_test)

#     try:
#         r2_score_value_lgbm = r2_score(temp_y_test, y_pred_lgbm)
#         rmse_value_lgbm = np.sqrt(mean_squared_error(temp_y_test, y_pred_lgbm))
#         mae_value_lgbm = mean_absolute_error(temp_y_test, y_pred_lgbm)
#         mse_value_lgbm = mean_squared_error(temp_y_test, y_pred_lgbm)
#     except Exception as e:
#         print(f"Error calculating metrics for target {temp_target_column_name}: {e}")
#         continue

#     result = pd.Series({
#         "target": temp_target_column_name,
#         "r2_score": r2_score_value_lgbm,
#         "rmse": rmse_value_lgbm,
#         "mae": mae_value_lgbm,
#         "mse": mse_value_lgbm
#     })
#     results_lgbm.append(result)

# results_lgbm_df = pd.DataFrame(results_lgbm)
# results_lgbm_df

In [None]:
# results_lgbm_df

In [None]:
# Train one XGBoost regressor per target column and collect hold-out metrics.
# Set target_mode to 'single' for just the configured target, or 'all' for every target column
target_mode = 'single'  # or 'all'

if target_mode == 'single':
    target_columns = [target_column_name]
else:
    target_columns = list(target_set)

# One hold-out split shared by all targets, so their metrics are comparable.
if use_eras:
    # Split by era so that rows of one era never land on both sides.
    xgb_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    xgb_train_idx, xgb_test_idx = next(xgb_splitter.split(raw_df, groups=raw_df['era']))
else:
    xgb_train_idx, xgb_test_idx = train_test_split(
        np.arange(len(raw_df)), test_size=0.2, random_state=42
    )

# Only the chosen feature columns are predictors; no target column leaks in.
xgb_features = raw_df[feature_set]
xgb_X_train = xgb_features.iloc[xgb_train_idx]
xgb_X_test = xgb_features.iloc[xgb_test_idx]

results_xgb = []

for temp_target_column_name in target_columns:
    print(f"Processing target column: {temp_target_column_name}")

    temp_y_train = raw_df[temp_target_column_name].iloc[xgb_train_idx]
    temp_y_test = raw_df[temp_target_column_name].iloc[xgb_test_idx]

    # Rows without a label cannot be fitted or scored, so they are removed.
    train_has_label = temp_y_train.notna().to_numpy()
    test_has_label = temp_y_test.notna().to_numpy()
    if not train_has_label.any() or not test_has_label.any():
        print(f"Skipping target {temp_target_column_name}: no labelled rows in train or test split")
        continue

    # Avoid copying the feature frames when nothing has to be filtered out.
    temp_X_train = xgb_X_train if train_has_label.all() else xgb_X_train[train_has_label]
    temp_X_test = xgb_X_test if test_has_label.all() else xgb_X_test[test_has_label]
    temp_y_train = temp_y_train[train_has_label]
    temp_y_test = temp_y_test[test_has_label]

    xgb_model = XGBRegressor(random_state=42, n_jobs=-1)
    try:
        xgb_model.fit(temp_X_train, temp_y_train)
    except Exception as e:
        # Best effort: report the failure and move on to the next target.
        print(f"Error fitting model for target {temp_target_column_name}: {e}")
        continue
    y_pred_xgb = xgb_model.predict(temp_X_test)

    r2_score_value_xgb = r2_score(temp_y_test, y_pred_xgb)
    mse_value_xgb = mean_squared_error(temp_y_test, y_pred_xgb)
    rmse_value_xgb = np.sqrt(mse_value_xgb)
    mae_value_xgb = mean_absolute_error(temp_y_test, y_pred_xgb)

    # Plain dicts keep numeric dtypes when the frame is built below.
    results_xgb.append({
        "target": temp_target_column_name,
        "r2_score": r2_score_value_xgb,
        "rmse": rmse_value_xgb,
        "mae": mae_value_xgb,
        "mse": mse_value_xgb,
    })

results_xgb_df = pd.DataFrame(results_xgb)
results_xgb_df.head()

In [None]:
# Display the full per-target XGBoost metrics table (one row per processed target).
results_xgb_df