In [None]:
import json

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupShuffleSplit

# Enable IPython's autoreload extension in mode 2.
# NOTE(review): no project-local module is imported in the visible cells, so
# this only matters if later cells import local code — confirm it is needed.
%load_ext autoreload
%autoreload 2

In [None]:
# --- Run configuration -------------------------------------------------------
# Everything a reader might tune lives here; later cells only read these names.
download_old_round = True  # gates the parquet load in the data-loading cell
target_column_name = 'target'
round_number = 1112
data_folder = 'data_folder'
data_version = 'v5.0'
live_example_preds_filename = 'live_example_preds.parquet'
live_example_round_filename = 'live_example_round.parquet'
train_file_name = 'train.parquet'
validation_example_filename = 'validation_example_round.parquet'
validation_filename = 'validation.parquet'
feature_set_chosen = 'small'

# --- Feature metadata --------------------------------------------------------
# Read features.json through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(f"{data_folder}/{data_version}/{round_number}/features.json") as features_file:
    feature_metadata = json.load(features_file)

# Print every top-level metadata key together with the size of its value.
for metadata in feature_metadata:
    print(metadata, len(feature_metadata[metadata]))

# --- Column selections -------------------------------------------------------
# `week_no` and `year_horizon` are not in the parquet file; they are derived
# from `era` in a later cell and are used both as split groups and as features.
group_split_list = ['week_no', 'year_horizon']
feature_set = feature_metadata["feature_sets"][feature_set_chosen]
target_set = feature_metadata['targets']

# Columns kept in the modelling frame: features + the single target + groups.
relevant_columns_list = feature_set + [target_column_name] + group_split_list

# Columns handed to the model as X.
all_features_list = feature_set + group_split_list

In [None]:
# Load the training data for the configured round, restricted to the era
# column, the chosen feature set and all target columns.
# The file name comes from `train_file_name` so the path stays in sync with the
# configuration cell instead of duplicating the literal.
# NOTE(review): when `download_old_round` is False nothing in this cell defines
# `raw_df`, so the `.head()` below relies on it already existing in the kernel.
if download_old_round:
    raw_df = pd.read_parquet(
        f"{data_folder}/{data_version}/{round_number}/{train_file_name}",
        columns=['era'] + feature_set + target_set
    )

raw_df.head()

In [None]:
# Display the columns kept for modelling (features + target + group columns).
relevant_columns_list

In [None]:
# Convert `era` to an integer once and reuse it, rather than re-casting the
# column for every derived field.
era_number = raw_df['era'].astype(int)
era_offset = era_number - 1  # zero-based so the modulo/floor-div below line up

# `assign` returns a new frame, so `raw_df` is left untouched and this cell can
# be re-run safely.
# Every block of 52 consecutive eras is treated as one "year":
#   week_no      -> position inside the block, 1..52
#   year_horizon -> 1-based index of the block
# NOTE(review): this assumes one era per week — confirm against the dataset.
sliced_df = raw_df.assign(
    era=era_number,
    week_no=era_offset % 52 + 1,
    year_horizon=era_offset // 52 + 1,
)

# Modelling frame: chosen features, the single target and the group columns.
train_df = sliced_df[relevant_columns_list]

In [None]:
# Correlation of every modelling column with the target, strongest positive
# first. The target column is looked up through `target_column_name` so this
# cell follows the configuration instead of a hard-coded 'target' literal, and
# `train_df` is reused because it already is `sliced_df[relevant_columns_list]`.
# The target's own entry (correlation with itself) is part of the output.
correlations = (
    train_df
    .corr(numeric_only=True)[target_column_name]
    .sort_values(ascending=False)
)
print(correlations)

In [None]:
# One integer id per (week_no, year_horizon) combination; rows sharing an id
# are kept together on the same side of the split.
group_indices_series = train_df.groupby(group_split_list).ngroup()

# A single split is all that is consumed downstream. Previously two splits were
# generated and a loop overwrote its variables each iteration, silently
# discarding the first split; requesting exactly one makes the intent explicit.
# NOTE(review): test_size=0.7 sends ~70% of the groups to the test side —
# confirm this is intended and not a swapped train/test proportion.
gss = GroupShuffleSplit(n_splits=1, test_size=0.7, random_state=42)
train_idx, test_idx = next(gss.split(train_df, groups=group_indices_series))

train_split_df = train_df.iloc[train_idx]
test_split_df = train_df.iloc[test_idx]

# Feature matrices include the group columns (see `all_features_list`).
X_train = train_split_df[all_features_list]
y_train = train_split_df[target_column_name]

X_test = test_split_df[all_features_list]
y_test = test_split_df[target_column_name]


In [None]:
# Fit a random forest on the training split. A fixed seed makes the fitted
# model, and therefore the metrics below, reproducible across kernel restarts.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

# Evaluate on the held-out groups. MSE is computed once and RMSE derived from
# it instead of calling mean_squared_error twice.
r2_score_value_rf = r2_score(y_test, y_pred_rf)
mse_value_rf = mean_squared_error(y_test, y_pred_rf)
rmse_value_rf = np.sqrt(mse_value_rf)
mae_value_rf = mean_absolute_error(y_test, y_pred_rf)

print(f"Random Forest R2 Score: {r2_score_value_rf:.4f}")
print(f"Random Forest RMSE: {rmse_value_rf:.2f}")
print(f"Random Forest MAE: {mae_value_rf:.2f}")
print(f"Random Forest MSE: {mse_value_rf:.2f}")