Meta Learning

NumerAi

Tanmay Singh
2021569
CSAI
Class of '25

In [1]:
import os
import gc
import time
import json
import pickle
import numpy as np
import pandas as pd
import seaborn as sb
import xgboost as xgb
import lightgbm as lgb
import cloudpickle as cp
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [4]:
from torchsummary import summary
from torch.utils.data import DataLoader, Dataset

In [5]:
from tqdm.auto import tqdm
from scipy import stats
from numerapi import NumerAPI
from scipy.stats import pearsonr
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [6]:
from sklearn.tree import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.decomposition import *
from sklearn.preprocessing import *
from sklearn.neural_network import *
from sklearn.model_selection import *
from sklearn.cluster._kmeans import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

Creating a Feature Set

In [10]:
# Read the feature metadata that ships with the dataset. The context manager
# closes the file handle as soon as the JSON has been parsed.
with open("./data/v5.0/features.json") as features_file:
    feature_metadata = json.load(features_file)

# Print every top-level metadata key with the number of entries stored under it.
for metadata in feature_metadata:
    print(metadata, len(feature_metadata[metadata]))

feature_sets 17
targets 37


In [11]:
# Pull out the named feature sets and report the size of three of them.
feature_sets = feature_metadata["feature_sets"]
for feature_set in ("small", "medium", "all"):
    print(feature_set, len(feature_sets[feature_set]))

small 42
medium 705
all 2376


In [12]:
# Re-binds feature_sets to the same mapping as the previous cell
# (redundant when the cells are run in order).
feature_sets = feature_metadata["feature_sets"]
# Show the names of all available feature sets.
feature_sets.keys()

dict_keys(['small', 'medium', 'all', 'v2_equivalent_features', 'v3_equivalent_features', 'fncv3_features', 'intelligence', 'charisma', 'strength', 'dexterity', 'constitution', 'wisdom', 'agility', 'serenity', 'sunshine', 'rain', 'midnight'])

In [13]:
# Tabulate every feature set name alongside the number of features it holds.
for feature_set, members in feature_sets.items():
    print(f'Feature Set: {feature_set:<25}', f'Size: {len(members)}')

Feature Set: small                     Size: 42
Feature Set: medium                    Size: 705
Feature Set: all                       Size: 2376
Feature Set: v2_equivalent_features    Size: 304
Feature Set: v3_equivalent_features    Size: 1000
Feature Set: fncv3_features            Size: 400
Feature Set: intelligence              Size: 35
Feature Set: charisma                  Size: 290
Feature Set: strength                  Size: 135
Feature Set: dexterity                 Size: 51
Feature Set: constitution              Size: 335
Feature Set: wisdom                    Size: 140
Feature Set: agility                   Size: 145
Feature Set: serenity                  Size: 95
Feature Set: sunshine                  Size: 325
Feature Set: rain                      Size: 666
Feature Set: midnight                  Size: 244


Loading the Saved Experts & the Meta-Model

In [14]:
# Restore expert 1 from disk. Unpickling can execute arbitrary code, so only
# load files produced by a trusted training run.
with open('./saved_models/numerai_expert1.pkl', 'rb') as model_file:
    expert1 = pickle.load(model_file)
print("Model loaded successfully!")

Model loaded successfully!


In [15]:
# Restore expert 2 from disk. Unpickling can execute arbitrary code, so only
# load files produced by a trusted training run.
with open('./saved_models/numerai_expert2.pkl', 'rb') as model_file:
    expert2 = pickle.load(model_file)
print("Model loaded successfully!")

Model loaded successfully!


In [16]:
# Restore expert 3 from disk. Unpickling can execute arbitrary code, so only
# load files produced by a trusted training run.
with open('./saved_models/numerai_expert3.pkl', 'rb') as model_file:
    expert3 = pickle.load(model_file)
print("Model loaded successfully!")

Model loaded successfully!


In [17]:
# Restore expert 4 from disk. Unpickling can execute arbitrary code, so only
# load files produced by a trusted training run.
with open('./saved_models/numerai_expert4.pkl', 'rb') as model_file:
    expert4 = pickle.load(model_file)
print("Model loaded successfully!")

Model loaded successfully!


In [18]:
# Restore expert 5 from disk. Unpickling can execute arbitrary code, so only
# load files produced by a trusted training run.
with open('./saved_models/numerai_expert5.pkl', 'rb') as model_file:
    expert5 = pickle.load(model_file)
print("Model loaded successfully!")

Model loaded successfully!


In [19]:
# Restore expert 6 from disk. Unpickling can execute arbitrary code, so only
# load files produced by a trusted training run.
with open('./saved_models/numerai_expert6.pkl', 'rb') as model_file:
    expert6 = pickle.load(model_file)
print("Model loaded successfully!")

Model loaded successfully!


In [20]:
# Restore the meta-model from disk. Unpickling can execute arbitrary code, so
# only load files produced by a trusted training run.
with open('./saved_models/numerai_meta_model.pkl', 'rb') as model_file:
    meta_model = pickle.load(model_file)
print("Meta Model loaded successfully!")

Meta Model loaded successfully!


Loading the Validation Set, with a 'medium' feature set

In [22]:
# Select the "medium" feature set and read only the columns that are needed
# (era, target and those features) from the validation parquet file.
feature_set = feature_sets["medium"]
columns_to_load = ["era", "target", *feature_set]

val = pd.read_parquet("./data/v5.0/validation.parquet", columns=columns_to_load)


Preprocessing the Validation Set (in the same manner as the Training Set)

In [23]:
# Replace the original feature names with positional aliases
# ('feature 0', 'feature 1', ...) following the order of feature_set.
# A dict gives constant-time lookups instead of rescanning the feature list
# for every column, and keeps the cell re-runnable: on a second run
# feature_set already holds the aliases in order, so the mapping is the identity.
feature_aliases = {name: f'feature {position}' for position, name in enumerate(feature_set)}
val.rename(columns=feature_aliases, inplace=True)
# From here on feature_set holds the renamed feature columns
# (every column of val except era and target).
feature_set = val.columns.drop(["era", "target"])

In [24]:
# Store the era column as 32-bit integers.
val['era'] = val['era'].astype('int32')

In [25]:
val

Unnamed: 0_level_0,era,target,feature 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,...,feature 695,feature 696,feature 697,feature 698,feature 699,feature 700,feature 701,feature 702,feature 703,feature 704
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,575,0.75,2,1,2,4,0,4,0,0,...,3,0,0,3,4,2,0,0,1,0
n001e1318d5072ac,575,0.00,0,2,2,3,2,3,0,0,...,3,2,3,0,1,1,0,3,4,3
n002a9c5ab785cbb,575,0.50,0,2,2,3,3,3,1,1,...,2,1,2,0,2,3,3,4,2,2
n002ccf6d0e8c5ad,575,0.00,0,1,1,4,3,4,0,4,...,1,1,4,2,0,0,1,3,1,3
n0041544c345c91d,575,0.50,4,2,0,0,3,3,3,4,...,2,3,4,3,2,4,3,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffd4d6d6d717308,1140,,2,4,4,2,1,4,1,1,...,4,0,3,3,1,3,2,3,3,2
nffe0260d4bc093e,1140,,4,4,3,0,1,4,4,3,...,0,2,3,3,3,2,1,1,4,1
nffe6044b10ffebe,1140,,1,4,1,2,2,4,3,1,...,1,3,2,3,4,2,1,3,3,2
nfff646397011d0d,1140,,4,1,0,1,3,0,0,0,...,1,2,1,4,1,3,4,2,2,2


In [26]:
val.isna().any().any()

True

In [27]:
# Discard the rows whose target is missing; val now names the filtered frame,
# so the unfiltered data is no longer reachable without reloading it.
val = val.dropna(subset=['target'])
val

Unnamed: 0_level_0,era,target,feature 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,...,feature 695,feature 696,feature 697,feature 698,feature 699,feature 700,feature 701,feature 702,feature 703,feature 704
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,575,0.75,2,1,2,4,0,4,0,0,...,3,0,0,3,4,2,0,0,1,0
n001e1318d5072ac,575,0.00,0,2,2,3,2,3,0,0,...,3,2,3,0,1,1,0,3,4,3
n002a9c5ab785cbb,575,0.50,0,2,2,3,3,3,1,1,...,2,1,2,0,2,3,3,4,2,2
n002ccf6d0e8c5ad,575,0.00,0,1,1,4,3,4,0,4,...,1,1,4,2,0,0,1,3,1,3
n0041544c345c91d,575,0.50,4,2,0,0,3,3,3,4,...,2,3,4,3,2,4,3,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffc7d5c3bb883aa,1135,0.25,0,3,1,3,0,4,3,0,...,3,2,0,4,4,4,1,0,3,1
nffd2ddf669b3c4c,1135,0.25,0,0,3,1,1,1,3,1,...,4,2,3,1,4,1,2,3,1,3
nffd838736b2cb68,1135,0.50,0,3,3,3,4,1,3,2,...,1,4,0,1,2,3,4,0,3,2
nffea35973af0581,1135,0.25,1,2,2,2,1,1,3,3,...,4,3,2,4,3,3,0,3,3,3


In [28]:
unique_era = val['era'].unique()

In [None]:
val[val['era'] == unique_era[0]]

In [29]:
# test_set is another name for the same DataFrame object as val (no copy is
# made), so in-place changes made through test_set are also visible through val.
test_set = val
test_set

Unnamed: 0_level_0,era,target,feature 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,...,feature 695,feature 696,feature 697,feature 698,feature 699,feature 700,feature 701,feature 702,feature 703,feature 704
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,575,0.75,2,1,2,4,0,4,0,0,...,3,0,0,3,4,2,0,0,1,0
n001e1318d5072ac,575,0.00,0,2,2,3,2,3,0,0,...,3,2,3,0,1,1,0,3,4,3
n002a9c5ab785cbb,575,0.50,0,2,2,3,3,3,1,1,...,2,1,2,0,2,3,3,4,2,2
n002ccf6d0e8c5ad,575,0.00,0,1,1,4,3,4,0,4,...,1,1,4,2,0,0,1,3,1,3
n0041544c345c91d,575,0.50,4,2,0,0,3,3,3,4,...,2,3,4,3,2,4,3,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffc7d5c3bb883aa,1135,0.25,0,3,1,3,0,4,3,0,...,3,2,0,4,4,4,1,0,3,1
nffd2ddf669b3c4c,1135,0.25,0,0,3,1,1,1,3,1,...,4,2,3,1,4,1,2,3,1,3
nffd838736b2cb68,1135,0.50,0,3,3,3,4,1,3,2,...,1,4,0,1,2,3,4,0,3,2
nffea35973af0581,1135,0.25,1,2,2,2,1,1,3,3,...,4,3,2,4,3,3,0,3,3,3


In [30]:
# Only the last expression of a cell is displayed, so print the missing-value
# check explicitly instead of leaving it as a bare (silently discarded) expression.
print("Any missing values:", test_set.isna().any().any())
# Class balance of the raw target values.
test_set['target'].value_counts()

target
0.50    1732127
0.25     695052
0.75     694503
1.00     171861
0.00     171502
Name: count, dtype: int64

Encoding the Numeric Values in the Target into corresponding labels (class 0 to class 4)

In [31]:
# Fit the encoder on the target column, then replace the column with the
# encoded integer class ids.
# NOTE: test_set and val name the same DataFrame (test_set = val), so this
# assignment overwrites val['target'] as well.
label_encoder = LabelEncoder().fit(test_set['target'])
test_set['target'] = label_encoder.transform(test_set['target'])

In [32]:
# Split into model inputs and labels. Only 'target' is removed, so the 'era'
# column stays in test_df_x alongside the feature columns.
# NOTE(review): confirm the experts were fitted with 'era' as an input column.
test_df_y = test_set['target']
test_df_x = test_set.drop(columns=['target'])

In [33]:
test_df_x

Unnamed: 0_level_0,era,feature 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,...,feature 695,feature 696,feature 697,feature 698,feature 699,feature 700,feature 701,feature 702,feature 703,feature 704
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,575,2,1,2,4,0,4,0,0,2,...,3,0,0,3,4,2,0,0,1,0
n001e1318d5072ac,575,0,2,2,3,2,3,0,0,4,...,3,2,3,0,1,1,0,3,4,3
n002a9c5ab785cbb,575,0,2,2,3,3,3,1,1,2,...,2,1,2,0,2,3,3,4,2,2
n002ccf6d0e8c5ad,575,0,1,1,4,3,4,0,4,0,...,1,1,4,2,0,0,1,3,1,3
n0041544c345c91d,575,4,2,0,0,3,3,3,4,2,...,2,3,4,3,2,4,3,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffc7d5c3bb883aa,1135,0,3,1,3,0,4,3,0,0,...,3,2,0,4,4,4,1,0,3,1
nffd2ddf669b3c4c,1135,0,0,3,1,1,1,3,1,4,...,4,2,3,1,4,1,2,3,1,3
nffd838736b2cb68,1135,0,3,3,3,4,1,3,2,1,...,1,4,0,1,2,3,4,0,3,2
nffea35973af0581,1135,1,2,2,2,1,1,3,3,2,...,4,3,2,4,3,3,0,3,3,3


In [34]:
test_df_y

id
n000101811a8a843    3
n001e1318d5072ac    0
n002a9c5ab785cbb    2
n002ccf6d0e8c5ad    0
n0041544c345c91d    2
                   ..
nffc7d5c3bb883aa    1
nffd2ddf669b3c4c    1
nffd838736b2cb68    2
nffea35973af0581    1
nfff17061c841a61    3
Name: target, Length: 3465045, dtype: int64

In [35]:
# No resampling is applied here: test_df_x_resampled is simply another name
# for test_df_x.
test_df_x_resampled = test_df_x

test_df_x_resampled

Unnamed: 0_level_0,era,feature 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,...,feature 695,feature 696,feature 697,feature 698,feature 699,feature 700,feature 701,feature 702,feature 703,feature 704
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,575,2,1,2,4,0,4,0,0,2,...,3,0,0,3,4,2,0,0,1,0
n001e1318d5072ac,575,0,2,2,3,2,3,0,0,4,...,3,2,3,0,1,1,0,3,4,3
n002a9c5ab785cbb,575,0,2,2,3,3,3,1,1,2,...,2,1,2,0,2,3,3,4,2,2
n002ccf6d0e8c5ad,575,0,1,1,4,3,4,0,4,0,...,1,1,4,2,0,0,1,3,1,3
n0041544c345c91d,575,4,2,0,0,3,3,3,4,2,...,2,3,4,3,2,4,3,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffc7d5c3bb883aa,1135,0,3,1,3,0,4,3,0,0,...,3,2,0,4,4,4,1,0,3,1
nffd2ddf669b3c4c,1135,0,0,3,1,1,1,3,1,4,...,4,2,3,1,4,1,2,3,1,3
nffd838736b2cb68,1135,0,3,3,3,4,1,3,2,1,...,1,4,0,1,2,3,4,0,3,2
nffea35973af0581,1135,1,2,2,2,1,1,3,3,2,...,4,3,2,4,3,3,0,3,3,3


In [36]:
# No resampling is applied here: test_df_y_resampled is simply another name
# for test_df_y.
test_df_y_resampled = test_df_y

test_df_y_resampled

id
n000101811a8a843    3
n001e1318d5072ac    0
n002a9c5ab785cbb    2
n002ccf6d0e8c5ad    0
n0041544c345c91d    2
                   ..
nffc7d5c3bb883aa    1
nffd2ddf669b3c4c    1
nffd838736b2cb68    2
nffea35973af0581    1
nfff17061c841a61    3
Name: target, Length: 3465045, dtype: int64

Function to compute Label Frequencies

In [37]:
def label_frequency(predictions):
    """Print how often each distinct label occurs in `predictions`.

    Parameters
    ----------
    predictions : array-like
        Label values; anything accepted by `np.unique` (an empty input
        prints an empty dict).

    Returns
    -------
    None
        The frequencies are printed as a dict mapping label -> count.
    """
    unique, counts = np.unique(predictions, return_counts=True)
    # .tolist() converts NumPy scalars to plain Python values, so the printed
    # dict reads {1: 3, ...} regardless of how the installed NumPy renders
    # its scalar types.
    label_frequencies = dict(zip(unique.tolist(), counts.tolist()))
    print("Label frequencies:", label_frequencies)

Function to compute NumerAi Correlation

In [38]:
def numerai_corr(preds, target):
    """Correlation between rank-gaussianized predictions and centered targets.

    Both inputs are passed through a sign-preserving power of 1.5 before the
    Pearson correlation is taken.

    Parameters
    ----------
    preds : pandas.Series
        Predictions; must provide `.rank()` and `.count()`.
    target : pandas.Series or array-like with `.mean()`
        Target values, paired with `preds` by position (the ranked
        predictions are converted to a plain array via `.values`, so the
        index labels of `preds` play no role).

    Returns
    -------
    float
        Off-diagonal entry of the 2x2 correlation matrix.
    """
    # Average ranks mapped into (0, 1); the 0.5 offset keeps the values
    # strictly away from 0 and 1 before the inverse normal CDF is applied.
    pct_ranks = (preds.rank(method="average").values - 0.5) / preds.count()
    gaussianized = stats.norm.ppf(pct_ranks)

    demeaned_target = target - target.mean()

    def signed_power(values):
        # |v| ** 1.5 with the sign of v restored.
        return np.sign(values) * np.abs(values) ** 1.5

    correlation_matrix = np.corrcoef(signed_power(gaussianized), signed_power(demeaned_target))
    return correlation_matrix[0, 1]

Generating Predictions from Experts

EXPERT-1 (XGBOOST CLASSIFIER)

In [44]:
expert1_test_pred = expert1.predict(test_df_x_resampled)

In [45]:
expert1_test_pred

array([1, 0, 2, ..., 2, 3, 3])

EXPERT-2 (RANDOM FOREST CLASSIFIER)

In [46]:
expert2_test_pred = expert2.predict(test_df_x_resampled)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    4.6s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    8.2s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:   10.6s finished


In [47]:
expert2_test_pred

array([2., 4., 2., ..., 2., 4., 4.], dtype=float32)

EXPERT-3 (ADABOOST CLASSIFIER with DECISION TREE CLASSIFIER as BASE ESTIMATOR)

In [48]:
expert3_test_pred = expert3.predict(test_df_x_resampled)

In [49]:
expert3_test_pred

array([2., 4., 2., ..., 2., 0., 4.], dtype=float32)

EXPERT-4 (LOGISTIC REGRESSION)

In [50]:
expert4_test_pred = expert4.predict(test_df_x_resampled)

In [51]:
expert4_test_pred

array([2., 4., 2., ..., 2., 4., 3.], dtype=float32)

EXPERT-5 (CATBOOST CLASSIFIER)

In [52]:
expert5_test_pred = expert5.predict(test_df_x_resampled)

In [53]:
# Collapse a multi-dimensional prediction array into one dimension; an array
# that is already 1-D is kept as it is.
expert5_test_pred = expert5_test_pred if expert5_test_pred.ndim <= 1 else expert5_test_pred.ravel()

In [54]:
expert5_test_pred

array([2., 4., 2., ..., 2., 3., 3.])

EXPERT-6 (HISTOGRAM-BASED GRADIENT BOOST CLASSIFIER)

In [55]:
expert6_test_pred = expert6.predict(test_df_x_resampled)

In [56]:
expert6_test_pred

array([2., 0., 2., ..., 1., 4., 4.], dtype=float32)

Generating Predictions from the Meta-Model (LIGHTGBM)

In [57]:
# Build the meta-model input: one column per expert, in expert order 1..6.
# NOTE(review): presumably this must match the column order used when the
# meta-model was fitted — confirm against the training notebook.
expert_predictions = (
    expert1_test_pred,
    expert2_test_pred,
    expert3_test_pred,
    expert4_test_pred,
    expert5_test_pred,
    expert6_test_pred,
)
meta_test_x = np.column_stack(expert_predictions)

In [58]:
meta_test_x

array([[1., 2., 2., 2., 2., 2.],
       [0., 4., 4., 4., 4., 0.],
       [2., 2., 2., 2., 2., 2.],
       ...,
       [2., 2., 2., 2., 2., 1.],
       [3., 4., 0., 4., 3., 4.],
       [3., 4., 4., 3., 3., 4.]])

In [59]:
meta_test_y_pred = meta_model.predict(meta_test_x)

In [60]:
meta_test_y_pred

array([1.69097534, 1.47414962, 2.41139138, ..., 2.36907211, 2.68708907,
       2.7139873 ])

In [61]:
# Cut points halfway between consecutive class ids 0..4.
bins = [0.5, 1.5, 2.5, 3.5]

# np.digitize maps each continuous meta-model output to the index of the
# interval it falls in, i.e. an integer class id between 0 and len(bins).
rounded_predictions = np.digitize(meta_test_y_pred, bins)

In [62]:
rounded_predictions

array([2, 1, 2, ..., 2, 3, 3])

In [63]:
label_frequency(rounded_predictions)

Label frequencies: {1: 51060, 2: 211615, 3: 83830}


In [None]:
test_df_y_resampled

Computing Relevant Evaluation Metrics

In [64]:
# scikit-learn metrics take (y_true, y_pred): pass the true labels first so
# this call follows the same convention as the per-class metrics further down.
# (Accuracy itself is symmetric, so the reported value is unchanged.)
acc = accuracy_score(test_df_y_resampled, rounded_predictions)
print("Accuracy on Validation Set: ", acc)

Accuracy on Validation Set:  0.4350211396660943


Pearson's Correlation

In [65]:
# Linear correlation between the discretised predictions and the encoded
# targets. Only the coefficient is kept; the p-value is taken by index rather
# than unpacked into `_`, because binding `_` at notebook top level shadows
# IPython's "last output" variable.
pearson_corr = stats.pearsonr(rounded_predictions, test_df_y_resampled)[0]
print("Pearson Correlation:", pearson_corr)

Pearson Correlation: 0.01773061206926946


Reporting Class-wise Accuracies & F1 Scores

In [66]:
# Fix the label order once so every per-class metric refers to the same classes.
class_labels = np.unique(test_df_y_resampled)
# Work on plain arrays so the boolean mask is applied by position, whatever
# container (ndarray or Series, with whatever index) holds the predictions.
true_labels = np.asarray(test_df_y_resampled)
predicted_labels = np.asarray(rounded_predictions)

class_accuracies = {}

for class_label in class_labels:
    class_mask = (true_labels == class_label)
    # Accuracy restricted to the rows of one true class, i.e. the fraction of
    # that class that was predicted correctly.
    class_accuracy = accuracy_score(true_labels[class_mask], predicted_labels[class_mask])
    class_accuracies[class_label] = class_accuracy
    print(f"Accuracy for class {class_label}: {class_accuracy:.4f}")

print("\n")

# labels=class_labels pins the set and order of the returned scores to the
# labels they are printed against, even if the predictions contain a label
# that never occurs in the targets.
f1_scores = f1_score(true_labels, predicted_labels, labels=class_labels, average=None)
for class_label, f1 in zip(class_labels, f1_scores):
    print(f"F1 Score for class {class_label}: {f1:.4f}")

Accuracy for class 0: 0.0000
Accuracy for class 1: 0.1673
Accuracy for class 2: 0.6938
Accuracy for class 3: 0.2702
Accuracy for class 4: 0.0000


F1 Score for class 0: 0.0000
F1 Score for class 1: 0.1929
F1 Score for class 2: 0.6251
F1 Score for class 3: 0.2448
F1 Score for class 4: 0.0000


Saving the Predictions in a Pickle File

In [None]:
# with open('numerai_fullprediction.pkl', 'wb') as f:
#     pickle.dump(rounded_predictions, f)

# print("Predictions saved successfully to numerai_fullprediction.pkl!")

Computing NumerAi's Correlation Metric

In [67]:
# Wrap the predictions in a Series that carries the same row ids as the
# targets. np.asarray keeps the cell safe to re-run: an existing Series is
# re-wrapped by position instead of being re-indexed by label. A length
# mismatch between predictions and targets raises here instead of going unnoticed.
rounded_predictions = pd.Series(np.asarray(rounded_predictions), index=test_df_y_resampled.index)

In [68]:
# Score the discretised predictions against the encoded targets with numerai_corr.
# NOTE(review): the correlation is computed once over all rows pooled together;
# confirm whether a per-era score (then averaged) is what should be reported.
actual_corr = numerai_corr(rounded_predictions, test_df_y_resampled)
actual_corr

0.018142158486232806