Meta Learning

NumerAi

Tanmay Singh
2021569
CSAI
Class of '25

Importing the Dependencies

In [None]:
!pip install scipy
!pip install numpy
!pip install pandas
!pip install xgboost
!pip install seaborn
!pip install pyarrow
!pip install numerapi
!pip install imblearn
!pip install catboost
!pip install lightgbm
!pip install matplotlib
!pip install cloudpickle
!pip install mplcyberpunk
!pip install scikit-learn
!pip install torchsummary

In [None]:
import os
import gc
import time
import json
import pickle
import numpy as np
import pandas as pd
import seaborn as sb
import xgboost as xgb
import lightgbm as lgb
import cloudpickle as cp
import matplotlib.pyplot as plt

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
from torchsummary import summary
from torch.utils.data import DataLoader, Dataset

In [None]:
from tqdm import tqdm
from scipy import stats
from numerapi import NumerAPI
from scipy.stats import pearsonr
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
from sklearn.tree import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.decomposition import *
from sklearn.preprocessing import *
from sklearn.neural_network import *
from sklearn.model_selection import *
from sklearn.cluster._kmeans import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

Loading the Dataset

In [None]:
NumerAi = NumerAPI()

In [None]:
all_datasets = NumerAi.list_datasets()
# The version is the text before the first '/' of each dataset path.
# sorted() replaces list(set(...)): set iteration order for strings changes
# between kernel sessions, so the printed list was not reproducible.
dataset_versions = sorted({d.split('/')[0] for d in all_datasets})
print("Available versions:\n", dataset_versions)

In [None]:
DATA_VERSION = "v5.0"

# Match on "<version>/" so that a version which merely starts with the same
# characters is not picked up; this mirrors the split on '/' used to list versions.
current_version_files = [f for f in all_datasets if f.startswith(f"{DATA_VERSION}/")]
print("available", DATA_VERSION, "files:\n", current_version_files)

Creating a Feature Set

In [None]:
NumerAi.download_dataset(f"{DATA_VERSION}/features.json")

# Open the file in a context manager so the handle is closed once the JSON is parsed.
with open(f"{DATA_VERSION}/features.json") as features_file:
    feature_metadata = json.load(features_file)

# Show each top-level key of the metadata with the size of its value.
for metadata in feature_metadata:
  print(metadata, len(feature_metadata[metadata]))

In [None]:
# Report how many features three of the predefined sets contain.
feature_sets = feature_metadata["feature_sets"]
for feature_set in ("small", "medium", "all"):
  print(feature_set, len(feature_sets[feature_set]))

In [None]:
feature_sets = feature_metadata["feature_sets"]
feature_sets.keys()

In [None]:
# One aligned line per feature set: its name and the number of features in it.
for feature_set, members in feature_sets.items():
  print(f'Feature Set: {feature_set:<25}', f'Size: {len(members)}')

Loading the Saved Experts & the Meta-Model

In [None]:
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('numerai_expert1.pkl', mode='rb') as expert_file:
    expert1 = pickle.load(expert_file)
print("Model loaded successfully!")

In [None]:
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('numerai_expert2.pkl', mode='rb') as expert_file:
    expert2 = pickle.load(expert_file)
print("Model loaded successfully!")

In [None]:
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('numerai_expert3.pkl', mode='rb') as expert_file:
    expert3 = pickle.load(expert_file)
print("Model loaded successfully!")

In [None]:
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('numerai_expert4.pkl', mode='rb') as expert_file:
    expert4 = pickle.load(expert_file)
print("Model loaded successfully!")

In [None]:
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('numerai_expert5.pkl', mode='rb') as expert_file:
    expert5 = pickle.load(expert_file)
print("Model loaded successfully!")

In [None]:
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('numerai_expert6.pkl', mode='rb') as expert_file:
    expert6 = pickle.load(expert_file)
print("Model loaded successfully!")

In [None]:
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('numerai_meta_model.pkl', mode='rb') as meta_model_file:
    meta_model = pickle.load(meta_model_file)
print("Meta Model loaded successfully!")

Loading the Validation Set, with a 'medium' feature set

In [None]:
feature_set = feature_sets["medium"]

validation_path = f"{DATA_VERSION}/validation.parquet"
NumerAi.download_dataset(validation_path)

# Read only the era, the target and the selected feature columns from the parquet file.
wanted_columns = ["era", "target"] + feature_set
val = pd.read_parquet(validation_path, columns=wanted_columns)


Preprocessing the Validation Set (in the same manner as the Training Set)

In [None]:
# Build the old-name -> "feature <position>" table once instead of searching the
# feature list for every column. setdefault keeps the first position of a name,
# which is what list.index() returned.
# Iterating with enumerate also works when `feature_set` is already the pandas
# Index produced by the last line of this cell, so re-running the cell no longer
# fails on the missing `.index()` method and simply maps each name to itself.
feature_renames = {}
for position, original_name in enumerate(feature_set):
    feature_renames.setdefault(original_name, f'feature {position}')

# Columns absent from the table ("era", "target") keep their names.
# inplace=True is retained so the existing `val` frame is renamed, as before.
val.rename(columns=feature_renames, inplace=True)
feature_set = val.columns.drop(["era", "target"])

In [None]:
val['era'] = val['era'].astype('int32')

In [None]:
val

In [None]:
unique_era = val['era'].unique()

In [None]:
val[val['era'] == unique_era[0]]

In [None]:
test_set = val
test_set

In [None]:
# A cell only displays its last expression, so the missing-value check is printed
# explicitly; as a bare expression on the first line its result was never shown.
print("Any missing values:", test_set.isna().any().any())
test_set['target'].value_counts()

Encoding the Numeric Values in the Target into corresponding labels (class 0 to class 4)

In [None]:
# Learn the value -> integer-label mapping from the targets in this frame and
# overwrite the target column with the encoded labels in a single step.
label_encoder = LabelEncoder()
test_set['target'] = label_encoder.fit_transform(test_set['target'])

In [None]:
# Inputs: every column except the target. Labels: the target column alone.
test_df_y = test_set['target']
test_df_x = test_set.drop(columns=['target'])

In [None]:
test_df_x

In [None]:
test_df_y

**************************************************************************************************************************************
"DUE TO RESOURCE CONSTRAINTS, THIS NOTEBOOK WAS RUN 4 TIMES, EACH TIME ON AT MOST 10 LAKH (1,000,000) ROWS, SO THAT TOGETHER THE RUNS COVER THE FULL VALIDATION SET"
**************************************************************************************************************************************

In [None]:
# The validation rows are processed in four passes of up to 10 lakh rows each.
# Change RUN_NUMBER to select the pass instead of commenting lines in and out:
#   Run-1 -> rows [0, 1000000)        Run-2 -> rows [1000000, 2000000)
#   Run-3 -> rows [2000000, 3000000)  Run-4 -> rows [3000000, end)
# Keep the label-slicing cell and the RUN-n save cell further down in step with it.
RUN_NUMBER = 1            # 1, 2, 3 or 4
ROWS_PER_RUN = 1_000_000
TOTAL_RUNS = 4

chunk_start = (RUN_NUMBER - 1) * ROWS_PER_RUN
# The last pass is open-ended so it picks up every remaining row.
chunk_stop = None if RUN_NUMBER == TOTAL_RUNS else chunk_start + ROWS_PER_RUN

# .iloc makes the positional (row-number) slicing explicit.
test_df_x_resampled = test_df_x.iloc[chunk_start:chunk_stop]

test_df_x_resampled

In [None]:
# Select the labels for exactly the rows held in test_df_x_resampled, so the
# features and labels cannot drift apart when a different run is selected.
test_df_y_resampled = test_df_y.loc[test_df_x_resampled.index]

# .loc raises on unknown labels; a length mismatch therefore means the row index
# contains duplicate labels and the selection is no longer one-to-one.
assert len(test_df_y_resampled) == len(test_df_x_resampled), "row index is not unique"

test_df_y_resampled

Function to compute Label Frequencies

In [None]:
def label_frequency(predictions):
    """Print how many times each distinct label occurs in ``predictions``.

    Parameters
    ----------
    predictions : array-like
        Labels to count; passed straight to ``np.unique``.

    Returns
    -------
    None
        The frequencies are printed as a ``{label: count}`` dict.
    """
    unique, counts = np.unique(predictions, return_counts=True)
    # .tolist() turns NumPy scalars into built-in Python numbers so the dict
    # prints as {0: 10, 1: 7} rather than {np.int64(0): np.int64(10), ...},
    # which is how NumPy 2 renders its scalar types.
    label_frequencies = dict(zip(unique.tolist(), counts.tolist()))
    print("Label frequencies:", label_frequencies)

Function to compute NumerAi Correlation

In [None]:
def numerai_corr(preds, target):
  """Correlate gaussianised, tail-weighted predictions with the centred target.

  ``preds`` must offer ``rank``, ``count`` and ``.values`` (e.g. a pandas
  Series); ``target`` must support ``mean`` and subtraction. The two are paired
  by position, not by index label. Returns the off-diagonal entry of the 2x2
  matrix produced by ``np.corrcoef``.
  """
  def _signed_power(values):
    # Raise magnitudes to the power 1.5 while keeping each value's sign.
    return np.sign(values) * np.abs(values) ** 1.5

  # Average-rank the predictions and shift by half a rank before dividing by the
  # number of non-missing predictions, then map through the inverse normal CDF.
  pct_ranks = (preds.rank(method="average").values - 0.5) / preds.count()
  gaussianised = stats.norm.ppf(pct_ranks)

  demeaned_target = target - target.mean()

  return np.corrcoef(_signed_power(gaussianised), _signed_power(demeaned_target))[0, 1]

Generating Predictions from Experts

EXPERT-1 (XGBOOST CLASSIFIER)

In [None]:
expert1_test_pred = expert1.predict(test_df_x_resampled)

In [None]:
expert1_test_pred

EXPERT-2 (RANDOM FOREST CLASSIFIER)

In [None]:
expert2_test_pred = expert2.predict(test_df_x_resampled)

In [None]:
expert2_test_pred

EXPERT-3 (ADABOOST CLASSIFIER with DECISION TREE CLASSIFIER as BASE ESTIMATOR)

In [None]:
expert3_test_pred = expert3.predict(test_df_x_resampled)

In [None]:
expert3_test_pred

EXPERT-4 (LOGISTIC REGRESSION)

In [None]:
expert4_test_pred = expert4.predict(test_df_x_resampled)

In [None]:
expert4_test_pred

EXPERT-5 (CATBOOST CLASSIFIER)

In [None]:
expert5_test_pred = expert5.predict(test_df_x_resampled)

In [None]:
# Flatten to 1-D when the prediction array has more than one dimension;
# otherwise keep it unchanged.
expert5_test_pred = expert5_test_pred.ravel() if expert5_test_pred.ndim > 1 else expert5_test_pred

In [None]:
expert5_test_pred

EXPERT-6 (HISTOGRAM-BASED GRADIENT BOOSTING CLASSIFIER)

In [None]:
expert6_test_pred = expert6.predict(test_df_x_resampled)

In [None]:
expert6_test_pred

Generating Predictions from the Meta-Model (LIGHTGBM)

In [None]:
# One column per expert, in expert order 1..6; this matrix is the meta-model's input.
expert_outputs = (
    expert1_test_pred,
    expert2_test_pred,
    expert3_test_pred,
    expert4_test_pred,
    expert5_test_pred,
    expert6_test_pred,
)
meta_test_x = np.column_stack(expert_outputs)

In [None]:
meta_test_x

In [None]:
meta_test_y_pred = meta_model.predict(meta_test_x)

In [None]:
meta_test_y_pred

In [None]:
# Cut points 0.5, 1.5, 2.5, 3.5: np.digitize returns how many cut points each
# prediction has reached, i.e. an integer from 0 to 4.
bins = [lower + 0.5 for lower in range(4)]

rounded_predictions = np.digitize(meta_test_y_pred, bins=bins)

In [None]:
rounded_predictions

In [None]:
label_frequency(rounded_predictions)

In [None]:
test_df_y_resampled

Computing Relevant Evaluation Metrics

In [None]:
# accuracy_score is declared as accuracy_score(y_true, y_pred): ground truth first,
# predictions second. The score is the same either way, but the call now matches
# the documented order.
acc = accuracy_score(test_df_y_resampled, rounded_predictions)
print("Accuracy on Validation Set: ", acc)

Pearson's Correlation

In [None]:
# pearsonr yields the coefficient and a p-value; only the coefficient is reported.
pearson_result = stats.pearsonr(rounded_predictions, test_df_y_resampled)
pearson_corr, _ = pearson_result
print("Pearson Correlation:", pearson_corr)

Saving the Predictions in a Pickle File

RUN-1

In [None]:
# Persist this run's class predictions so the four runs can be combined later.
with open('numerai_prediction1.pkl', mode='wb') as prediction_file:
    pickle.dump(rounded_predictions, prediction_file)

print("Predictions saved successfully to numerai_prediction1.pkl!")

RUN-2

In [None]:
# with open('numerai_prediction2.pkl', 'wb') as f:
#     pickle.dump(rounded_predictions, f)

# print("Predictions saved successfully to numerai_prediction2.pkl!")

RUN-3

In [None]:
# with open('numerai_prediction3.pkl', 'wb') as f:
#     pickle.dump(rounded_predictions, f)

# print("Predictions saved successfully to numerai_prediction3.pkl!")

RUN-4

In [None]:
# with open('numerai_prediction4.pkl', 'wb') as f:
#     pickle.dump(rounded_predictions, f)

# print("Predictions saved successfully to numerai_prediction4.pkl!")

Computing NumerAi's Correlation Metric

In [None]:
rounded_predictions = pd.Series(rounded_predictions)

In [None]:
actual_corr = numerai_corr(rounded_predictions, test_df_y_resampled)
actual_corr