In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import numerapi 
import sklearn

In [2]:
#from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Numerai account whose API keys are looked up later: "graphman" or "jhmuller".
acct = "graphman"
# Model families to train; the training cells below test membership in this list.
methods = ["RFR"]
# Announce the configured methods (only RFR and PCR have a banner).
for _method, _banner in (("RFR", "Using RFR"), ("PCR", "Using PCR")):
    if _method in methods:
        print(_banner)

Using RFR


In [4]:
# Look up the API key pair for the account chosen in the configuration cell.
# `acct` is deliberately NOT reassigned here: a hard-coded reassignment in this
# cell would silently override whatever account was configured above.
# "all.napi" is parsed as CSV and must provide 'account', 'type' and 'value' columns.
napi_keys = pd.read_csv("all.napi", index_col=None)
acct_keys = napi_keys[napi_keys['account'] == acct]
public_rows = acct_keys.loc[acct_keys['type'] == 'public', 'value']
private_rows = acct_keys.loc[acct_keys['type'] == 'private', 'value']
# Fail with an explicit message instead of the opaque out-of-bounds IndexError
# that `.values[0]` raises on an empty selection.
if public_rows.empty or private_rows.empty:
    raise IndexError(f"all.napi has no public/private key pair for account {acct!r}")
public_id = public_rows.values[0]
private_key = private_rows.values[0]

In [5]:
# some API calls do not require logging in
napi = numerapi.NumerAPI(verbosity="info",
                        public_id=public_id,
                        secret_key=private_key)

In [6]:
# Local folder that holds one sub-folder per tournament round.
data_dir = "./datadir"
cur_round = napi.get_current_round()
data_path = os.path.join(data_dir, "numerai_dataset_" + str(cur_round))

# Download (and unzip) only when this round's folder is not already on disk.
if not os.path.exists(data_path):
    napi.download_current_dataset(dest_path=data_dir, unzip=True)

# Map every entry in the round folder to [full path, size in bytes].
fdict = {
    entry: [os.path.join(data_path, entry),
            os.path.getsize(os.path.join(data_path, entry))]
    for entry in os.listdir(data_path)
}
fdict

{'example_model.py': ['./datadir\\numerai_dataset_233\\example_model.py',
  8536],
 'example_model.r': ['./datadir\\numerai_dataset_233\\example_model.r', 9280],
 'example_predictions_target_kazutsugi.csv': ['./datadir\\numerai_dataset_233\\example_predictions_target_kazutsugi.csv',
  41464549],
 'numerai_tournament_data.csv': ['./datadir\\numerai_dataset_233\\numerai_tournament_data.csv',
  2621558835],
 'numerai_training_data.csv': ['./datadir\\numerai_dataset_233\\numerai_training_data.csv',
  794941377],
 'numerox_example.py': ['./datadir\\numerai_dataset_233\\numerox_example.py',
  990]}

In [10]:
# Each fdict value is [path, size]; element 0 is the path to the file.
train_path = fdict['numerai_training_data.csv'][0]
train = pd.read_csv(train_path)
# Display (rows, columns) of the training frame.
train.shape

(501808, 314)

In [11]:
# Predictors are every column whose name starts with 'feature'; the target is
# taken to be whichever column comes last in the training frame.
column_names = train.columns
x_cols = [name for name in column_names if name.startswith('feature')]
y_col = column_names[-1]

In [12]:
# Random-forest regressor on all feature columns (trained only when 'RFR' is configured).
if 'RFR' in methods:
    from sklearn.ensemble import RandomForestRegressor
    # random_state is pinned (the same seed the era walk-forward cell uses for
    # its forest) so that re-running the notebook reproduces the same model.
    rfr = RandomForestRegressor(n_estimators=3, max_depth=10, random_state=0)
    X_train = train.loc[:, x_cols]
    y_train = train.loc[:, y_col]
    rfr.fit(X_train, y_train)

In [13]:
# Display the estimator's repr. `rfr` only exists when 'RFR' is in `methods`.
rfr

RandomForestRegressor(max_depth=10, n_estimators=3)

In [None]:
# Gradient-boosting regressor with default hyper-parameters (only when 'GBR' is configured).
if 'GBR' in methods:
    from sklearn.ensemble import GradientBoostingRegressor
    X_train, y_train = train.loc[:, x_cols], train.loc[:, y_col]
    # fit() returns the estimator itself, so construction and fitting can be chained.
    gbr = GradientBoostingRegressor().fit(X_train, y_train)

In [None]:
# Principal-component regression: scale, project onto 40 components, then
# fit a linear regression (only when 'PCR' is configured).
if 'PCR' in methods:
    from sklearn.linear_model import LinearRegression
    from sklearn.decomposition import PCA
    pcr = make_pipeline(StandardScaler(), PCA(n_components=40), LinearRegression())
    X_train = train.loc[:, x_cols]
    y_train = train.loc[:, y_col]
    pcr.fit(X_train, y_train)
    # Expose the *fitted* PCA step of the pipeline. A separately constructed
    # PCA(n_components=40) is never fitted, so reading
    # `pca.explained_variance_ratio_` from it later would fail.
    pca = pcr.named_steps['pca']

In [None]:
# Partial-least-squares regression with 40 components (only when 'PLS' is configured).
if 'PLS' in methods:
    from sklearn.cross_decomposition import PLSRegression
    X_train, y_train = train.loc[:, x_cols], train.loc[:, y_col]
    # fit() returns the estimator itself, so construction and fitting can be chained.
    pls = PLSRegression(n_components=40).fit(X_train, y_train)

In [None]:
# Drop the training frame before the tournament file is read (presumably to
# free memory - the tournament CSV is the largest file in the dataset listing).
# NOTE(review): the era walk-forward cell at the bottom of the notebook still
# reads `train`, so it cannot run after this cell on a fresh top-to-bottom run.
del train

In [None]:
# Load the tournament data. The feature columns are parsed straight to float32
# rather than being loaded as float64 and downcast afterwards, which avoids
# holding a double-width copy of this multi-gigabyte file in memory.
test = pd.read_csv(
    fdict['numerai_tournament_data.csv'][0],
    dtype={col: np.float32 for col in x_cols},
)

In [None]:
# Downcast the feature columns to float32 in one vectorised conversion.
# `.astype` is the pandas idiom for a dtype change; `.apply(np.float32)` calls
# the NumPy scalar constructor once per column instead.
test[x_cols] = test[x_cols].astype(np.float32)

In [None]:
# Predict on the tournament features with a model that was actually fitted.
# Using `pls` unconditionally raises NameError unless 'PLS' is in `methods`
# (the configuration cell enables only "RFR"). PLS keeps first priority so the
# result is unchanged whenever PLS is configured.
if 'PLS' in methods:
    model = pls
elif 'PCR' in methods:
    model = pcr
elif 'GBR' in methods:
    model = gbr
elif 'RFR' in methods:
    model = rfr
else:
    raise ValueError(f"none of PLS/PCR/GBR/RFR is enabled in methods={methods!r}")
# np.ravel guarantees a 1-D vector whatever shape predict() returns, so the
# later min/max scaling and the submission column receive scalars per row.
y_pred = np.ravel(model.predict(test[x_cols]))

In [None]:
# Histogram of the raw model predictions.
fig, ax = plt.subplots(figsize=(10, 3))
ax.grid()
ax.hist(y_pred)
# Label the figure so it can be read on its own; the trailing ';' suppresses
# the repr of the return value in the notebook output.
ax.set(title="Raw predictions", xlabel="prediction", ylabel="count");

In [None]:
# Rescale the predictions linearly onto [0, 1].
# NumPy reductions replace the builtin min()/max(): the builtins iterate the
# array element by element in Python and, on 2-D input, compare whole rows
# instead of scalars.
min_y = np.min(y_pred)
max_y = np.max(y_pred)

mean_y = np.mean(y_pred)
print(min_y, max_y, mean_y)
# A constant prediction vector has zero range; dividing by it would silently
# fill adj_pred with NaN, so stop with an explicit error instead.
if max_y == min_y:
    raise ValueError("all predictions are identical; min-max scaling would divide by zero")
adj_pred = (y_pred - min_y) / (max_y - min_y)
print(np.min(adj_pred), np.max(adj_pred))

In [None]:
# Histogram of the min-max scaled predictions.
fig, ax = plt.subplots(figsize=(10, 3))
ax.grid()
ax.hist(adj_pred)
# Label the figure so it can be read on its own; the trailing ';' suppresses
# the repr of the return value in the notebook output.
ax.set(title="Scaled predictions", xlabel="scaled prediction", ylabel="count");

In [None]:
# predictions must have an `id` column and a `prediction_kazutsugi` column
predictions_df = test["id"].to_frame()
predictions_df["prediction_kazutsugi"] = adj_pred
predictions_df.head()
tourn = napi.get_tournaments()
tnum = tourn[0]['tournament']
tfile = f"jhmuller_predictions_{tnum}.csv"
predictions_df.to_csv(tfile, index=False)

In [None]:
# Upload the predictions CSV written by the previous cell through the API client.
napi.upload_predictions(tfile)


In [None]:
# Predicted vs. actual targets for the most recent validation era.
# `y_valid` (and a matching `y_pred`) are only produced by the era walk-forward
# cell further down, so check up front and explain what is missing rather than
# failing after an empty figure has already been created.
if 'y_valid' not in globals():
    raise NameError("y_valid is not defined yet - run the era walk-forward cell first")
fig, ax = plt.subplots(figsize=(10, 3))
ax.grid()
ax.scatter(y_pred, y_valid)
ax.set(title="Predicted vs. actual", xlabel="predicted", ylabel="actual");

In [None]:
# Cumulative share of variance explained by the principal components.
# The PCA step is read from the fitted `pcr` pipeline: a stand-alone PCA object
# that was never fitted has no `explained_variance_ratio_` attribute.
fitted_pca = pcr.named_steps['pca']
fig, ax = plt.subplots(figsize=(10, 3))
ax.grid()
ax.plot(np.cumsum(fitted_pca.explained_variance_ratio_))
ax.set(title="PCA cumulative explained variance",
       xlabel="component index", ylabel="cumulative ratio");

In [None]:
# Walk-forward check: fit on era N, validate on era N+1 (N = 1..4), comparing a
# PCA + linear-regression pipeline against a random forest.
# These estimators are otherwise imported only inside conditional cells, so
# import them here to keep this cell runnable whatever `methods` contains.
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# The training frame is deleted earlier in the notebook; reload it if needed.
if 'train' not in globals():
    train = pd.read_csv(fdict['numerai_training_data.csv'][0])


def _mad_and_rmse(y_true, y_hat):
    """Return (mean absolute deviation, root-mean-square error) of y_hat vs. y_true."""
    err = np.asarray(y_true) - np.asarray(y_hat)
    # RMSE squares each error before averaging. Squaring the *sum* of errors
    # (np.sum(err)**2) lets positive and negative errors cancel and is wrong.
    return np.mean(np.abs(err)), np.sqrt(np.mean(err ** 2))


pcr = make_pipeline(StandardScaler(), PCA(n_components=50), LinearRegression())
for last in range(1, 5):
    fit_rows = train['era'] == "era" + str(last)
    valid_rows = train['era'] == "era" + str(last + 1)
    X_train, y_train = train.loc[fit_rows, x_cols], train.loc[fit_rows, y_col]
    X_valid, y_valid = train.loc[valid_rows, x_cols], train.loc[valid_rows, y_col]

    # PCA + linear regression
    pcr.fit(X_train, y_train)
    pca = pcr.named_steps['pca']  # fitted PCA step of the pipeline
    y_pred = pcr.predict(X_valid)
    pca_mad, pca_rmse = _mad_and_rmse(y_valid, y_pred)
    print(last, pca_mad, pca_rmse)

    # Random forest (seeded so runs are reproducible)
    rfreg = RandomForestRegressor(n_estimators=300, max_depth=40, random_state=0)
    rfreg.fit(X_train, y_train)
    y_pred = rfreg.predict(X_valid)
    rf_mad, rf_rmse = _mad_and_rmse(y_valid, y_pred)
    print(last, rf_mad, rf_rmse)