<a href="https://colab.research.google.com/github/hiydavid/numerai-models/blob/main/numerai_tfdf_model_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Numerai TFDF Model * WIP *
---

TFDF model
* Tensorflow Decision Forest model


---
# Load data & libraries

In [1]:
# install dependencies
!pip install -Uqq numerapi==2.4.5 tensorflow==2.5.1 tensorflow_decision_forests

In [2]:
# import dependencies
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# download the latest training dataset (takes around 30s)
training_data_url = "https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz" 
training_data = pd.read_csv(training_data_url)

In [4]:
# # download the latest tournament dataset (takes around 30s)
# tournament_data_url = "https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz"
# tournament_data = pd.read_csv(tournament_data_url)

--- 
# Convert dataframes to datasets

In [5]:
# split training into training & validation
training_data, testing_data = train_test_split(training_data, test_size = 0.2)

training_data.shape, testing_data.shape #, tournament_data.shape

((401446, 314), (100362, 314))

In [6]:
# find only the feature columns
feature_cols = training_data.columns[
    training_data.columns.str.startswith('feature')
    ].append(pd.Index(['target']))

In [7]:
# convert features data to ds object
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    training_data[feature_cols], 
    label="target",
    task=tfdf.keras.Task.REGRESSION
)

test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    testing_data[feature_cols], 
    label="target",
    task=tfdf.keras.Task.REGRESSION
)

# live_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
#     tournament_data[feature_cols], 
#     label="target",
#     task=tfdf.keras.Task.REGRESSION
# )

In [9]:
# create a TFDF model
tfdf.keras.get_all_models()

[tensorflow_decision_forests.keras.RandomForestModel,
 tensorflow_decision_forests.keras.GradientBoostedTreesModel,
 tensorflow_decision_forests.keras.CartModel]

---
# Train model

In [10]:
# configure model
model = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION)

In [12]:
# fit model
model.fit(x=train_ds)



<tensorflow.python.keras.callbacks.History at 0x7fa7b09792d0>

In [15]:
# evaluate the model
model.compile(metrics=["accuracy"])
print(model.evaluate(test_ds))

[0.0, 0.04942109435796738]


---
# Save model

In [None]:
model.save("/tmp/my_saved_model")

---
# Predict

In [None]:
# select the feature columns from the tournament data
live_features = tournament_data[feature_cols]

In [None]:
# predict the target on the live features
predictions = model.predict(live_features)

In [None]:
# predictions must have an `id` column and a `prediction_kazutsugi` column
predictions_df = tournament_data["id"].to_frame()
predictions_df["prediction_kazutsugi"] = predictions
predictions_df.head()

Unnamed: 0,id,prediction_kazutsugi
0,n0003aa52cab36c2,0.481608
1,n000920ed083903f,0.492837
2,n0038e640522c4a6,0.530817
3,n004ac94a87dc54b,0.497083
4,n0052fe97ea0c05f,0.503089


--- 
# Submit

In [None]:
# Get your API keys and model_id from https://numer.ai/submit
public_id = "WPZQRGGZADFULXQ3RRMR6OJQWALLSQTM"
secret_key = "T5ZJLYRT4652F244OLVRTITXH5W4TJRQ553RLJZLQTX7EPRU647WU33ITZPHP3FU"
model_id = "4d9bcd2c-69cb-4e17-a3f3-a4b2530cbcb8"
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [None]:
# Upload your predictions
predictions_df.to_csv("predictions.csv", index=False)
submission_id = napi.upload_predictions("predictions.csv", model_id=model_id)

2021-08-29 05:19:29,781 INFO numerapi.base_api: uploading predictions...
