![](https://miro.medium.com/max/4000/1*g5PtFpII33P5EeHxFZN9YA.png)

# Requirements
1. Create an account on https://numer.ai/
2. Create an API key to interact with Numerai services: https://numer.ai/account
3. Upload this notebook to Google Colab (optional)

# Data loading and splitting
Download the dataset using numerapi and split the data according to the 'data_type' feature.

In [None]:
# Install the numerapi client (notebook shell command, not plain Python).
!pip install numerapi
import numerapi
# API credentials (placeholders to replace) -- get them here: https://numer.ai/account
PUB = 'your_public_key'
SEC = 'your_secret_key'
# API client built from the credentials above; the later cells reuse `napi`.
napi = numerapi.NumerAPI(verbosity="info", public_id=PUB, secret_key=SEC)
# Download the current dataset and unzip it.
napi.download_current_dataset(unzip=True)

In [None]:
# Folder name built from the current round number; the CSV files are read from it below.
folder = f'numerai_dataset_{napi.get_current_round()}'

In [None]:
import pandas as pd

# Load both CSV files from the dataset folder, using the first column as the index.
train = pd.read_csv(f'{folder}/numerai_training_data.csv', index_col=0)
test = pd.read_csv(f'{folder}/numerai_tournament_data.csv', index_col=0)

In [None]:
# The last column is the target; every other column is kept on the X side for now.
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]

# The tournament file holds several subsets, told apart by the 'data_type' column.
# Filter the validation rows once and reuse the result for both X and y.
validation = test[test.data_type == 'validation']
X_validation = validation.iloc[:, :-1]
# Fixed: a stray closing parenthesis made this line a syntax error.
y_validation = validation.iloc[:, -1]

# No target is extracted for the 'test' and 'live' rows.
X_test = test[test.data_type == 'test'].iloc[:, :-1]
X_live = test[test.data_type == 'live'].iloc[:, :-1]

## Convert the target from continuous to ordinal
- 0.00: 0
- 0.25: 1
- 0.50: 2
- 0.75: 3
- 1.00: 4

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training target, then map both targets to integer class codes.
le = LabelEncoder().fit(y_train)
y_train, y_validation = le.transform(y_train), le.transform(y_validation)

# Simple modeling
/!\ In order to have competitive models, feature engineering has to be done beforehand.

In [None]:
# Remove the 'era' and 'data_type' columns from every feature frame before modelling.
# The frames are rebound to new objects instead of being modified with inplace=True:
# they are slices of `train`/`test`, and mutating a slice in place can raise pandas'
# SettingWithCopyWarning. errors='ignore' lets the cell be re-run once the columns
# are already gone instead of failing with a KeyError.
datasets = [
    dataset.drop(columns=['era', 'data_type'], errors='ignore')
    for dataset in (X_train, X_validation, X_test, X_live)
]
X_train, X_validation, X_test, X_live = datasets

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features, then fit a class-balanced logistic regression.
scaler = StandardScaler()
classifier = LogisticRegression(class_weight='balanced', max_iter=100)
ppl = make_pipeline(scaler, classifier)
ppl.fit(X_train, y_train)

# Run the fitted pipeline on every split, in the same order as before.
pred_train, pred_validation, pred_test, pred_live = [
    ppl.predict(features)
    for features in (X_train, X_validation, X_test, X_live)
]

# Evaluation
Compare the prediction quality (Spearman correlation with the target) on the train and validation sets.

In [None]:
# Fixed: spearmanr was called without being imported, which raised a NameError.
from scipy.stats import spearmanr

# Spearman rank correlation between predictions and targets, shown for the
# train split first and the validation split second so the two can be compared.
display(spearmanr(pred_train, y_train))
display(spearmanr(pred_validation, y_validation))

# Deployment
Concatenate all the predictions and upload them to the platform

In [None]:
# Small workaround: the platform does not allow predictions that are exactly 0 or 1.
def fix_prob(x):
    """Move a prediction off the exact bounds 0 and 1.

    A value equal to 0 is raised by 0.01 and a value equal to 1 is lowered
    by 0.01; any other value is returned unchanged.
    """
    if x == 0:
        return x + 0.01
    if x == 1:
        return x - 0.01
    return x

In [None]:
# Submission column name: 'prediction_' followed by the second '_'-separated
# token of the target column name (the last column of `train`).
target_column = train.columns[-1]
name_preds = 'prediction_' + target_column.split('_')[1]
name_preds

In [None]:
# Assemble one prediction per tournament row, in validation / test / live order.
# Fixed: the validation rows previously used y_validation (the true labels)
# instead of the model output pred_validation, so labels were being submitted
# as if they were predictions.
prediction = pd.concat([pd.Series(index=X_validation.index, data=pred_validation),
                        pd.Series(index=X_test.index, data=pred_test),
                        pd.Series(index=X_live.index, data=pred_live)], axis=0)
# Map the encoded class labels back to the original target values.
prediction = pd.Series(index=prediction.index, data=le.inverse_transform(prediction), name=name_preds).round(2)
# Nudge values that are exactly 0 or 1 off the bounds before writing the file.
prediction = prediction.apply(fix_prob)
prediction.to_csv('preds.csv', header=True)

In [None]:
# Upload the predictions file written above; the call's return value is kept in submission_id.
submission_id = napi.upload_predictions("preds.csv")
# Check the submission status. Called without arguments, so it presumably
# reports on the most recent upload -- confirm against the numerapi docs.
napi.submission_status()