In [None]:
import logging

import os
import json
import time
import joblib

import numpy as np
import random as rn
import pandas as pd

from numerapi import NumerAPI
from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)

# Numerai API credentials. Prefer the NUMERAI_PUBLIC_ID / NUMERAI_SECRET_KEY
# environment variables so real keys never have to be saved inside the
# notebook; the "FILLTHISIN" placeholders remain as a fallback for manual editing.
public_id = os.environ.get("NUMERAI_PUBLIC_ID", "FILLTHISIN")
secret_key = os.environ.get("NUMERAI_SECRET_KEY", "FILLTHISIN")
# Single API client shared by every cell below.
napi = NumerAPI(public_id=public_id, secret_key=secret_key, verbosity='info')

In [None]:
# Tell the user whether the API reports a freshly opened round.
round_message = (
    "new round has started within the last 24hours!"
    if napi.check_new_round()
    else "no new round within the last 24 hours"
)
print(round_message)

In [None]:
# Round numbers: the current one names this round's live data and prediction
# file, the previous one is used to locate last round's prediction file.
current_round = napi.get_current_round()
previous_round = current_round - 1
# (the format strings previously ended in a stray apostrophe)
logging.info("Latest numerai dataset number is: %s", current_round)
logging.info("Previous numerai dataset number is: %s", previous_round)

In [None]:
# Work out which models have already submitted, so the cells below can skip
# models that are done.
example_model_id = napi.get_models()['FILLTHISIN']
# example_model2_id = napi.get_models()['FILLTHISIN']

# Reduce the submission_status() result to a plain True/False flag per model.
example_model_submitted = bool(napi.submission_status(example_model_id))
# example_model2_submitted = bool(napi.submission_status(example_model2_id))

# True only when every tracked model has submitted; append
# "and example_model2_submitted" once the second model is enabled.
allmodels_submitted = example_model_submitted

In [None]:
# Dataset version directory and the parquet files that live inside it.
SRC = "v4"

CURRENT_TRAININGDATA = f"{SRC}/train.parquet"
CURRENT_LIVEDATA = f"{SRC}/live_{current_round}.parquet"  # one live file per round
CURRENT_VALIDATIONDATA = f"{SRC}/validation.parquet"

In [None]:
# Shared data preparation for the model cells below: load the "small" feature
# set, rank the riskiest features on the training data and clean the live data.
# Change the feature set here or use your own feature selection/engineering.
if not allmodels_submitted:
    print('Reading minimal training data')
    # read the feature metadata and get a feature set (or all the features)
    with open(SRC + "/features.json", "r") as f:
        feature_metadata = json.load(f)
    features = feature_metadata["feature_sets"]["small"]  # get the small feature set
    read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

    # load only the needed columns, using the path constants from the cell above
    logging.info("loading data")
    training_data = pd.read_parquet(CURRENT_TRAININGDATA, columns=read_columns)
    live_data = pd.read_parquet(CURRENT_LIVEDATA, columns=read_columns)

    # getting the per era correlation of each feature vs the target
    all_feature_corrs = training_data.groupby(ERA_COL).apply(
        lambda era: era[features].corrwith(era[TARGET_COL])
    )

    # find the riskiest features by comparing their correlation vs
    # the target in each half of training data; we'll use these later
    riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

    # select the live rows once; DATA_TYPE_COL is the same column that was
    # requested in read_columns above
    live_rows = live_data[live_data[DATA_TYPE_COL] == "live"]
    nans_per_col = live_rows[features].isna().sum()

    # check for nans and fill nans
    if nans_per_col.any():
        total_rows = len(live_rows)
        print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
        print(f"out of {total_rows} total rows")
        print("filling nans with 0.5")
        # the fill is applied to every row of live_data, not only the live rows
        live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
    else:
        print("No nans in the features this week!")
else:
    print("All models already submitted")

In [None]:
# Example model: predict on the live data, neutralize the predictions against
# the riskiest features, write them to CSV and upload them.
# This should be customized and extended to your own model(s).
if not example_model_submitted:

    PREDICTION_FILE = "./predictions/prediction_examplemodel_v4_" + str(current_round) + ".csv"
    PREVIOUS_PREDICTION_FILE = "./predictions/prediction_examplemodel_v4_" + str(previous_round) + ".csv"
    MODEL_FILE = "./models/examplemodel_v4/model.pkl"
    MAX_UPLOAD_ATTEMPTS = 2  # same number of tries as the original retry loop

    logging.info("PREDICTION_FILE is: %s", PREDICTION_FILE)
    logging.info("PREVIOUS PREDICTION FILE is: %s", PREVIOUS_PREDICTION_FILE)
    logging.info("MODEL_FILE is: %s", MODEL_FILE)

    # remove last round's prediction file so old files do not accumulate
    if os.path.isfile(PREVIOUS_PREDICTION_FILE):
        os.remove(PREVIOUS_PREDICTION_FILE)
        print("File Deleted successfully")
    else:
        print("File does not exist")

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files you created yourself or fully trust.
    model = joblib.load(MODEL_FILE)

    live_data["preds"] = model.predict(live_data[features])

    live_data["preds_neutral_riskiest_50"] = neutralize(
        df=live_data,
        columns=["preds"],
        neutralizers=riskiest_features,
        proportion=1.0,
        normalize=True,
        era_col=ERA_COL
    )

    # rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
    live_data["prediction"] = live_data["preds_neutral_riskiest_50"].rank(pct=True)
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(os.path.dirname(PREDICTION_FILE), exist_ok=True)
    live_data["prediction"].to_csv(PREDICTION_FILE)

    # Upload with a small retry budget. Failures are logged instead of being
    # silently swallowed, and KeyboardInterrupt is no longer caught.
    for attempt in range(1, MAX_UPLOAD_ATTEMPTS + 1):
        try:
            submission_id = napi.upload_predictions(PREDICTION_FILE, model_id=example_model_id)
        except Exception:
            logging.exception("upload attempt %s of %s failed", attempt, MAX_UPLOAD_ATTEMPTS)
            if attempt < MAX_UPLOAD_ATTEMPTS:
                time.sleep(10)  # wait 10 seconds before retrying
        else:
            time.sleep(3)
            print("submitted example_model")
            break
    else:
        # every attempt failed: say so explicitly rather than ending quietly
        print(f"could not submit example_model after {MAX_UPLOAD_ATTEMPTS} attempts")
else:
    print("example_model already submitted")

In [None]:
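# Example model 2 (disabled template): to submit a second model, uncomment this cell
# and the example_model2 lines in the submission-status cell, then customize it for your own model.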
# if not example_model2_submitted:
    
#     PREDICTION_FILE = "./predictions/prediction_examplemodel2_v4_" + str(current_round) + ".csv"
#     PREVIOUS_PREDICTION_FILE = "./predictions/prediction_examplemodel2_v4_" + str(previous_round) + ".csv"
#     MODEL_FILE = "./models/examplemodel2_v4/model.pkl"

#     logging.info("PREDICTION_FILE is: %s", PREDICTION_FILE)
#     logging.info("PREVIOUS PREDICTION FILE is: %s", PREVIOUS_PREDICTION_FILE)
#     logging.info("MODEL_FILE is: %s", MODEL_FILE)

#     if(os.path.isfile(PREVIOUS_PREDICTION_FILE)):
#         os.remove(PREVIOUS_PREDICTION_FILE)
#         #Printing the confirmation message of deletion
#         print("File Deleted successfully")
#     else:
#         print("File does not exist")
    
#     model = joblib.load(MODEL_FILE)

#     live_data["preds"] = model.predict(live_data[features])

#     live_data[f"preds_neutral_riskiest_50"] = neutralize(
#         df=live_data,
#         columns=[f"preds"],
#         neutralizers=riskiest_features,
#         proportion=1.0,
#         normalize=True,
#         era_col=ERA_COL
#     )

#     # rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
#     live_data["prediction"] = live_data[f"preds_neutral_riskiest_50"].rank(pct=True)
#     live_data["prediction"].to_csv(PREDICTION_FILE)
    
#     i = 1
#     while i < 3:    
#         try:
#             submission_id = napi.upload_predictions(PREDICTION_FILE, model_id=example_model2_id)
#             time.sleep(3)
#             i = 3
#             print("submitted example_model2")
#         except:
#             time.sleep(10) # Sleep for 10 seconds
#             i += 1
# else:
#     print("example_model2 already submitted")