In [5]:
import json
import os
import re

import numerapi
import numpy as np
import pandas as pd

In [None]:
# Base directory that the round data is downloaded into and loaded from
DIR = "data"


def download_current_data(directory: str):
    """
    Fetch the dataset of the current round unless its folder already exists.

    :param directory: Folder under which ``numerai_dataset_<round>/`` is looked
        up and, when missing, downloaded and unzipped
    """
    round_number = NAPI.get_current_round()
    dataset_dir = f"{directory}/numerai_dataset_{round_number}/"

    # Guard clause: nothing to download when the round's folder is already there
    if os.path.isdir(dataset_dir):
        print(f"You already have the newest data! Current round is: {round_number}")
        return

    print(f"Downloading new data for round: {round_number}!")
    NAPI.download_current_dataset(dest_path=directory, unzip=True)


def load_data(directory: str, reduce_memory: bool = True) -> tuple:
    """
    Read the current round's training and tournament CSVs into DataFrames.

    :param directory: Folder that contains the ``numerai_dataset_<round>/`` directory
    :param reduce_memory: When True, cast every column whose name starts with
        ``feature`` to 32-bit floats in both frames
    :return: ``(train, val, test)`` where ``val`` holds the tournament rows with
        ``data_type == "validation"`` and ``test`` the remaining tournament rows
    """
    print("Loading the data")
    round_dir = f"{directory}/numerai_dataset_{NAPI.get_current_round()}/"
    train = pd.read_csv(round_dir + "numerai_training_data.csv", encoding='ascii')
    tournament = pd.read_csv(round_dir + "numerai_tournament_data.csv", encoding='ascii')

    if reduce_memory:
        # Feature columns are picked by name prefix from the training frame and
        # the same list is applied to the tournament frame
        feature_cols = [col for col in train.columns if col.startswith("feature")]
        for frame in (train, tournament):
            frame[feature_cols] = frame[feature_cols].astype(np.float32)

    # Split the tournament file: validation rows vs. everything else
    val = tournament[tournament["data_type"] == "validation"]
    rest = tournament[tournament["data_type"] != "validation"]
    return train, val, rest


# Download, unzip and load data
# NOTE(review): both helpers read the module-level `NAPI` client, which this
# notebook only creates in a later cell — run that cell first (or move it above
# this one) so a fresh "Restart & Run All" does not hit a NameError here.
download_current_data(DIR)
train, val, test = load_data(DIR, reduce_memory=True)

Downloading new data for round: 254!


data/numerai_dataset_254.zip:  99%|█████████▉| 392M/394M [00:29<00:00, 15.3MB/s]    2021-03-07 23:54:26,803 INFO numerapi.base_api: unzipping file...


Loading the data


data/numerai_dataset_254.zip: 394MB [01:55, 3.42MB/s]                           


In [2]:
%%time

directory = 'data'

# Read the API key pair from the environment instead of embedding secrets in the
# notebook. Unset variables yield None, which is passed through to the client as-is.
example_public_id = os.environ.get("NUMERAI_PUBLIC_ID")
example_secret_key = os.environ.get("NUMERAI_SECRET_KEY")
NAPI = numerapi.NumerAPI(example_public_id, example_secret_key)

# dtype specification handed to both read_csv calls below
with open('../dtypes.json') as f:
    dtypes = json.load(f)

# Both CSVs live in the folder named after the current round
full_path = f"{directory}/numerai_dataset_{NAPI.get_current_round()}/"
train_path = full_path + "numerai_training_data.csv"
test_path = full_path + "numerai_tournament_data.csv"
train = pd.read_csv(train_path, dtype=dtypes)
test = pd.read_csv(test_path, dtype=dtypes)

CPU times: user 22.6 s, sys: 1.79 s, total: 24.4 s
Wall time: 25.1 s


In [3]:
# Stack train on top of the tournament rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True is equivalent to reset_index(drop=True).
full = pd.concat([train, test], ignore_index=True)

In [13]:
# Everything except the last five rows (same as head(-5)); the notebook repr
# truncates the display to the first and last few rows
full.iloc[:-5]

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.00,0.50,0.25,0.00,0.50,0.25,0.25,...,1.00,1.00,0.75,0.50,0.75,0.50,1.00,0.50,0.75,0.50
1,n0014af834a96cdd,era1,train,0.00,0.00,0.00,0.25,0.50,0.00,0.00,...,1.00,1.00,0.00,0.00,0.75,0.25,0.00,0.25,1.00,0.25
2,n001c93979ac41d4,era1,train,0.25,0.50,0.25,0.25,1.00,0.75,0.75,...,0.25,0.50,0.00,0.00,0.50,1.00,0.00,0.25,0.75,0.25
3,n0034e4143f22a13,era1,train,1.00,0.00,0.00,0.50,0.50,0.25,0.25,...,1.00,1.00,0.75,0.75,1.00,1.00,0.75,1.00,1.00,0.25
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.00,0.25,0.50,...,0.75,0.75,0.25,0.50,0.75,0.00,0.50,0.25,0.75,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2167696,nff84ea14908ee37,eraX,live,1.00,1.00,1.00,1.00,0.25,0.50,0.50,...,0.50,0.75,0.75,1.00,0.75,0.00,1.00,0.75,0.25,
2167697,nffaa28ef26c35a6,eraX,live,0.25,0.50,0.50,0.75,0.75,0.75,0.75,...,0.50,0.25,0.50,1.00,0.00,0.75,0.50,0.50,0.50,
2167698,nffb3bda8ebbc776,eraX,live,0.50,1.00,1.00,0.75,0.50,0.00,0.00,...,0.25,0.25,1.00,1.00,0.50,0.50,0.75,0.50,0.00,
2167699,nffbbdbcdd482c03,eraX,live,0.75,0.00,0.00,0.00,0.50,1.00,1.00,...,0.75,0.75,0.50,0.75,0.75,1.00,0.25,0.75,1.00,


In [9]:
# Persist the combined frame to a Feather file (reloaded by the next cell).
# NOTE(review): the file name says round 253, but the download log earlier in
# this notebook reports round 254 — confirm the name is not stale.
full.to_feather('data/round253.feather')

In [10]:
# Reload the cached combined frame
df = pd.read_feather('data/round253.feather')

# Partition by data_type: train + validation rows vs. test + live rows
train = df[df["data_type"].isin(['train', 'validation'])]
test = df[df["data_type"].isin(['test', 'live'])]

# Feather needs a default index, so each slice is re-indexed before writing
for frame, target_path in ((train, 'data/train.feather'), (test, 'data/test.feather')):
    frame.reset_index(drop=True).to_feather(target_path)

In [11]:
# Number of rows per era in `test`
test["era"].value_counts()

eraX      5396
era948    5367
era947    5360
era946    5360
era945    5338
          ... 
era579    4859
era685    4857
era577    4856
era576    4827
era575    4813
Name: era, Length: 301, dtype: Int64

In [12]:
# Occurrences of each id in `test`
test["id"].value_counts()

n096e83f5d55c387    1
n223393fc1df65cc    1
nd608429f7bdafa9    1
nc0a072f46394bbc    1
n85d14c0a7215b35    1
                   ..
n0f91dc7a6f92f1d    1
n9226d8c6fb05293    1
na57ebb142c96d37    1
nefc6e5277139a63    1
n128d9538a0b2188    1
Name: id, Length: 1528119, dtype: Int64