In [32]:
import pandas as pd
import json
import tensorflow as tf

In [1]:
# NumerAPI is the official Python API client for Numerai
import os

import numerapi

# SECURITY: credentials must never be written into the notebook. They are read
# from the environment instead; any key pair that was previously committed in
# this cell should be treated as leaked and revoked in the Numerai account settings.
# Both values may be unset (None) -- NOTE(review): dataset listing/download is
# expected to work unauthenticated; confirm against the numerapi docs.
napi = numerapi.NumerAPI(
    public_id=os.environ.get("NUMERAI_PUBLIC_ID"),
    secret_key=os.environ.get("NUMERAI_SECRET_KEY"),
)

# v4.1 is the latest version of the dataset
[f for f in napi.list_datasets() if f.startswith("v4.1")]

['v4.1/features.json',
 'v4.1/live.parquet',
 'v4.1/live_example_preds.csv',
 'v4.1/live_example_preds.parquet',
 'v4.1/live_int8.parquet',
 'v4.1/meta_model.parquet',
 'v4.1/train.parquet',
 'v4.1/train_int8.parquet',
 'v4.1/validation.parquet',
 'v4.1/validation_example_preds.csv',
 'v4.1/validation_example_preds.parquet',
 'v4.1/validation_int8.parquet']

In [3]:
# Download the training data and feature metadata
# Download time depends on the connection: train.parquet is ~1.45GB and took
# roughly 40 minutes in the run recorded below 🍵
napi.download_dataset("v4.1/train.parquet")
napi.download_dataset("v4.1/features.json")

# In this notebook, we will be using the "small" feature set to save time and memory
# In practice, you will likely want to use all the features to maximize your model's performance
# Use a context manager so the metadata file handle is closed deterministically.
with open("v4.1/features.json", encoding="utf-8") as features_file:
    feature_metadata = json.load(features_file)
feature_cols = feature_metadata["feature_sets"]["small"]

# Load the training data but only the "small" subset of features
training_data = pd.read_parquet(
    "v4.1/train.parquet",
    columns=["era"] + feature_cols + ["target"],
)

# Let's take a look (display options are global and keep large frames truncated)
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 5)
training_data

2023-07-19 17:26:14,364 INFO numerapi.utils: starting download
v4.1/train.parquet: 1.45GB [39:34, 612kB/s]                                                                            
2023-07-19 18:05:49,284 INFO numerapi.utils: starting download
v4.1/features.json: 703kB [00:00, 1.19MB/s]                                                                            


Unnamed: 0_level_0,era,feature_bijou_penetrant_syringa,...,feature_unventilated_sollar_bason,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
n003bba8a98662e4,0001,0.00,...,0.00,0.25
n003bee128c2fcfc,0001,0.50,...,0.25,0.75
n0048ac83aff7194,0001,0.25,...,1.00,0.25
n00691bec80d3e02,0001,0.75,...,0.75,0.75
n00b8720a2fdc4f2,0001,0.00,...,0.00,0.50
...,...,...,...,...,...
nffcc1dbdf2212e6,0574,1.00,...,1.00,0.75
nffd71b7f6a128df,0574,0.75,...,0.50,0.25
nffde3b371d67394,0574,0.75,...,1.00,0.25
nfff1a1111b35e84,0574,0.25,...,0.00,0.50


In [46]:
# Split the loaded frame into model inputs and labels WITHOUT mutating
# `training_data`: aliasing it and calling .pop() removed the columns from the
# original frame too, so re-running the cell raised KeyError and `era` was lost.
# drop() returns a new frame, which makes this cell idempotent.
df1 = training_data.drop(columns=["era", "target"])
df1_target = training_data["target"]

df1_target.head()

id
n003bba8a98662e4    0.25
n003bee128c2fcfc    0.75
n0048ac83aff7194    0.25
n00691bec80d3e02    0.75
n00b8720a2fdc4f2    0.50
Name: target, dtype: float32

In [47]:
# Show the feature-only frame (the `era` and `target` columns are no longer part of it).
df1

Unnamed: 0_level_0,feature_bijou_penetrant_syringa,feature_burning_phrygian_axinomancy,...,feature_unswaddled_inenarrable_goody,feature_unventilated_sollar_bason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
n003bba8a98662e4,0.00,0.00,...,0.25,0.00
n003bee128c2fcfc,0.50,0.75,...,0.75,0.25
n0048ac83aff7194,0.25,0.25,...,0.75,1.00
n00691bec80d3e02,0.75,0.75,...,0.50,0.75
n00b8720a2fdc4f2,0.00,0.00,...,1.00,0.00
...,...,...,...,...,...
nffcc1dbdf2212e6,1.00,0.25,...,0.50,1.00
nffd71b7f6a128df,0.75,0.00,...,0.25,0.50
nffde3b371d67394,0.75,0.00,...,1.00,1.00
nfff1a1111b35e84,0.25,0.75,...,1.00,0.00


In [48]:
# Check that the feature frame converts to a tensor; the result is only displayed, not stored.
tf.convert_to_tensor(df1)

<tf.Tensor: shape=(2420521, 32), dtype=float32, numpy=
array([[0.  , 0.  , 0.5 , ..., 0.5 , 0.25, 0.  ],
       [0.5 , 0.75, 0.5 , ..., 0.5 , 0.75, 0.25],
       [0.25, 0.25, 1.  , ..., 1.  , 0.75, 1.  ],
       ...,
       [0.75, 0.  , 1.  , ..., 1.  , 1.  , 1.  ],
       [0.25, 0.75, 0.  , ..., 0.  , 1.  , 0.  ],
       [1.  , 0.  , 1.  , ..., 1.  , 0.  , 1.  ]], dtype=float32)>

In [49]:
# Keras preprocessing layer that normalizes along the last axis (the feature columns).
# This same layer object is later used as the first layer of the model.
normalizer = tf.keras.layers.Normalization(axis=-1)
# Fit the layer's normalization statistics on the full feature frame.
normalizer.adapt(df1)

In [50]:
# Spot-check: apply the adapted layer to the first three rows of features.
normalizer(df1.iloc[:3])


<tf.Tensor: shape=(3, 32), dtype=float32, numpy=
array([[-1.4139656e+00, -1.4139713e+00,  2.3273486e-04,  1.4144582e+00,
         2.4200717e-04,  2.3012166e-04, -1.4139646e+00,  2.2877278e-04,
         2.4411408e-04,  7.0734823e-01, -1.4139825e+00, -1.4139813e+00,
         1.4144485e+00, -1.4139746e+00,  1.4144351e+00,  7.0734710e-01,
        -1.4139907e+00, -1.4139552e+00,  1.4144452e+00,  2.4048988e-04,
        -1.4139848e+00,  1.4144430e+00,  1.4144469e+00, -1.4139719e+00,
        -1.4139791e+00,  1.4144425e+00,  7.0734525e-01,  1.4144350e+00,
         1.4144635e+00,  2.3399919e-04, -7.0686185e-01, -1.4139805e+00],
       [ 2.4765483e-04,  7.0734501e-01,  2.3273486e-04, -7.0686293e-01,
         2.4200717e-04,  7.0733649e-01,  1.4144582e+00,  7.0733464e-01,
         1.4144549e+00,  2.4175423e-04,  2.3029024e-04, -7.0687532e-01,
         2.3720208e-04, -7.0686764e-01,  1.4144351e+00,  2.3998426e-04,
        -7.0688480e-01,  2.5642107e-04, -7.0686799e-01, -7.0686620e-01,
         1.414

In [60]:
def get_basic_model():
    """Build and compile a small feed-forward regressor over the normalized features.

    The network is the module-level ``normalizer`` layer followed by two
    10-unit ReLU layers and a single linear output unit.

    Returns:
        A compiled ``tf.keras.Sequential`` model, ready for ``fit``.
    """
    model = tf.keras.Sequential([
        normalizer,
        tf.keras.layers.Dense(10, activation='relu'),
        tf.keras.layers.Dense(10, activation='relu'),
        tf.keras.layers.Dense(1),
    ])

    # The target holds graded values (e.g. 0.25, 0.5, 0.75), not binary labels.
    # Binary cross-entropy on logits paired with 'accuracy' compares thresholded
    # outputs against those fractional targets, which yields a meaningless metric.
    # Treat the problem as regression: the linear output is the prediction itself.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=[tf.keras.metrics.MeanAbsoluteError()])
    return model

In [59]:
# Build a fresh model and train it on the feature frame against the target column.
# NOTE(review): the traceback recorded under this cell names a "conv1d_1" layer, which
# the get_basic_model shown in this notebook does not contain, and this cell's execution
# count (59) precedes the definition cell's (60). The output looks stale -- restart the
# kernel and run all cells top to bottom to confirm.
model = get_basic_model()
model.fit(df1, df1_target, epochs=20, batch_size=128)

ValueError: Input 0 of layer "conv1d_1" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (None, 32)