First, import the Python packages this notebook uses.

In [1]:
# Numerai's library for working with Signals
from numerapi import SignalsAPI
import pandas as pd

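This will download the necessary datasets as Parquet files: the training, validation, and live data, plus the example predictions for the live and validation sets.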

In [None]:
# All files come from the same Signals dataset release; changing the version
# here updates every download at once.
DATASET_VERSION = "signals/v2.0"

# Files to fetch, in download order. Each one is saved in the working
# directory under the same name it has on the server.
DATASET_FILES = [
    "train.parquet",
    "validation.parquet",
    "live.parquet",
    "live_example_preds.parquet",
    "validation_example_preds.parquet",
]

api = SignalsAPI()
for filename in DATASET_FILES:
    # Remote path is "<version>/<file>"; the local copy keeps the bare file name.
    api.download_dataset(f"{DATASET_VERSION}/{filename}", filename)

In [2]:
# Load the training split from the local parquet file
df = pd.read_parquet("train.parquet")

# Plain-text preview of the first five rows
print(df.head(n=5))

  numerai_ticker feature_country composite_figi        date data_type  \
0      000270 KR              KR           None  2003-01-03     train   
1      000810 KR              KR           None  2003-01-03     train   
2      001040 KR              KR           None  2003-01-03     train   
3      001230 KR              KR           None  2003-01-03     train   
4      002790 KR              KR           None  2003-01-03     train   

   feature_adv_20d_factor  feature_beta_factor  feature_book_to_price_factor  \
0                1.042680                  NaN                      0.059775   
1                1.146930                  NaN                     -0.426994   
2                0.918285                  NaN                      0.571037   
3                0.223996                  NaN                      1.105375   
4                0.656947                  NaN                      0.606041   

   feature_dividend_yield_factor  feature_earnings_yield_factor  ...  \
0       

Pay attention to which features and targets are available for each ticker.

In [3]:
# Print a single row vertically so every column name is shown next to its value
first_row = df.iloc[0]
print(first_row)

numerai_ticker                            000270 KR
feature_country                                  KR
composite_figi                                 None
date                                     2003-01-03
data_type                                     train
feature_adv_20d_factor                      1.04268
feature_beta_factor                             NaN
feature_book_to_price_factor               0.059775
feature_dividend_yield_factor              0.135088
feature_earnings_yield_factor             -0.073863
feature_growth_factor                      0.605494
feature_impact_cost_factor                      NaN
feature_market_cap_factor                  1.038619
feature_momentum_12w_factor                     NaN
feature_momentum_26w_factor                     NaN
feature_momentum_52w_factor                     NaN
feature_momentum_52w_less_4w_factor             NaN
feature_ppo_60d_130d_country_ranknorm           NaN
feature_ppo_60d_90d_country_ranknorm            NaN
feature_pric

Get an idea of the span of dates covered by the training data.

In [4]:
# Distinct values found in the date column
unique_dates = df["date"].unique()
print(unique_dates)

<StringArray>
['2003-01-03', '2003-01-10', '2003-01-17', '2003-01-24', '2003-01-31',
 '2003-02-07', '2003-02-14', '2003-02-21', '2003-02-28', '2003-03-07',
 ...
 '2012-10-26', '2012-11-02', '2012-11-09', '2012-11-16', '2012-11-23',
 '2012-11-30', '2012-12-07', '2012-12-14', '2012-12-21', '2012-12-28']
Length: 522, dtype: string


One of the most important skills in data science is knowing how and when to clean data, and when to throw bad data away entirely.

In [5]:
# Number of missing values in each column of the raw training frame
nan_counts = df.isna().sum()

# Report only the columns that actually contain NaNs, worst offenders first
columns_with_nans = nan_counts[nan_counts > 0]
print(columns_with_nans.sort_values(ascending=False))

composite_figi                           2536318
feature_momentum_52w_less_4w_factor       292462
feature_momentum_52w_factor               291298
feature_trix_130d_country_ranknorm        199436
feature_momentum_26w_factor               127779
feature_trix_60d_country_ranknorm          80501
feature_momentum_12w_factor                58469
feature_ppo_60d_130d_country_ranknorm      57437
feature_rsi_130d_country_ranknorm          57418
feature_beta_factor                        52902
feature_volatility_factor                  52902
feature_impact_cost_factor                 52902
feature_rsi_90d_country_ranknorm           39396
feature_ppo_60d_90d_country_ranknorm       39390
feature_rsi_60d_country_ranknorm           25989
feature_growth_factor                       4582
feature_book_to_price_factor                 388
feature_earnings_yield_factor                384
dtype: int64


In [6]:
# Total number of rows, for comparison with the per-column NaN counts
len(df.index)

2536318

In [7]:
# composite_figi is NaN in every row of the training data (its NaN count
# equals the row count), so it carries no information and is removed.
# Rebinding `df` instead of using inplace=True, and ignoring an already-absent
# column, keeps this cell safe to re-run without raising a KeyError.
df = df.drop(columns=["composite_figi"], errors="ignore")

In [8]:
# Maximum number of missing feature values a row may have and still be kept.
# 0 is the "nuclear" option: any row with a NaN in a feature column is dropped.
MAX_MISSING_FEATURES = 0   # raise this to keep partially populated rows

# 1. Select only the feature columns
feature_cols = [col for col in df.columns if col.startswith("feature_")]

# 2. Count missing values per row in those columns
missing_counts = df[feature_cols].isna().sum(axis=1)

# 3. Keep only the rows within the allowed number of missing feature values
df_clean = df[missing_counts <= MAX_MISSING_FEATURES]

In [10]:
# Re-check the cleaned frame: count NaNs per column and list any column that
# still has some (an empty Series means there are none left)
nan_counts = df_clean.isna().sum()
has_nans = nan_counts > 0
print(nan_counts[has_nans].sort_values(ascending=False))

Series([], dtype: int64)


We have now done the bare minimum of cleaning needed to start using this data in a meaningful way.
In the next Jupyter notebooks, we will start using this data to predict future price action.

In [11]:
# Print the first surviving row of the cleaned frame, one column per line
first_clean_row = df_clean.iloc[0, :]
print(first_clean_row)

numerai_ticker                            000100 KR
feature_country                                  KR
date                                     2004-01-09
data_type                                     train
feature_adv_20d_factor                     0.009756
feature_beta_factor                       -0.237552
feature_book_to_price_factor               -0.36067
feature_dividend_yield_factor             -0.076441
feature_earnings_yield_factor              0.266838
feature_growth_factor                     -1.264913
feature_impact_cost_factor                -0.218123
feature_market_cap_factor                 -0.519312
feature_momentum_12w_factor                0.178576
feature_momentum_26w_factor               -0.178896
feature_momentum_52w_factor                0.030522
feature_momentum_52w_less_4w_factor       -0.179817
feature_ppo_60d_130d_country_ranknorm     -0.106543
feature_ppo_60d_90d_country_ranknorm       0.076032
feature_price_factor                       1.069491
feature_rsi_