In [1]:
import pandas as pd
import numpy as np
import scipy
from joblib import load

# Per-column dtype mapping saved earlier (from the Feather Format tutorial series) - save RAM!
# NOTE(review): joblib.load unpickles the file, so only load a dtype file you trust.
dtype_dict = load('dtype_dict1.joblib')

# Compressed training CSV; the literal is split only to keep the line short.
train_datalink = (
    'https://numerai-public-datasets.s3-us-west-2.amazonaws.com/'
    'latest_numerai_training_data.csv.xz'
)
df = pd.read_csv(train_datalink, dtype=dtype_dict)

# Every column whose name starts with "feature" is used as an exposure later on.
features = [name for name in df.columns if name.startswith("feature")]


from sklearn.preprocessing import MinMaxScaler
def _neutralize(df, columns, by, proportion=1.0):
    scores = df[columns]
    exposures = df[by].values
    scores = scores - proportion * exposures.dot(np.linalg.pinv(exposures).dot(scores))
    return scores / scores.std()
def _normalize(df):
    X = (df.rank(method="first") - 0.5) / len(df)
    return scipy.stats.norm.ppf(X)
def normalize_and_neutralize(df, columns, by, proportion=1.0):
    """Rank-normalize ``columns`` and neutralize them against ``by``.

    Args:
        df: DataFrame containing the ``columns`` and ``by`` columns. It is
            not modified.
        columns: list of column labels to transform.
        by: list of column labels used as exposures for the neutralization.
        proportion: fraction of the exposure to remove, forwarded to
            ``_neutralize``.

    Returns:
        DataFrame with the transformed ``columns``, indexed like ``df``.
    """
    # Work on a copy: the caller's frame may be a groupby chunk, and pandas
    # does not support mutating the chunk passed to ``apply``.
    work = df.copy()
    # Convert the scores to a normal distribution
    work[columns] = _normalize(work[columns])
    work[columns] = _neutralize(work, columns, by, proportion)
    return work[columns]

# Neutralize the target within each era.
# group_keys=False keeps the original row index on the result; without it,
# pandas >= 2.0 prepends the era level and the assignment no longer aligns with df.
df['neutralized_target'] = df.groupby("era", group_keys=False).apply(
    lambda era_df: normalize_and_neutralize(era_df, ["target"], features, 0.8)  # set your proportion to the desired amount of feature neutralization you want
)["target"]

scaler = MinMaxScaler()
# transform back to 0-1; ravel() turns the (n, 1) scaler output into a plain 1-D column
df['neutralized_target'] = scaler.fit_transform(df[['neutralized_target']]).ravel()

In [2]:
# Sanity check: summary statistics of the rescaled column
# (min and max should be 0 and 1 after the MinMaxScaler step).
df['neutralized_target'].describe()

count    501808.000000
mean          0.491187
std           0.116901
min           0.000000
25%           0.412584
50%           0.491255
75%           0.569846
max           1.000000
Name: neutralized_target, dtype: float64

In [None]:
import pyarrow.feather as feather

# Persist the frame, including the new neutralized_target column, as an LZ4-compressed Feather file.
feather.write_feather(
    df,
    dest='df_neut_training_compressed_nomi80.feather',
    compression='lz4',
)