# Riskiest Features

---
## Set working directory & load libraries

In [4]:
# change working directory
import os

# The project root can be overridden per machine through the
# NUMERAI_MODELS_DIR environment variable (e.g. set it to
# '/home/ubuntu/Projects/numerai-models' on lambda cloud) instead of editing
# this cell. When the variable is unset, the local checkout path is used.
PROJECT_DIR = os.environ.get(
    'NUMERAI_MODELS_DIR',
    '/Users/davidhuang/Documents/GitHub/numerai-models',  # for local
)
os.chdir(PROJECT_DIR)

In [5]:
# libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from utils.utils import (
    get_biggest_change_features, 
    neutralize,
    TARGET_COL,
    ERA_COL
)
import gc

---
## Get riskiest features

In [4]:
# get all training data, then discard every row that has a missing value
train_path = 'data/train.parquet'
training = pd.read_parquet(train_path)
training = training.dropna()

In [6]:
# get feature names
# features.json is expected to hold a "feature_sets" mapping (with "medium"
# and "small" entries) and a "feature_stats" mapping keyed by feature name.
with open("data/features.json", "r") as f:
    feature_metadata = json.load(f)

# each *_columns list is the feature list plus the target and era columns,
# i.e. everything needed to compute per-era feature/target correlations
medium_features = feature_metadata["feature_sets"]["medium"]
medium_features_columns = medium_features + [TARGET_COL, ERA_COL]

small_features = feature_metadata["feature_sets"]["small"]
small_features_columns = small_features + [TARGET_COL, ERA_COL]

# every feature that has an entry in "feature_stats"; deliberately NOT named
# `all`, which would shadow the builtin all() for the rest of the session
all_features = list(feature_metadata["feature_stats"].keys())

# "other" = features in neither the small nor the medium set; a set makes the
# membership test O(1) while the comprehension keeps the original ordering
small_or_medium = set(small_features) | set(medium_features)
other_features = [x for x in all_features if x not in small_or_medium]
other_features_columns = other_features + [TARGET_COL, ERA_COL]

In [7]:
# get 50 riskiest medium features
def _medium_era_corr(era_frame):
    """Correlate each medium feature with the target within a single era."""
    return era_frame[medium_features].corrwith(era_frame[TARGET_COL])


# one row per era, one column per medium feature
medium_feature_corrs = (
    training[medium_features_columns]
    .groupby(ERA_COL)
    .apply(_medium_era_corr)
)

# NOTE(review): get_biggest_change_features comes from utils.utils; presumably
# it ranks features by how much their per-era correlation shifts - confirm there
medium_features_riskiest_50 = get_biggest_change_features(medium_feature_corrs, 50)

In [11]:
# get 5 riskiest small features
def _small_era_corr(era_frame):
    """Correlate each small feature with the target within a single era."""
    return era_frame[small_features].corrwith(era_frame[TARGET_COL])


# one row per era, one column per small feature
small_features_corrs = (
    training[small_features_columns]
    .groupby(ERA_COL)
    .apply(_small_era_corr)
)

# NOTE(review): get_biggest_change_features comes from utils.utils; presumably
# it ranks features by how much their per-era correlation shifts - confirm there
small_features_riskiest_5 = get_biggest_change_features(small_features_corrs, 5)

In [12]:
# get 60 riskiest other features
def _other_era_corr(era_frame):
    """Correlate each remaining (non-small, non-medium) feature with the target within a single era."""
    return era_frame[other_features].corrwith(era_frame[TARGET_COL])


# one row per era, one column per "other" feature
other_feature_corrs = (
    training[other_features_columns]
    .groupby(ERA_COL)
    .apply(_other_era_corr)
)

# NOTE(review): get_biggest_change_features comes from utils.utils; presumably
# it ranks features by how much their per-era correlation shifts - confirm there
other_features_riskiest_60 = get_biggest_change_features(other_feature_corrs, 60)

In [16]:
# show lengths of feature lists (medium, small, other - one per line)
for riskiest_list in (
    medium_features_riskiest_50,
    small_features_riskiest_5,
    other_features_riskiest_60,
):
    print(len(riskiest_list))

50
5
60


---
## Save feature lists as new dictionary

In [17]:
# gather the three riskiest-feature lists under descriptive keys
riskiest_features = dict(
    riskiest_50_medium_features=medium_features_riskiest_50,
    riskiest_5_small_features=small_features_riskiest_5,
    riskiest_60_other_features=other_features_riskiest_60,
)

In [18]:
# write the riskiest-feature dictionary to disk as indented JSON
with open("data/riskiest_features.json", "w") as json_file:
    json.dump(
        riskiest_features,
        json_file,
        indent=4,
    )