# Downloading the Datasets

In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')

# Move into the project folder so the relative 'Data/...' paths used by later
# cells resolve. NOTE(review): the path is relative, so this only works when the
# current directory is /content (re-running the cell in the same session will fail).
%cd gdrive/My Drive/Colab Notebooks/Numerai

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/Numerai


In [2]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and performance warnings raised by
# pandas and other libraries used below.
warnings.simplefilter("ignore")

In [3]:
# Show full cell contents and every row when displaying frames.
# `None` is the supported "no limit" value for max_colwidth; the legacy `-1`
# was deprecated in pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [4]:
# Load the raw training and validation tables from the project Data folder.
training_df = pd.read_csv('Data/numerai_training_data.csv')
validation_df = pd.read_csv('Data/numerai_validation_data.csv')

# Column-name constants (plain strings: nothing is interpolated, so no f-prefix).
TARGET_NAME = "target_kazutsugi"
PREDICTION_NAME = "prediction"
# Every column whose name starts with "feature" is treated as a model input.
feature_names = [f for f in training_df.columns if f.startswith("feature")]

# Downcast the feature and target columns of both frames to float16
# (presumably to reduce memory use -- the cast is applied identically to each frame).
for _df in (training_df, validation_df):
    _df[feature_names] = _df[feature_names].astype(np.float16)
    _df[TARGET_NAME] = _df[TARGET_NAME].astype(np.float16)

In [5]:
def score(df):
    """Return the correlation between the target and percentile-ranked predictions.

    Reads the `PREDICTION_NAME` and `TARGET_NAME` columns of `df`. Predictions
    are converted to percentile ranks (ties broken by order of appearance)
    before the Pearson correlation coefficient with the target is taken.
    """
    ranked_predictions = df[PREDICTION_NAME].rank(method="first", pct=True)
    correlation_matrix = np.corrcoef(df[TARGET_NAME], ranked_predictions)
    # Off-diagonal entry = correlation between the two series.
    return correlation_matrix[0, 1]

# Final features XGBoost
(PCA components: n=8; autoencoder encoding size: n=11)

In [9]:
# Work on deep copies: the feature-adding helpers below insert columns into the
# frames they receive, and the base frames are reused later in the notebook.
train_df = training_df.copy(deep=True)
val_df = validation_df.copy(deep=True)

## PCA

In [7]:
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

In [10]:
def add_n_extra_features(n, train_df, val_df, random_state=0):
  """Append the first `n` PCA components of the base features to both frames.

  The PCA is fitted on `train_df[feature_names]` only and then applied to
  `val_df[feature_names]`, so no validation rows influence the projection.
  Component `j` is stored in a new column named ``feature_pcaencoding{j}``.

  Args:
    n: number of principal components to compute and append.
    train_df: training frame; must contain every column in the module-level
      `feature_names` list. Modified in place.
    val_df: validation frame with the same feature columns. Modified in place.
    random_state: seed forwarded to `PCA` so that solvers which use random
      numbers give repeatable components. Pass `None` for unseeded behaviour.

  Returns:
    The same `(train_df, val_df)` objects, now carrying the extra columns.

  Note:
    Another function with this same name is defined later in the notebook and
    replaces this one once its cell has run.
  """
  pca = PCA(n_components=n, random_state=random_state)
  train_components = pca.fit_transform(train_df[feature_names])
  val_components = pca.transform(val_df[feature_names])

  for j in range(n):
    column = f"feature_pcaencoding{j}"
    train_df[column] = train_components[:, j]
    val_df[column] = val_components[:, j]

  return train_df, val_df

In [11]:
# Append 8 PCA component columns for the XGBoost feature set.
train_df, val_df = add_n_extra_features(8, train_df, val_df)

## Autoencoding

In [8]:
from keras.models import Model
from keras.layers import Dense, Input, concatenate, Dropout
from keras.regularizers import l2

import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [13]:
def add_n_extra_features(n, train_df, val_df):
  """Append an `n`-dimensional autoencoder encoding of the base features.

  A dense autoencoder (256-128-64-16-n-16-64-128-256) is trained to reconstruct
  `train_df[feature_names]`; the `n`-unit bottleneck activations are then
  computed for both frames and stored in columns ``feature_autoencoding{j}``.

  Args:
    n: size of the bottleneck layer, i.e. number of columns appended.
    train_df: training frame; must contain every column in the module-level
      `feature_names` list. Used to fit the autoencoder. Modified in place.
    val_df: validation frame with the same feature columns. Modified in place.

  Returns:
    The same `(train_df, val_df)` objects, now carrying the extra columns.

  Note:
    This definition reuses the name of the PCA helper defined earlier in the
    notebook and replaces it once this cell has run. Training is not seeded,
    so the encoding values differ between runs.
  """
  # Size the input/output layers from the feature list instead of a literal,
  # so the network always matches the data it is given.
  n_inputs = len(feature_names)

  input_df = Input(shape=(n_inputs,))

  # Encoder: progressively narrower ReLU layers down to the sigmoid bottleneck.
  encoded = input_df
  for units in (256, 128, 64, 16):
    encoded = Dense(units, activation='relu')(encoded)
  encoded = Dense(n, activation='sigmoid')(encoded)

  # Decoder: mirror image of the encoder.
  # NOTE(review): the sigmoid output assumes feature values lie in [0, 1],
  # as in the rows displayed in this notebook -- confirm for the full data.
  decoded = encoded
  for units in (16, 64, 128, 256):
    decoded = Dense(units, activation='relu')(decoded)
  decoded = Dense(n_inputs, activation='sigmoid')(decoded)

  autoencoder = Model(input_df, decoded)
  # Shares its layers with `autoencoder`, so it is trained as a side effect of fit().
  encoder = Model(input_df, encoded)

  autoencoder.compile(optimizer='adam', loss='mean_squared_error')

  train_features = train_df[feature_names]
  autoencoder.fit(train_features, train_features,
                  epochs=40,
                  batch_size=2048,
                  shuffle=True,
                  verbose=0)

  encoded_train = encoder.predict(train_features)
  encoded_val = encoder.predict(val_df[feature_names])

  for j in range(n):
    column = f"feature_autoencoding{j}"
    train_df[column] = encoded_train[:, j]
    val_df[column] = encoded_val[:, j]

  return train_df, val_df

In [14]:
# Append an 11-dimensional autoencoder encoding for the XGBoost feature set.
# At this point the name resolves to the autoencoder definition directly above,
# which replaced the PCA helper of the same name.
train_df, val_df = add_n_extra_features(11, train_df, val_df)

## Save df

In [15]:
# Preview the first training rows, including the appended encoding columns.
train_df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,feature_intelligence11,feature_intelligence12,feature_charisma1,feature_charisma2,feature_charisma3,feature_charisma4,feature_charisma5,feature_charisma6,feature_charisma7,feature_charisma8,feature_charisma9,feature_charisma10,feature_charisma11,feature_charisma12,feature_charisma13,feature_charisma14,feature_charisma15,feature_charisma16,feature_charisma17,feature_charisma18,feature_charisma19,feature_charisma20,feature_charisma21,feature_charisma22,feature_charisma23,feature_charisma24,feature_charisma25,...,feature_wisdom27,feature_wisdom28,feature_wisdom29,feature_wisdom30,feature_wisdom31,feature_wisdom32,feature_wisdom33,feature_wisdom34,feature_wisdom35,feature_wisdom36,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_kazutsugi,feature_pcaencoding0,feature_pcaencoding1,feature_pcaencoding2,feature_pcaencoding3,feature_pcaencoding4,feature_pcaencoding5,feature_pcaencoding6,feature_pcaencoding7,feature_autoencoding0,feature_autoencoding1,feature_autoencoding2,feature_autoencoding3,feature_autoencoding4,feature_autoencoding5,feature_autoencoding6,feature_autoencoding7,feature_autoencoding8,feature_autoencoding9,feature_autoencoding10
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,0.25,0.25,1.0,0.75,0.5,1.0,0.5,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.25,0.0,0.5,0.25,0.75,0.5,1.0,0.75,0.75,0.5,0.5,0.75,0.5,...,1.0,0.75,0.5,0.5,1.0,0.25,0.5,0.5,0.5,0.75,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.75,-0.349383,-1.441544,-1.784221,-0.069848,1.732101,-0.401702,-0.81169,0.61983,0.525866,0.544893,0.518022,0.364337,0.27333,0.612157,0.584791,0.704958,0.429479,0.39757,0.537267
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.25,0.25,0.5,0.0,1.0,0.5,0.5,0.5,0.75,0.5,0.5,0.75,0.25,0.5,0.75,0.5,0.25,0.75,0.5,...,0.75,1.0,0.25,0.25,1.0,0.5,0.5,0.5,0.75,0.75,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25,-0.695059,-2.199356,0.393313,-0.565352,-0.88764,1.098592,-0.504342,-1.579269,0.430178,0.328498,0.433338,0.33719,0.297101,0.844717,0.571642,0.449587,0.421337,0.701694,0.667045
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,0.25,0.0,0.25,0.5,1.0,0.5,0.75,0.5,0.5,1.0,0.5,0.5,0.5,0.25,0.0,0.25,0.75,0.75,0.75,0.5,0.75,0.5,0.25,0.5,0.75,0.25,0.5,0.5,0.75,0.5,...,0.75,0.0,1.0,0.5,0.5,0.75,1.0,0.75,1.0,0.25,0.5,0.25,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.0,0.907658,-0.358365,-0.935118,0.446002,-1.2279,-1.925119,-0.954609,-0.624884,0.679789,0.659405,0.371719,0.316489,0.563163,0.58114,0.51282,0.625947,0.320122,0.685917,0.678815
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,0.75,0.25,0.5,0.5,0.5,0.75,0.5,1.0,0.5,0.5,0.0,1.0,0.0,0.75,0.0,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.75,0.75,0.5,0.25,0.5,0.5,0.5,0.5,...,0.75,1.0,0.75,1.0,1.0,0.0,0.5,0.75,0.75,1.0,0.75,1.0,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.0,1.598843,-1.013379,-1.560695,-2.089551,-0.163186,-0.796688,-0.74724,-0.019883,0.465539,0.477262,0.366082,0.443915,0.343933,0.658179,0.240033,0.704373,0.337026,0.553835,0.476238
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,0.25,0.25,0.5,0.25,0.25,0.75,0.5,0.0,0.5,0.5,0.25,0.0,0.5,0.0,0.5,0.25,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.75,0.5,0.25,0.5,0.5,0.5,0.5,...,0.75,0.5,0.75,0.25,0.75,0.5,0.5,0.25,0.25,0.75,0.5,0.75,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75,-0.088821,-1.74169,-0.763098,-0.403356,-0.009025,1.188055,-0.386061,0.86446,0.443174,0.356642,0.531192,0.343516,0.353477,0.676516,0.428447,0.583248,0.481688,0.516346,0.502558


In [16]:
# Preview the first validation rows, including the appended encoding columns.
val_df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,feature_intelligence11,feature_intelligence12,feature_charisma1,feature_charisma2,feature_charisma3,feature_charisma4,feature_charisma5,feature_charisma6,feature_charisma7,feature_charisma8,feature_charisma9,feature_charisma10,feature_charisma11,feature_charisma12,feature_charisma13,feature_charisma14,feature_charisma15,feature_charisma16,feature_charisma17,feature_charisma18,feature_charisma19,feature_charisma20,feature_charisma21,feature_charisma22,feature_charisma23,feature_charisma24,feature_charisma25,...,feature_wisdom27,feature_wisdom28,feature_wisdom29,feature_wisdom30,feature_wisdom31,feature_wisdom32,feature_wisdom33,feature_wisdom34,feature_wisdom35,feature_wisdom36,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_kazutsugi,feature_pcaencoding0,feature_pcaencoding1,feature_pcaencoding2,feature_pcaencoding3,feature_pcaencoding4,feature_pcaencoding5,feature_pcaencoding6,feature_pcaencoding7,feature_autoencoding0,feature_autoencoding1,feature_autoencoding2,feature_autoencoding3,feature_autoencoding4,feature_autoencoding5,feature_autoencoding6,feature_autoencoding7,feature_autoencoding8,feature_autoencoding9,feature_autoencoding10
0,n0003aa52cab36c2,era121,validation,0.25,0.75,0.5,0.5,0.0,0.75,0.5,0.25,0.5,0.5,0.25,0.0,0.25,0.5,0.25,0.0,0.25,1.0,1.0,0.25,1.0,1.0,0.25,0.25,0.0,0.5,0.25,0.75,0.0,0.5,0.25,0.25,0.25,0.5,0.0,0.5,1.0,...,0.5,0.25,0.0,0.25,0.5,0.25,0.5,0.25,0.25,1.0,0.75,0.75,0.75,1.0,0.75,0.5,0.5,1.0,0.0,0.0,0.0,0.120206,1.55112,1.712235,-0.66977,-2.248873,1.187683,0.441092,-0.430071,0.289276,0.384547,0.382106,0.385016,0.539111,0.654335,0.24837,0.184573,0.533881,0.622801,0.627099
1,n000920ed083903f,era121,validation,0.75,0.5,0.75,1.0,0.5,0.0,0.0,0.75,0.25,0.0,0.75,0.5,0.0,0.25,0.5,0.0,1.0,0.25,0.25,1.0,1.0,0.25,0.75,0.0,0.0,0.75,1.0,1.0,0.0,0.25,0.0,0.0,0.25,0.25,0.25,0.0,1.0,...,0.25,0.5,0.25,0.5,0.5,0.5,0.5,0.25,0.25,0.75,0.5,0.5,0.5,0.75,1.0,0.75,0.5,0.5,0.5,0.5,0.25,2.527598,0.106757,-1.088938,-0.823978,1.176287,2.246339,1.333658,0.551338,0.331852,0.401299,0.73996,0.392725,0.246025,0.47438,0.344525,0.59606,0.515408,0.624127,0.694898
2,n0038e640522c4a6,era121,validation,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.75,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,0.5,1.0,0.5,1.0,0.5,1.0,0.25,1.0,1.0,1.0,0.5,1.0,1.0,0.75,1.0,...,0.0,0.0,0.0,0.5,0.0,0.75,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.5,0.0,1.0,-1.336355,3.505574,1.418149,0.621056,0.693367,-0.194254,1.031963,-2.353089,0.201501,0.313064,0.394394,0.268587,0.478782,0.345278,0.52697,0.456296,0.790289,0.574499,0.667246
3,n004ac94a87dc54b,era121,validation,0.75,1.0,1.0,0.5,0.0,0.0,0.0,0.5,0.75,1.0,0.75,0.0,0.5,0.0,0.5,0.75,0.5,0.75,0.25,0.75,0.25,0.75,0.25,0.75,1.0,0.5,0.5,0.75,0.5,1.0,0.5,0.25,0.75,0.25,0.75,0.25,0.75,...,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.25,0.75,-1.564395,3.155077,0.842485,0.456792,-1.635734,1.184766,0.369005,1.118786,0.223846,0.455838,0.455837,0.43761,0.549473,0.693071,0.369458,0.320194,0.51313,0.401977,0.691888
4,n0052fe97ea0c05f,era121,validation,0.25,0.5,0.5,0.25,1.0,0.5,0.5,0.25,0.25,0.5,0.5,1.0,1.0,1.0,1.0,0.75,0.5,0.5,0.5,0.75,0.0,0.0,0.0,0.25,0.0,0.0,0.75,0.25,1.0,0.25,1.0,0.75,0.0,1.0,0.75,0.75,0.75,...,0.0,0.0,0.25,0.25,0.75,1.0,1.0,0.75,0.75,0.5,0.5,0.5,0.75,0.0,0.0,0.75,1.0,0.0,0.25,1.0,1.0,0.931153,0.517618,-0.132495,-0.730126,0.158312,-0.378191,-2.346678,-0.189877,0.540932,0.586559,0.465905,0.443415,0.516665,0.756642,0.501607,0.72162,0.545687,0.70348,0.678641


In [18]:
# Persist the XGBoost feature set; these files are read back in the
# "Prepare Final Dataframe" section. index=False keeps the row index out of the CSV.
train_df.to_csv('Data/train_df_with_encodings_xgb.csv', index=False)
val_df.to_csv('Data/val_df_with_encodings_xgb.csv', index=False)

# Final features LGBM
(PCA components: n=3; autoencoder encoding size: n=6)

In [6]:
# Start again from fresh deep copies of the base frames so the LightGBM feature
# set is built independently of the columns added for XGBoost.
train_df = training_df.copy(deep=True)
val_df = validation_df.copy(deep=True)

## PCA

In [11]:
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

In [12]:
def add_n_extra_features(n, train_df, val_df, random_state=0):
  """Append the first `n` PCA components of the base features to both frames.

  The PCA is fitted on `train_df[feature_names]` only and then applied to
  `val_df[feature_names]`, so no validation rows influence the projection.
  Component `j` is stored in a new column named ``feature_pcaencoding{j}``.

  Args:
    n: number of principal components to compute and append.
    train_df: training frame; must contain every column in the module-level
      `feature_names` list. Modified in place.
    val_df: validation frame with the same feature columns. Modified in place.
    random_state: seed forwarded to `PCA` so that solvers which use random
      numbers give repeatable components. Pass `None` for unseeded behaviour.

  Returns:
    The same `(train_df, val_df)` objects, now carrying the extra columns.

  Note:
    This name is defined several times in the notebook; whichever definition
    ran most recently is the one a call resolves to.
  """
  pca = PCA(n_components=n, random_state=random_state)
  train_components = pca.fit_transform(train_df[feature_names])
  val_components = pca.transform(val_df[feature_names])

  for j in range(n):
    column = f"feature_pcaencoding{j}"
    train_df[column] = train_components[:, j]
    val_df[column] = val_components[:, j]

  return train_df, val_df

In [13]:
# Append 3 PCA component columns for the LightGBM feature set.
train_df, val_df = add_n_extra_features(3, train_df, val_df)

## Autoencoding

In [14]:
from keras.models import Model
from keras.layers import Dense, Input, concatenate, Dropout
from keras.regularizers import l2

import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [15]:
def add_n_extra_features(n, train_df, val_df):
  """Append an `n`-dimensional autoencoder encoding of the base features.

  A dense autoencoder (256-128-64-16-n-16-64-128-256) is trained to reconstruct
  `train_df[feature_names]`; the `n`-unit bottleneck activations are then
  computed for both frames and stored in columns ``feature_autoencoding{j}``.

  Args:
    n: size of the bottleneck layer, i.e. number of columns appended.
    train_df: training frame; must contain every column in the module-level
      `feature_names` list. Used to fit the autoencoder. Modified in place.
    val_df: validation frame with the same feature columns. Modified in place.

  Returns:
    The same `(train_df, val_df)` objects, now carrying the extra columns.

  Note:
    This name is defined several times in the notebook; whichever definition
    ran most recently is the one a call resolves to. Training is not seeded,
    so the encoding values differ between runs.
  """
  # Size the input/output layers from the feature list instead of a literal,
  # so the network always matches the data it is given.
  n_inputs = len(feature_names)

  input_df = Input(shape=(n_inputs,))

  # Encoder: progressively narrower ReLU layers down to the sigmoid bottleneck.
  encoded = input_df
  for units in (256, 128, 64, 16):
    encoded = Dense(units, activation='relu')(encoded)
  encoded = Dense(n, activation='sigmoid')(encoded)

  # Decoder: mirror image of the encoder.
  # NOTE(review): the sigmoid output assumes feature values lie in [0, 1],
  # as in the rows displayed in this notebook -- confirm for the full data.
  decoded = encoded
  for units in (16, 64, 128, 256):
    decoded = Dense(units, activation='relu')(decoded)
  decoded = Dense(n_inputs, activation='sigmoid')(decoded)

  autoencoder = Model(input_df, decoded)
  # Shares its layers with `autoencoder`, so it is trained as a side effect of fit().
  encoder = Model(input_df, encoded)

  autoencoder.compile(optimizer='adam', loss='mean_squared_error')

  train_features = train_df[feature_names]
  autoencoder.fit(train_features, train_features,
                  epochs=40,
                  batch_size=2048,
                  shuffle=True,
                  verbose=0)

  encoded_train = encoder.predict(train_features)
  encoded_val = encoder.predict(val_df[feature_names])

  for j in range(n):
    column = f"feature_autoencoding{j}"
    train_df[column] = encoded_train[:, j]
    val_df[column] = encoded_val[:, j]

  return train_df, val_df

In [16]:
# Append a 6-dimensional autoencoder encoding for the LightGBM feature set.
# At this point the name resolves to the autoencoder definition directly above.
train_df, val_df = add_n_extra_features(6, train_df, val_df)

## Save df

In [17]:
# Preview the first training rows, including the appended encoding columns.
train_df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,feature_intelligence11,feature_intelligence12,feature_charisma1,feature_charisma2,feature_charisma3,feature_charisma4,feature_charisma5,feature_charisma6,feature_charisma7,feature_charisma8,feature_charisma9,feature_charisma10,feature_charisma11,feature_charisma12,feature_charisma13,feature_charisma14,feature_charisma15,feature_charisma16,feature_charisma17,feature_charisma18,feature_charisma19,feature_charisma20,feature_charisma21,feature_charisma22,feature_charisma23,feature_charisma24,feature_charisma25,...,feature_wisdom17,feature_wisdom18,feature_wisdom19,feature_wisdom20,feature_wisdom21,feature_wisdom22,feature_wisdom23,feature_wisdom24,feature_wisdom25,feature_wisdom26,feature_wisdom27,feature_wisdom28,feature_wisdom29,feature_wisdom30,feature_wisdom31,feature_wisdom32,feature_wisdom33,feature_wisdom34,feature_wisdom35,feature_wisdom36,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_kazutsugi,feature_pcaencoding0,feature_pcaencoding1,feature_pcaencoding2,feature_autoencoding0,feature_autoencoding1,feature_autoencoding2,feature_autoencoding3,feature_autoencoding4,feature_autoencoding5
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,0.25,0.25,1.0,0.75,0.5,1.0,0.5,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.25,0.0,0.5,0.25,0.75,0.5,1.0,0.75,0.75,0.5,0.5,0.75,0.5,...,0.25,0.25,0.75,0.5,1.0,0.5,0.75,0.75,0.25,0.5,1.0,0.75,0.5,0.5,1.0,0.25,0.5,0.5,0.5,0.75,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.75,-0.349382,-1.441543,-1.784219,0.30388,0.216762,0.382686,0.445277,0.478433,0.246845
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.25,0.25,0.5,0.0,1.0,0.5,0.5,0.5,0.75,0.5,0.5,0.75,0.25,0.5,0.75,0.5,0.25,0.75,0.5,...,0.25,1.0,0.5,1.0,1.0,0.5,0.5,0.5,1.0,0.25,0.75,1.0,0.25,0.25,1.0,0.5,0.5,0.5,0.75,0.75,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25,-0.69506,-2.19935,0.3933,0.553677,0.248747,0.65003,0.600467,0.637539,0.31528
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,0.25,0.0,0.25,0.5,1.0,0.5,0.75,0.5,0.5,1.0,0.5,0.5,0.5,0.25,0.0,0.25,0.75,0.75,0.75,0.5,0.75,0.5,0.25,0.5,0.75,0.25,0.5,0.5,0.75,0.5,...,1.0,0.0,1.0,1.0,0.5,1.0,0.75,1.0,0.0,0.5,0.75,0.0,1.0,0.5,0.5,0.75,1.0,0.75,1.0,0.25,0.5,0.25,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.0,0.907658,-0.358362,-0.93514,0.392139,0.438776,0.405294,0.550879,0.50891,0.280421
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,0.75,0.25,0.5,0.5,0.5,0.75,0.5,1.0,0.5,0.5,0.0,1.0,0.0,0.75,0.0,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.75,0.75,0.5,0.25,0.5,0.5,0.5,0.5,...,0.75,1.0,0.75,1.0,0.75,1.0,0.0,0.5,0.75,1.0,0.75,1.0,0.75,1.0,1.0,0.0,0.5,0.75,0.75,1.0,0.75,1.0,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.0,1.598843,-1.013379,-1.560685,0.377822,0.453177,0.639378,0.510745,0.529151,0.255674
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,0.25,0.25,0.5,0.25,0.25,0.75,0.5,0.0,0.5,0.5,0.25,0.0,0.5,0.0,0.5,0.25,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.75,0.5,0.25,0.5,0.5,0.5,0.5,...,1.0,0.5,0.75,0.25,0.5,0.0,0.5,0.5,0.5,0.75,0.75,0.5,0.75,0.25,0.75,0.5,0.5,0.25,0.25,0.75,0.5,0.75,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75,-0.088821,-1.7417,-0.763088,0.438423,0.25416,0.595695,0.372865,0.601093,0.328376


In [18]:
# Preview the first validation rows, including the appended encoding columns.
val_df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,feature_intelligence11,feature_intelligence12,feature_charisma1,feature_charisma2,feature_charisma3,feature_charisma4,feature_charisma5,feature_charisma6,feature_charisma7,feature_charisma8,feature_charisma9,feature_charisma10,feature_charisma11,feature_charisma12,feature_charisma13,feature_charisma14,feature_charisma15,feature_charisma16,feature_charisma17,feature_charisma18,feature_charisma19,feature_charisma20,feature_charisma21,feature_charisma22,feature_charisma23,feature_charisma24,feature_charisma25,...,feature_wisdom17,feature_wisdom18,feature_wisdom19,feature_wisdom20,feature_wisdom21,feature_wisdom22,feature_wisdom23,feature_wisdom24,feature_wisdom25,feature_wisdom26,feature_wisdom27,feature_wisdom28,feature_wisdom29,feature_wisdom30,feature_wisdom31,feature_wisdom32,feature_wisdom33,feature_wisdom34,feature_wisdom35,feature_wisdom36,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_kazutsugi,feature_pcaencoding0,feature_pcaencoding1,feature_pcaencoding2,feature_autoencoding0,feature_autoencoding1,feature_autoencoding2,feature_autoencoding3,feature_autoencoding4,feature_autoencoding5
0,n0003aa52cab36c2,era121,validation,0.25,0.75,0.5,0.5,0.0,0.75,0.5,0.25,0.5,0.5,0.25,0.0,0.25,0.5,0.25,0.0,0.25,1.0,1.0,0.25,1.0,1.0,0.25,0.25,0.0,0.5,0.25,0.75,0.0,0.5,0.25,0.25,0.25,0.5,0.0,0.5,1.0,...,0.25,0.5,0.5,0.5,0.5,0.0,0.25,0.75,0.25,0.25,0.5,0.25,0.0,0.25,0.5,0.25,0.5,0.25,0.25,1.0,0.75,0.75,0.75,1.0,0.75,0.5,0.5,1.0,0.0,0.0,0.0,0.120206,1.551119,1.712246,0.664974,0.503687,0.544194,0.665161,0.43286,0.257939
1,n000920ed083903f,era121,validation,0.75,0.5,0.75,1.0,0.5,0.0,0.0,0.75,0.25,0.0,0.75,0.5,0.0,0.25,0.5,0.0,1.0,0.25,0.25,1.0,1.0,0.25,0.75,0.0,0.0,0.75,1.0,1.0,0.0,0.25,0.0,0.0,0.25,0.25,0.25,0.0,1.0,...,0.25,1.0,0.25,0.0,0.5,0.75,0.75,0.5,1.0,1.0,0.25,0.5,0.25,0.5,0.5,0.5,0.5,0.25,0.25,0.75,0.5,0.5,0.5,0.75,1.0,0.75,0.5,0.5,0.5,0.5,0.25,2.527598,0.106757,-1.088935,0.504223,0.414093,0.620324,0.381888,0.478437,0.410321
2,n0038e640522c4a6,era121,validation,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.75,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,0.5,1.0,0.5,1.0,0.5,1.0,0.25,1.0,1.0,1.0,0.5,1.0,1.0,0.75,1.0,...,0.0,0.0,0.0,0.25,0.0,0.25,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.75,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.5,0.0,1.0,-1.336355,3.505574,1.418144,0.471273,0.398253,0.035018,0.535121,0.219912,0.180064
3,n004ac94a87dc54b,era121,validation,0.75,1.0,1.0,0.5,0.0,0.0,0.0,0.5,0.75,1.0,0.75,0.0,0.5,0.0,0.5,0.75,0.5,0.75,0.25,0.75,0.25,0.75,0.25,0.75,1.0,0.5,0.5,0.75,0.5,1.0,0.5,0.25,0.75,0.25,0.75,0.25,0.75,...,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.25,0.75,-1.564395,3.15508,0.842473,0.67136,0.414074,0.448732,0.564994,0.389353,0.192545
4,n0052fe97ea0c05f,era121,validation,0.25,0.5,0.5,0.25,1.0,0.5,0.5,0.25,0.25,0.5,0.5,1.0,1.0,1.0,1.0,0.75,0.5,0.5,0.5,0.75,0.0,0.0,0.0,0.25,0.0,0.0,0.75,0.25,1.0,0.25,1.0,0.75,0.0,1.0,0.75,0.75,0.75,...,0.25,0.0,0.25,0.5,0.25,1.0,1.0,1.0,0.0,0.25,0.0,0.0,0.25,0.25,0.75,1.0,1.0,0.75,0.75,0.5,0.5,0.5,0.75,0.0,0.0,0.75,1.0,0.0,0.25,1.0,1.0,0.931153,0.517618,-0.1325,0.494719,0.402332,0.607292,0.548308,0.442163,0.262241


In [19]:
# Persist the LightGBM feature set; these files are read back in the
# "Prepare Final Dataframe" section. index=False keeps the row index out of the CSV.
train_df.to_csv('Data/train_df_with_encodings_lgbm.csv', index=False)
val_df.to_csv('Data/val_df_with_encodings_lgbm.csv', index=False)

# Prepare Final Dataframe

In [6]:
# Load the "corr_agg" reduced feature tables.
# NOTE(review): these files are not written by any cell visible in this
# notebook -- confirm which notebook/script produces them.
reduced_train_df = pd.read_csv('Data/corr_agg_training_data.csv')
reduced_val_df = pd.read_csv('Data/corr_agg_validation_data.csv')

# Every column whose name starts with "feature" counts as a reduced feature.
red_feature_names = [f for f in reduced_train_df.columns if f.startswith("feature")]

In [19]:
# Reload the XGBoost feature set saved earlier in this notebook.
xgb_training_df = pd.read_csv('Data/train_df_with_encodings_xgb.csv')
xgb_validation_df = pd.read_csv('Data/val_df_with_encodings_xgb.csv')

# Keep only the engineered columns (PCA and autoencoder encodings), in file order.
xgb_feature_names = [f for f in xgb_training_df.columns if (f.startswith("feature_pca") or f.startswith("feature_auto"))]

In [20]:
# Reload the LightGBM feature set saved earlier in this notebook.
lgbm_training_df = pd.read_csv('Data/train_df_with_encodings_lgbm.csv')
lgbm_validation_df = pd.read_csv('Data/val_df_with_encodings_lgbm.csv')

# Keep only the engineered columns (PCA and autoencoder encodings), in file order.
lgbm_feature_names = [f for f in lgbm_training_df.columns if (f.startswith("feature_pca") or f.startswith("feature_auto"))]

In [23]:
# Copy each reduced feature onto the base frames under a positional name
# (reduced_feature_0, reduced_feature_1, ...). Rows are matched by index label.
for i, source_column in enumerate(red_feature_names):
  target_column = f'reduced_feature_{i}'
  training_df[target_column] = reduced_train_df[source_column]
  validation_df[target_column] = reduced_val_df[source_column]

In [27]:
# Copy each XGBoost encoding column onto the base frames under a positional
# name (xgb_feature_0, xgb_feature_1, ...). Rows are matched by index label.
for i, source_column in enumerate(xgb_feature_names):
  target_column = f'xgb_feature_{i}'
  training_df[target_column] = xgb_training_df[source_column]
  validation_df[target_column] = xgb_validation_df[source_column]

In [30]:
# Copy each LightGBM encoding column onto the base frames under a positional
# name (lgbm_feature_0, lgbm_feature_1, ...). Rows are matched by index label.
for i, source_column in enumerate(lgbm_feature_names):
  target_column = f'lgbm_feature_{i}'
  training_df[target_column] = lgbm_training_df[source_column]
  validation_df[target_column] = lgbm_validation_df[source_column]

In [32]:
# Preview the combined training frame (base, reduced, xgb_ and lgbm_ columns).
training_df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,feature_intelligence11,feature_intelligence12,feature_charisma1,feature_charisma2,feature_charisma3,feature_charisma4,feature_charisma5,feature_charisma6,feature_charisma7,feature_charisma8,feature_charisma9,feature_charisma10,feature_charisma11,feature_charisma12,feature_charisma13,feature_charisma14,feature_charisma15,feature_charisma16,feature_charisma17,feature_charisma18,feature_charisma19,feature_charisma20,feature_charisma21,feature_charisma22,feature_charisma23,feature_charisma24,feature_charisma25,...,reduced_feature_188,reduced_feature_189,reduced_feature_190,reduced_feature_191,reduced_feature_192,reduced_feature_193,reduced_feature_194,reduced_feature_195,reduced_feature_196,reduced_feature_197,reduced_feature_198,reduced_feature_199,xgb_feature_0,xgb_feature_1,xgb_feature_2,xgb_feature_3,xgb_feature_4,xgb_feature_5,xgb_feature_6,xgb_feature_7,xgb_feature_8,xgb_feature_9,xgb_feature_10,xgb_feature_11,xgb_feature_12,xgb_feature_13,xgb_feature_14,xgb_feature_15,xgb_feature_16,xgb_feature_17,xgb_feature_18,lgbm_feature_0,lgbm_feature_1,lgbm_feature_2,lgbm_feature_3,lgbm_feature_4,lgbm_feature_5,lgbm_feature_6,lgbm_feature_7,lgbm_feature_8
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,0.25,0.25,1.0,0.75,0.5,1.0,0.5,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.25,0.0,0.5,0.25,0.75,0.5,1.0,0.75,0.75,0.5,0.5,0.75,0.5,...,0.375,1.0,0.25,0.625,0.5,0.25,0.125,0.25,0.875,0.0,0.5,0.25,-0.349383,-1.441544,-1.784221,-0.069848,1.732101,-0.401702,-0.81169,0.61983,0.525866,0.544893,0.518023,0.364337,0.27333,0.612157,0.584791,0.704958,0.429479,0.39757,0.537267,-0.349382,-1.441543,-1.784219,0.30388,0.216762,0.382686,0.445277,0.478433,0.246845
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.25,0.25,0.5,0.0,1.0,0.5,0.5,0.5,0.75,0.5,0.5,0.75,0.25,0.5,0.75,0.5,0.25,0.75,0.5,...,0.5,0.25,1.0,0.875,0.375,0.75,0.75,0.75,0.125,0.75,0.75,1.0,-0.695059,-2.199356,0.393313,-0.565352,-0.88764,1.098592,-0.504342,-1.579269,0.430178,0.328498,0.433338,0.33719,0.297101,0.844717,0.571642,0.449587,0.421337,0.701694,0.667045,-0.69506,-2.19935,0.3933,0.553677,0.248747,0.65003,0.600467,0.637539,0.31528
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,0.25,0.0,0.25,0.5,1.0,0.5,0.75,0.5,0.5,1.0,0.5,0.5,0.5,0.25,0.0,0.25,0.75,0.75,0.75,0.5,0.75,0.5,0.25,0.5,0.75,0.25,0.5,0.5,0.75,0.5,...,0.5625,1.0,0.0,0.25,0.625,0.25,0.375,0.25,0.0,0.0,0.0,0.25,0.907658,-0.358365,-0.935118,0.446002,-1.2279,-1.925119,-0.954609,-0.624884,0.679789,0.659405,0.371719,0.316489,0.563163,0.58114,0.51282,0.625947,0.320122,0.685917,0.678815,0.907658,-0.358362,-0.93514,0.392139,0.438776,0.405294,0.550879,0.50891,0.280421
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,0.75,0.25,0.5,0.5,0.5,0.75,0.5,1.0,0.5,0.5,0.0,1.0,0.0,0.75,0.0,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.75,0.75,0.5,0.25,0.5,0.5,0.5,0.5,...,0.9375,0.5,0.25,0.375,1.0,0.25,0.125,0.75,0.625,0.25,0.375,0.25,1.598843,-1.013379,-1.560695,-2.089551,-0.163186,-0.796688,-0.74724,-0.019883,0.465539,0.477262,0.366082,0.443915,0.343933,0.658179,0.240033,0.704373,0.337026,0.553835,0.476238,1.598843,-1.013379,-1.560685,0.377822,0.453177,0.639378,0.510745,0.529151,0.255674
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,0.25,0.25,0.5,0.25,0.25,0.75,0.5,0.0,0.5,0.5,0.25,0.0,0.5,0.0,0.5,0.25,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.75,0.5,0.25,0.5,0.5,0.5,0.5,...,0.875,0.125,1.0,1.0,0.0,0.75,0.25,1.0,0.0,0.0,1.0,0.125,-0.088821,-1.74169,-0.763098,-0.403356,-0.009025,1.188055,-0.386061,0.86446,0.443174,0.356642,0.531192,0.343516,0.353477,0.676516,0.428447,0.583248,0.481688,0.516346,0.502558,-0.088821,-1.7417,-0.763088,0.438423,0.25416,0.595695,0.372865,0.601093,0.328376


In [33]:
# Preview the combined validation frame (base, reduced, xgb_ and lgbm_ columns).
validation_df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,feature_intelligence11,feature_intelligence12,feature_charisma1,feature_charisma2,feature_charisma3,feature_charisma4,feature_charisma5,feature_charisma6,feature_charisma7,feature_charisma8,feature_charisma9,feature_charisma10,feature_charisma11,feature_charisma12,feature_charisma13,feature_charisma14,feature_charisma15,feature_charisma16,feature_charisma17,feature_charisma18,feature_charisma19,feature_charisma20,feature_charisma21,feature_charisma22,feature_charisma23,feature_charisma24,feature_charisma25,...,reduced_feature_188,reduced_feature_189,reduced_feature_190,reduced_feature_191,reduced_feature_192,reduced_feature_193,reduced_feature_194,reduced_feature_195,reduced_feature_196,reduced_feature_197,reduced_feature_198,reduced_feature_199,xgb_feature_0,xgb_feature_1,xgb_feature_2,xgb_feature_3,xgb_feature_4,xgb_feature_5,xgb_feature_6,xgb_feature_7,xgb_feature_8,xgb_feature_9,xgb_feature_10,xgb_feature_11,xgb_feature_12,xgb_feature_13,xgb_feature_14,xgb_feature_15,xgb_feature_16,xgb_feature_17,xgb_feature_18,lgbm_feature_0,lgbm_feature_1,lgbm_feature_2,lgbm_feature_3,lgbm_feature_4,lgbm_feature_5,lgbm_feature_6,lgbm_feature_7,lgbm_feature_8
0,n0003aa52cab36c2,era121,validation,0.25,0.75,0.5,0.5,0.0,0.75,0.5,0.25,0.5,0.5,0.25,0.0,0.25,0.5,0.25,0.0,0.25,1.0,1.0,0.25,1.0,1.0,0.25,0.25,0.0,0.5,0.25,0.75,0.0,0.5,0.25,0.25,0.25,0.5,0.0,0.5,1.0,...,0.25,0.375,0.25,0.25,1.0,1.0,1.0,1.0,0.125,0.625,0.25,0.75,0.120206,1.55112,1.712235,-0.66977,-2.248873,1.187683,0.441092,-0.430071,0.289276,0.384547,0.382106,0.385016,0.539111,0.654335,0.24837,0.184573,0.533881,0.622801,0.627099,0.120206,1.551119,1.712246,0.664974,0.503687,0.544194,0.665161,0.43286,0.257939
1,n000920ed083903f,era121,validation,0.75,0.5,0.75,1.0,0.5,0.0,0.0,0.75,0.25,0.0,0.75,0.5,0.0,0.25,0.5,0.0,1.0,0.25,0.25,1.0,1.0,0.25,0.75,0.0,0.0,0.75,1.0,1.0,0.0,0.25,0.0,0.0,0.25,0.25,0.25,0.0,1.0,...,0.0,0.0,1.0,0.25,1.0,0.25,0.0,0.875,0.0,0.0,0.25,0.0,2.527598,0.106757,-1.088938,-0.823978,1.176287,2.246339,1.333658,0.551338,0.331852,0.401299,0.73996,0.392725,0.246025,0.47438,0.344525,0.59606,0.515408,0.624127,0.694898,2.527598,0.106757,-1.088935,0.504223,0.414093,0.620324,0.381888,0.478437,0.410321
2,n0038e640522c4a6,era121,validation,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.75,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,0.5,1.0,0.5,1.0,0.5,1.0,0.25,1.0,1.0,1.0,0.5,1.0,1.0,0.75,1.0,...,0.0,0.25,0.75,1.0,0.5,0.5,1.0,0.0,0.5,1.0,1.0,1.0,-1.336355,3.505574,1.418149,0.621056,0.693367,-0.194254,1.031963,-2.353089,0.201501,0.313064,0.394395,0.268587,0.478782,0.345278,0.52697,0.456296,0.790289,0.574499,0.667246,-1.336355,3.505574,1.418144,0.471273,0.398253,0.035018,0.535121,0.219912,0.180064
3,n004ac94a87dc54b,era121,validation,0.75,1.0,1.0,0.5,0.0,0.0,0.0,0.5,0.75,1.0,0.75,0.0,0.5,0.0,0.5,0.75,0.5,0.75,0.25,0.75,0.25,0.75,0.25,0.75,1.0,0.5,0.5,0.75,0.5,1.0,0.5,0.25,0.75,0.25,0.75,0.25,0.75,...,0.3125,0.875,0.75,0.0,0.5,0.875,0.875,0.875,0.75,0.25,0.0,1.0,-1.564395,3.155077,0.842485,0.456792,-1.635734,1.184766,0.369005,1.118786,0.223846,0.455838,0.455837,0.43761,0.549473,0.693071,0.369459,0.320194,0.51313,0.401977,0.691888,-1.564395,3.15508,0.842473,0.67136,0.414074,0.448732,0.564994,0.389353,0.192545
4,n0052fe97ea0c05f,era121,validation,0.25,0.5,0.5,0.25,1.0,0.5,0.5,0.25,0.25,0.5,0.5,1.0,1.0,1.0,1.0,0.75,0.5,0.5,0.5,0.75,0.0,0.0,0.0,0.25,0.0,0.0,0.75,0.25,1.0,0.25,1.0,0.75,0.0,1.0,0.75,0.75,0.75,...,1.0,0.0,0.0,0.75,0.75,0.375,0.25,0.625,0.75,0.375,0.625,0.0,0.931153,0.517618,-0.132495,-0.730126,0.158312,-0.378191,-2.346678,-0.189877,0.540932,0.586559,0.465905,0.443415,0.516665,0.756642,0.501607,0.72162,0.545687,0.70348,0.678641,0.931153,0.517618,-0.1325,0.494719,0.402332,0.607292,0.548308,0.442163,0.262241


In [34]:
# Write the combined frames to disk. index=False keeps the row index out of the CSV.
training_df.to_csv('Data/final_training_data.csv', index=False)
validation_df.to_csv('Data/final_validation_data.csv', index=False)