In [59]:
import pandas as pd
import numpy as np


In [60]:
# Load the space-separated, header-less feature matrix and target file.
x_train = pd.read_csv('../../../data/x_train.txt', sep=' ', header=None)
y_train = pd.read_csv('../../../data/y_train.txt', sep=' ', header=None)

# Put features and target side by side; name the columns x1..xN with the
# last column called 'y'.
df_train = pd.concat([x_train, y_train], axis='columns')
total_columns = df_train.shape[1]
df_train.columns = [f'x{idx}' for idx in range(1, total_columns)] + ['y']

# Hard-coded feature subset this notebook starts from
# (NOTE(review): presumably produced by an earlier selection step - confirm).
initial_features = [
    'x1', 'x10', 'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x132', 'x140',
    'x149', 'x153', 'x156', 'x176', 'x191', 'x2', 'x22', 'x221', 'x229', 'x253',
    'x286', 'x3', 'x304', 'x322', 'x323', 'x324', 'x329', 'x336', 'x35', 'x352',
    'x36', 'x4', 'x40', 'x404', 'x413', 'x423', 'x459', 'x463', 'x499', 'x5',
    'x58', 'x6', 'x65', 'x7', 'x74', 'x8', 'x81', 'x9', 'x99',
]

# Keep only the starting features plus the target column.
df_train = df_train[[*initial_features, 'y']]


# Correlation Filter

In [61]:
# Correlation of every column with the target (pandas' default method),
# sorted from most positive to most negative.
correlation = df_train.corr()['y'].sort_values(ascending=False)

# Rank columns by the magnitude of their correlation, strongest first,
# and take the target itself out of the ranking.
ranked_by_strength = correlation.abs().sort_values(ascending=False)
selected_features = ranked_by_strength.index.tolist()
selected_features.remove('y')

# Keep the top half of the candidate features (integer division).
n_keep = int(df_train.drop(columns=['y']).shape[1] // 2)
selected_features = selected_features[:n_keep]

selected_features_corr = [name for name in selected_features if name != 'y']

print(sorted(selected_features_corr))

['x106', 'x149', 'x153', 'x156', 'x176', 'x22', 'x221', 'x253', 'x286', 'x304', 'x322', 'x324', 'x329', 'x336', 'x352', 'x36', 'x404', 'x413', 'x459', 'x499', 'x58', 'x65', 'x81', 'x99']


# Mutual Information

In [62]:
from functools import partial

from sklearn.feature_selection import mutual_info_classif, SelectKBest

X = df_train.drop('y', axis=1)
y = df_train['y']

# mutual_info_classif is randomized (its result depends on random_state),
# so without a fixed seed the selected features can change between runs.
# Bind a seed so that Restart & Run All reproduces the same selection.
MI_RANDOM_STATE = 42
seeded_mutual_info = partial(mutual_info_classif, random_state=MI_RANDOM_STATE)

# Keep the top half of the candidate features (integer division).
selector = SelectKBest(seeded_mutual_info, k=X.shape[1] // 2)
selector.fit(X, y)

selected_features_mi = X.columns[selector.get_support()].tolist()

print(sorted(selected_features_mi))

['x1', 'x101', 'x102', 'x103', 'x104', 'x106', 'x132', 'x176', 'x191', 'x322', 'x324', 'x329', 'x336', 'x35', 'x352', 'x4', 'x40', 'x413', 'x459', 'x463', 'x5', 'x6', 'x7', 'x9']


# Variance Threshold

In [63]:
from sklearn.feature_selection import VarianceThreshold

# Split the frame into target and candidate features.
y = df_train['y']
X = df_train.drop(columns='y')

# Fit a variance filter with threshold 1 and keep the reduced feature matrix.
selector = VarianceThreshold(threshold=1)
X_variance = selector.fit_transform(X)

# Translate the selector's boolean support mask back into column names.
kept_mask = selector.get_support()
selected_features_variance = X.columns[kept_mask].tolist()

print(sorted(selected_features_variance))

['x1', 'x10', 'x102', 'x104', 'x106', 'x132', 'x140', 'x149', 'x156', 'x176', 'x2', 'x22', 'x3', 'x4', 'x404', 'x413', 'x423', 'x459', 'x463', 'x499', 'x5', 'x6', 'x65', 'x7', 'x8', 'x81', 'x9', 'x99']


# Intersect

In [64]:
# Features that all three filters (correlation, mutual information,
# variance threshold) agree on.
intersection = set(selected_features_corr).intersection(
    selected_features_mi,
    selected_features_variance,
)
print(sorted(intersection))

['x106', 'x176', 'x413', 'x459']


In [65]:
# save selected features
import json
from pathlib import Path

# Payload written to disk: the agreed-upon features (sorted for a stable
# file) plus a human-readable description of how they were chosen.
selected_features = {
    'selected_features': sorted(intersection),
    'method': f'intersection of top 50% correlation, 50% top mutual information and variance threshold {1}'
}

# save to json
# Create the output directory first: open(..., 'w') raises FileNotFoundError
# when 'results/' does not exist (e.g. on a fresh checkout).
output_path = Path('results/4_filters_selected_features.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w') as f:
    json.dump(selected_features, f)