In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from src.feature_selectors import GreedyGainSelector, BorutaSelector, remove_correlated_features
import pickle
import warnings
# Silence every warning category for the rest of the kernel session so the
# long selector logs stay readable.
# NOTE(review): this also hides library diagnostics (e.g. deprecation or
# data-shape warnings) — consider narrowing the filter when debugging.
warnings.filterwarnings("ignore")

In [2]:
# --- Load data ---------------------------------------------------------------
# Space-separated feature matrix without a header row. Columns are named
# x0..x{n-1} from the actual width of the file rather than a hard-coded 500,
# so a file with a different number of features no longer raises a length
# mismatch on the column assignment.
X = pd.read_csv('data/x_train.txt', sep=' ', header=None)
X.columns = [f'x{i}' for i in range(X.shape[1])]
y = pd.read_csv('data/y_train.txt', header=None)

# --- Hold-out split and scaling ----------------------------------------------
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# The scaler is fitted on the training part only and then applied to the
# hold-out part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Wrap the scaled arrays back into frames so the feature names survive.
# These frames get a fresh 0..n-1 index.
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
# The targets still carry the shuffled original row labels from the split.
# Re-index them the same way as the rebuilt feature frames so any label-based
# operation on (X, y) pairs up the correct rows. Values and order are unchanged.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# --- Stage 1: correlation filter (training data only) -------------------------
# Project helper; presumably drops one feature of each pair whose correlation
# exceeds the threshold — TODO confirm against src.feature_selectors.
X_train = remove_correlated_features(X_train, threshold=0.75)

# --- Base models and soft-voting ensemble -------------------------------------
xgb = XGBClassifier(n_estimators=1000, max_depth=5, n_jobs=-1, random_state=42)
cat = CatBoostClassifier(verbose=0, random_state=42)
rf = RandomForestClassifier(n_estimators=1000, max_depth=5, n_jobs=-1, random_state=42)

voting = VotingClassifier(estimators=[('xgb', xgb), ('cat', cat), ('rf', rf)], voting='soft')

# --- Stage 2: Boruta selection driven by the random forest --------------------
bs = BorutaSelector(estimator=rf, random_state=42)
X_train = bs.fit_transform(X_train, y_train)
bs.print_support()

# --- Stage 3: greedy forward search scored with the voting ensemble -----------
# Runs on the features kept by the Boruta stage; `prefix` names the pipeline
# (presumably used for the selector's log files — TODO confirm).
ggs = GreedyGainSelector(estimator=voting, method='random_improvement', verbose=1, forward=True, logs=True, prefix="BorutaSelector_GreedyGainSelector")
ggs.fit(X_train, y_train)
ggs.print_support()

Number of features: 7
Features: ['x9', 'x100', 'x101', 'x102', 'x103', 'x104', 'x105']
--------------------
Current best score: -inf
Current selected features: []
Head of scores dataframe:
   column   score
2   x101  1214.0
1   x100  1174.0
3   x102  1170.0
6   x105  1168.0
0     x9  1112.0
Selected column: x101
Selected column score: 1214.0
--------------------
Current best score: 1214.0
Current selected features: ['x101']
Head of scores dataframe:
   column   score
1   x100  1294.0
5   x105  1288.0
2   x102  1270.0
4   x104  1228.0
0     x9  1226.0
Selected column: x100
Selected column score: 1294.0
--------------------
Current best score: 1294.0
Current selected features: ['x101', 'x100']
Head of scores dataframe:
   column   score
4   x105  1350.0
1   x102  1330.0
0     x9  1306.0
2   x103  1294.0
3   x104  1292.0
Selected column: x102
Selected column score: 1330.0
--------------------
Current best score: 1330.0
Current selected features: ['x101', 'x100', 'x102']
Head of scores dat

In [3]:
# Persist the feature subset chosen by the greedy selector.
features = ggs.get_support()

# The file name encodes the selector pipeline, the ensemble class and a set of
# run tags. NOTE(review): the tags are hard-coded literals; 0.2 / 5 / True look
# like test_size / max_depth / forward from the selection cell, 0.25 is not
# visible there — confirm they still match before reusing the file.
features_path = f'features/BorutaSelector_GreedyGainSelector_{type(voting).__name__}_{"random_improvement"}_{0.25}_{0.2}_{5}_{True}.pkl'

# Assumes the `features/` directory already exists.
with open(features_path, 'wb') as f:
    pickle.dump(features, f)