In [1]:
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

from src.feature_selectors import GreedyGainSelector, remove_correlated_features
# Install a global "ignore" filter: every warning raised for the rest of the session is hidden.
# NOTE(review): presumably meant to keep the selector's log output readable, but it also hides
# deprecation and data-conversion warnings from the model libraries — consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# --- Load -------------------------------------------------------------------
# Feature matrix: space-separated, no header row. Targets: one column, no header row.
X = pd.read_csv('data/x_train.txt', sep=' ', header=None)
# Name the columns x0..x{n-1} from the width actually read, rather than a hard-coded 500.
X.columns = ['x' + str(i) for i in range(X.shape[1])]
y = pd.read_csv('data/y_train.txt', header=None)

# --- Split ------------------------------------------------------------------
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# The scaled frames built below get a fresh 0..n-1 index, while the split targets would
# otherwise keep their shuffled original row labels. Reset the targets so that features and
# labels line up row-for-row under pandas label alignment, not only positionally.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# --- Scale ------------------------------------------------------------------
# Fit the scaler on the training rows only, then apply the same transform to the hold-out
# rows. StandardScaler returns NumPy arrays, so wrap them back into named DataFrames.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# --- Prune correlated features ----------------------------------------------
X_train = remove_correlated_features(X_train, threshold=0.75)
# Keep the hold-out frame on the same columns as the training frame.
# Assumes remove_correlated_features returns a DataFrame whose columns are a subset of its
# input's columns — TODO confirm against src.feature_selectors.
X_test = X_test[X_train.columns]

# XGBoost and the random forest are configured with the same keyword arguments,
# so they are spelled out once and unpacked into both constructors.
tree_kwargs = dict(n_estimators=1000, max_depth=5, n_jobs=-1, random_state=42)

xgb = XGBClassifier(**tree_kwargs)
rf = RandomForestClassifier(**tree_kwargs)
# CatBoost keeps its library defaults apart from silenced training output and the seed.
cat = CatBoostClassifier(verbose=0, random_state=42)

# Soft-voting ensemble over the three base learners (order: xgb, cat, rf).
base_learners = [('xgb', xgb), ('cat', cat), ('rf', rf)]
voting = VotingClassifier(estimators=base_learners, voting='soft')

# Feature selection driven by the voting ensemble defined above.
# NOTE(review): the meaning of method/forward/logs/prefix is defined in src.feature_selectors
# and is not visible here; the captured output ("Continue from iteration 2") suggests that
# logs/prefix let a run resume from saved state — confirm.
selector_options = dict(
    estimator=voting,
    method='random_improvement',
    verbose=1,
    forward=True,
    logs=True,
    prefix="GreedyGainSelector",
)
ggs = GreedyGainSelector(**selector_options)

# Run the search on the pruned training frame, then print the selector's support report.
ggs.fit(X_train, y_train)
ggs.print_support()

Continue from iteration 2
--------------------
Current best score: 1296.0
Current selected features: ['x101', 'x102']
Head of scores dataframe:
     column   score
91    x100  1340.0
94    x105  1330.0
195   x206  1298.0
160   x171  1296.0
92    x103  1296.0
Selected column: x100
Selected column score: 1340.0
--------------------
Current best score: 1340.0
Current selected features: ['x101', 'x102', 'x100']
Head of scores dataframe:
     column   score
93    x105  1362.0
281   x293  1350.0
8      x17  1342.0
106   x118  1340.0
482   x494  1340.0
Selected column: x105
Selected column score: 1362.0
--------------------
Current best score: 1362.0
Current selected features: ['x101', 'x102', 'x100', 'x105']
Head of scores dataframe:
     column   score
86     x95  1364.0
361   x374  1364.0
257   x270  1364.0
405   x418  1362.0
433   x446  1362.0
Selected column: x270
Selected column score: 1364.0
--------------------
Current best score: 1364.0
Current selected features: ['x101', 'x102', 'x1

In [3]:
# Persist the selector's support so later notebooks can reuse it without re-running the fit.
features = ggs.get_support()

# NOTE(review): the trailing tag fields (random_improvement, 0.25, 0.2, 5, True) are literals
# written here, not values read back from `ggs`; presumably they mirror selector settings —
# keep them in sync by hand or derive them from the selector.
features_path = Path(
    f'features/GreedyGainSelector_{voting.__class__.__name__}_{"random_improvement"}_{0.25}_{0.2}_{5}_{True}.pkl'
)
# Opening a file for writing does not create missing parent directories; create `features/`
# up front so the result of the (expensive) selection is not lost to a FileNotFoundError.
features_path.parent.mkdir(parents=True, exist_ok=True)
with features_path.open('wb') as f:
    pickle.dump(features, f)