In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

from src.feature_selectors import GreedyGainSelector, remove_correlated_features
warnings.filterwarnings("ignore")

In [2]:
# Load the training matrix (space-separated, no header row) and its labels.
X = pd.read_csv('data/x_train.txt', sep=' ', header=None)
# Name the columns x0..x499; the assignment raises if the file does not have
# exactly 500 columns, which doubles as a width check on the input.
X.columns = ['x' + str(i) for i in range(500)]
y = pd.read_csv('data/y_train.txt', header=None)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the scaler on the training split only, then apply it to both splits.
# StandardScaler returns plain arrays, so the frames are rebuilt with the
# original column names (and therefore a fresh 0..n-1 index).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

# The label frames still carry the shuffled row labels produced by the split,
# while the rebuilt feature frames are indexed 0..n-1. Reset the label index so
# any index-aligned pandas operation pairs each row with its own label.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Drop highly correlated features, deciding on the training split only.
X_train = remove_correlated_features(X_train, threshold=0.75)
# Keep the hold-out frame in the same feature space as the training frame.
# Assumes remove_correlated_features returns a frame whose columns are a subset
# of the input columns (the selector log shows the original x<i> names).
X_test = X_test[X_train.columns]

# Base learners for the soft-voting ensemble, all seeded for reproducibility.
xgb = XGBClassifier(n_estimators=1000, max_depth=5, n_jobs=-1, random_state=42)
cat = CatBoostClassifier(verbose=0, random_state=42)
rf = RandomForestClassifier(n_estimators=1000, max_depth=5, n_jobs=-1, random_state=42)

# Soft voting combines the predicted class probabilities of the three models.
voting = VotingClassifier(estimators=[('xgb', xgb), ('cat', cat), ('rf', rf)], voting='soft')

# Run the project's greedy selector with the ensemble as the scoring estimator
# and print the resulting support.
# NOTE(review): y_train is a one-column DataFrame, not a 1-D array; warnings are
# globally silenced in this notebook, so any shape warning would be hidden.
ggs = GreedyGainSelector(estimator=voting, method='top_1', verbose=1, forward=True, logs=True, prefix="GreedyGainSelector")
ggs.fit(X_train, y_train)
ggs.print_support()

Continue from iteration 0
--------------------
Current best score: -inf
Current selected features: []
Head of scores dataframe:
    column   score
92   x101  1220.0
91   x100  1178.0
93   x102  1176.0
96   x105  1172.0
94   x103  1116.0
Selected column: x101
Selected column score: 1220.0
--------------------
Current best score: 1220.0
Current selected features: ['x101']
Head of scores dataframe:
     column   score
91    x100  1296.0
95    x105  1288.0
92    x102  1266.0
234   x244  1242.0
187   x197  1236.0
Selected column: x100
Selected column score: 1296.0
--------------------
Current best score: 1296.0
Current selected features: ['x101', 'x100']
Head of scores dataframe:
     column   score
94    x105  1356.0
91    x102  1338.0
259   x270  1316.0
0       x9  1306.0
371   x382  1298.0
Selected column: x105
Selected column score: 1356.0
--------------------
Current best score: 1356.0
Current selected features: ['x101', 'x100', 'x105']
Head of scores dataframe:
     column   score
91 

In [3]:
# Save whatever the selector reports as its support to a pickle file.
features = ggs.get_support()
# NOTE(review): the literals in the file name ("top_1", 0.25, 0.2, 5, True) are
# hard-coded rather than derived from the run configuration — confirm they
# still describe this run before relying on the name.
features_path = f'features/GreedyGainSelector_{voting.__class__.__name__}_{"top_1"}_{0.25}_{0.2}_{5}_{True}.pkl'
# Create the output directory first so a missing folder cannot discard the
# result of the (expensive) selection run.
os.makedirs(os.path.dirname(features_path), exist_ok=True)
with open(features_path, 'wb') as f:
    pickle.dump(features, f)