In [14]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

# pd.reset_option("all")
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 2000

In [9]:
# Load the dataset from the working directory; the first CSV column
# becomes the frame's index instead of a regular data column.
df = pd.read_csv('final_df.csv', index_col = [0])

In [10]:
# Every column except the label column is treated as a model input.
feature_cols = df.columns[df.columns != 'target'].tolist()

# Feature matrix and label vector used by the cells below.
X = df.loc[:, feature_cols]
y = df.loc[:, 'target']

In [16]:
# Map the raw class labels to consecutive integer codes; `le` is kept so the
# codes can be translated back with `le.inverse_transform`.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.3,
)

In [22]:
# Display the integer-encoded training labels produced by the split above.
y_train

array([2, 1, 0, ..., 0, 0, 1])

In [None]:
class TqdmScore:
    """Accuracy scorer that advances a tqdm progress bar each time it is called.

    Instances are callables with the ``(estimator, X, y)`` signature, so one can
    be passed wherever scikit-learn accepts a scoring callable.

    Relies on ``accuracy_score`` (``sklearn.metrics``) and ``tqdm`` being
    imported in the notebook's import cell.
    """

    def __init__(self, n_splits):
        """Create the progress bar.

        Args:
            n_splits: Expected number of scorer calls; forwarded to ``tqdm`` as
                ``total``.
        """
        self.bar = tqdm(total=n_splits)

    def __call__(self, estimator, X, y):
        """Score ``estimator`` on ``(X, y)`` and tick the progress bar.

        Args:
            estimator: Fitted object exposing ``predict``.
            X: Samples to predict on.
            y: True labels for ``X``.

        Returns:
            The accuracy of ``estimator.predict(X)`` against ``y``.
        """
        score = accuracy_score(y, estimator.predict(X))
        self.bar.update()
        return score

    def close(self):
        """Finalize the progress bar once scoring is finished."""
        self.bar.close()

def select_features(X, y, model, min_features=30, k_best=10,
                    corr_threshold=0.75, random_state=42):
    """Run three feature-selection strategies and collect their results.

    Args:
        X: DataFrame of candidate features (one column per feature).
        y: Class labels aligned with the rows of ``X``.
        model: Estimator handed to ``RFECV``; it must expose ``coef_`` or
            ``feature_importances_`` after fitting.
        min_features: Smallest feature count RFECV may keep. Clamped to the
            number of columns in ``X``.
        k_best: Number of features kept by the mutual-information ranking.
            Clamped to the number of columns in ``X``.
        corr_threshold: A feature is put in the ``'correlation'`` set when its
            absolute correlation with at least one *other* feature exceeds
            this value.
        random_state: Seed for the mutual-information estimate so the ranking
            is reproducible between runs.

    Returns:
        dict keyed by strategy name. Each value is a dict with ``'features'``
        (list of column names) and ``'score'``:

        * ``'rfe'``: mean cross-validated accuracy of the selected subset.
        * ``'mutual_info'``: array of mutual-information scores for every
          column of ``X`` (not only the selected ones).
        * ``'correlation'``: mean, over the selected features, of each
          feature's strongest absolute correlation with another feature
          (``nan`` when nothing exceeds the threshold).
    """
    n_features = X.shape[1]
    feature_sets = {}

    # --- Recursive feature elimination with cross-validation -------------
    rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2),
                  scoring='accuracy',
                  min_features_to_select=min(min_features, n_features))
    rfecv.fit(X, y)

    selected_features_rfe = list(X.columns[rfecv.support_])
    # `grid_scores_` was removed from scikit-learn; `cv_results_` replaces it.
    # The fallback keeps the function working on old installations.
    if hasattr(rfecv, 'cv_results_'):
        mean_cv_scores = rfecv.cv_results_['mean_test_score']
    else:
        mean_cv_scores = rfecv.grid_scores_
    # RFECV keeps the subset with the highest mean CV score, so the maximum is
    # the score of the selected features (the last entry would instead be the
    # score obtained with *all* features).
    feature_sets['rfe'] = {'features': selected_features_rfe,
                           'score': float(np.max(mean_cv_scores))}

    # --- Mutual information ranking ---------------------------------------
    def _seeded_mutual_info(features, labels):
        # mutual_info_classif is stochastic; pin the seed for reproducibility.
        return mutual_info_classif(features, labels, random_state=random_state)

    mi_selector = SelectKBest(score_func=_seeded_mutual_info,
                              k=min(k_best, n_features))
    mi_selector.fit(X, y)

    selected_features_mi = list(X.columns[mi_selector.get_support()])
    feature_sets['mutual_info'] = {'features': selected_features_mi,
                                   'score': mi_selector.scores_}

    # --- Pairwise correlation ---------------------------------------------
    corr_matrix = X.corr().abs()
    # Blank out the diagonal: every feature correlates perfectly with itself,
    # which would otherwise make every column pass the threshold.
    off_diagonal = corr_matrix.mask(np.eye(len(corr_matrix), dtype=bool))
    strongest_corr = off_diagonal.max()
    high_corr_vars = strongest_corr[strongest_corr > corr_threshold].index.tolist()

    if high_corr_vars:
        corr_score = float(strongest_corr[high_corr_vars].mean())
    else:
        corr_score = float('nan')
    feature_sets['correlation'] = {'features': high_corr_vars,
                                   'score': corr_score}

    return feature_sets