# Loading and preprocessing data

In [1]:
from pathlib import Path
import pandas as pd

# Both training files are read as space-separated text without a header row.
data_dir = Path("data")
read_opts = {"sep": " ", "header": None}

X = pd.read_csv(data_dir / "x_train.txt", **read_opts)
# The label frame is flattened into a 1-D array.
y = pd.read_csv(data_dir / "y_train.txt", **read_opts).values.ravel()

X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-2.619773,-2.619533,-1.19935,-1.083335,-1.00091,-0.366967,-2.164037,-1.210001,-0.658311,-1.489539,...,10.849925,10.343346,10.717519,7.709295,5.894554,12.416573,6.765269,16.243907,7.209524,8.082021
1,-1.415579,-1.782544,-2.88027,-1.958863,1.159968,0.27303,-1.628728,-0.175813,-0.916857,-0.570166,...,11.489417,5.195818,3.494627,5.529154,10.517576,15.697333,11.324938,12.18767,12.283861,5.032285
2,-2.745092,-1.382945,-1.626015,-1.28256,-0.663146,0.052349,-2.403322,-0.765073,-0.394354,-0.806624,...,13.934934,9.267515,4.705604,6.642557,14.658934,8.130767,7.194487,11.939354,11.65362,5.942778
3,0.618998,0.455364,-0.115081,0.64904,-0.862207,2.308504,0.526114,-1.094852,1.088656,-0.48121,...,12.021328,3.852231,11.059702,7.527268,7.25312,9.791136,6.089743,10.752796,5.778888,10.366363
4,-0.070694,-0.550509,-0.565556,-0.693065,-0.573089,-0.395862,0.00317,-0.981609,-0.505775,-0.75843,...,7.537788,11.229665,11.318915,6.622256,12.557882,5.52036,5.397359,13.152269,10.684779,9.816471


In [2]:
from src.utils import *
from src.feature_selection_methods import *

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Remove the columns flagged by the project helper at a 0.8 threshold
# (presumably an absolute pairwise correlation cut-off — confirm in src),
# then min-max scale what is left.
# NOTE(review): the scaler is fitted on the full data set before any
# cross-validation split, so validation folds influence the scaling.
# NOTE(review): X is rebound from a DataFrame to a NumPy array here, so this
# cell is not safe to run twice without re-running the loading cell.
to_drop = drop_highly_correlated_columns(X, threshold=0.8)
scaler = MinMaxScaler()
X = scaler.fit_transform(X.drop(columns=to_drop))
X.shape

(5000, 492)

# Models

Selected models for comparison:
- Random Forest
- SVM
- XGBoost
- CatBoost
- LightGBM

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Display name -> classifier class (not an instance); the evaluation cells
# instantiate each class themselves. Insertion order is the iteration order.
models = dict(
    [
        ('Random Forest', RandomForestClassifier),
        ('SVM', SVC),
        ('XGBoost', XGBClassifier),
        ('CatBoost', CatBoostClassifier),
        ('LightGBM', LGBMClassifier),
    ]
)

In [5]:
# numpy is imported explicitly: `np` was previously only available if one of
# the star imports from `src` happened to leak it into the namespace.
import numpy as np
from sklearn.metrics import make_scorer

# Wrap the project's `calculate_score` so it can be passed as `scoring=` to
# cross-validation; a higher score is treated as better.
custom_scorer = make_scorer(calculate_score, greater_is_better=True)

# Numbers of features to keep in the feature-selection sweep: 1, 2, ..., 49.
thresholds = np.arange(1, 50, 1)

## Comparing models with all features included

In [None]:
from tqdm import tqdm
from sklearn.model_selection import cross_val_score

# Baseline: 10-fold cross-validated score of every model on all remaining
# features, i.e. without any feature selection.
# NOTE(review): the feature-selection sweep subtracts a per-feature penalty
# from its scores; no penalty is applied here, so the two tables may not be
# directly comparable — confirm this is intended.

# Model-specific constructor extras, the same ones the sweep cell passes
# (intended to keep CatBoost / LightGBM from flooding the cell output).
quiet_kwargs = {'CatBoost': {'silent': True}, 'LightGBM': {'verbose': -1}}

# Collect plain records first and build the frame once, so `results_df` is
# only ever bound to a DataFrame.
baseline_rows = []
for model_name, model_class in tqdm(models.items(), desc='Models'):
    model = model_class(random_state=42, **quiet_kwargs.get(model_name, {}))
    scores = cross_val_score(model, X, y, cv=10, scoring=custom_scorer)
    baseline_rows.append({
        'Model name': model_name,
        'Feature Selection Method': 'None',
        'No Features': X.shape[1],
        'Mean Score': scores.mean(),
        'Std Score': scores.std(),
    })

results_df = pd.DataFrame(
    baseline_rows,
    columns=['Model name', 'Feature Selection Method', 'No Features', 'Mean Score', 'Std Score'],
)
results_df.to_csv('model_comparison_with_all_features.csv', index=False)
# Bare last expression -> rich table display instead of plain-text print.
results_df

## Comparing feature selection methods for different numbers of selected features

In [None]:
# Display name -> selection function. Each function is called below as
# f(X, y, n_features) and its return value is used as the feature matrix.
feature_selection_methods = {
    'XGBoost Feature Importance': xgb_feature_importance_selection,
    'SelectKBest F-classif': select_k_best,
    'SelectKBest mutual_info_classif': select_k_best_mutual_info_classif,
    'Random Forest Feature Importance': rf_feature_importance_selection,
    'Recursive Feature Elimination': rfe_selection,
}

# results[model][method] holds two parallel lists: the feature counts tried
# and, for each, the array of penalised per-fold scores.
results = {}
for model_name in models:
    results[model_name] = {
        method_name: {'No Features': [], 'Score': []}
        for method_name in feature_selection_methods
    }

# Constructor extras that differ per model; every model gets random_state=42.
extra_model_kwargs = {
    'LightGBM': {'verbose': -1},
    'CatBoost': {'silent': True},
}

# NOTE(review): features are selected on the full (X, y) before
# cross_val_score splits the data, so the selection step sees the validation
# folds and the scores may be optimistic.
for n_features in tqdm(thresholds, desc='Thresholds'):
    for method_name, method in feature_selection_methods.items():
        X_selected = method(X, y, n_features)
        for model_name, model_class in models.items():
            model = model_class(random_state=42, **extra_model_kwargs.get(model_name, {}))
            cv_scores = cross_val_score(model, X_selected, y, cv=10, scoring=custom_scorer)
            # Every fold score is reduced by 200 per selected feature
            # (presumably a cost per used feature — confirm the constant).
            scores = cv_scores - n_features * 200
            record = results[model_name][method_name]
            record['Score'].append(scores)
            record['No Features'].append(n_features)

## Saving data

In [8]:
# Flatten the nested `results` dict into one record per
# (model, selection method, number of features); 'Score' holds the sequence
# of per-fold scores stored by the sweep for that combination.
rows = []
for model_name, model_scores in results.items():
    for method_name, scores_dict in model_scores.items():
        scores = scores_dict['Score']
        n_features = scores_dict['No Features']
        for score, n_feature in zip(scores, n_features):
            rows.append({'Model name': model_name,
                         'Feature Selection Method': method_name,
                         'No Features': n_feature,
                         'Score': score})

df_results = pd.DataFrame(rows)
# The summary columns are named 'Mean Score' / 'Std Score' (with a space):
# the following cell sorts by 'Mean Score', and the all-features baseline
# table uses the same names. The previous 'Mean_Score' / 'Std_Score' names
# made that sort raise a KeyError.
df_results['Mean Score'] = df_results['Score'].apply(lambda fold_scores: np.mean(fold_scores))
df_results['Std Score'] = df_results['Score'].apply(lambda fold_scores: np.std(fold_scores))
# One row per individual fold score; the mean/std values are repeated on each
# of those rows and the original index labels are kept.
df_results = df_results.explode('Score')
df_results.to_csv('model_and_method_comparison.csv', index=False)
df_results

Unnamed: 0,Model name,Feature Selection Method,No Features,Mean Score,Std Score
0,Random Forest,SelectKBest F-classif,1,4972.257028,190.702056
1,Random Forest,SelectKBest F-classif,2,4716.144578,298.283779
2,Random Forest,SelectKBest F-classif,3,4452.032129,280.093296
3,Random Forest,SelectKBest F-classif,4,4312.144578,294.136138
4,Random Forest,SelectKBest F-classif,5,4199.807229,366.994165
...,...,...,...,...,...
1220,LightGBM,PCA,45,-3996.048193,264.093446
1221,LightGBM,PCA,46,-4056.176707,421.808231
1222,LightGBM,PCA,47,-4412.321285,368.975709
1223,LightGBM,PCA,48,-4515.710843,264.722102


In [9]:
# Rank all combinations from best to worst mean score.
df_results = df_results.sort_values(by='Mean Score', ascending=False)
df_results

Unnamed: 0,Model name,Feature Selection Method,No Features,Mean Score,Std Score
103,Random Forest,Random Forest Feature Importance,6,5867.244980,340.598522
348,SVM,Random Forest Feature Importance,6,5831.116466,251.325135
1083,LightGBM,Random Forest Feature Importance,6,5779.052209,370.177659
104,Random Forest,Random Forest Feature Importance,7,5751.309237,268.988752
102,Random Forest,Random Forest Feature Importance,5,5706.570281,266.590014
...,...,...,...,...,...
979,CatBoost,PCA,49,-4864.192771,381.220756
734,XGBoost,PCA,49,-4879.903614,347.853607
1224,LightGBM,PCA,49,-4892.016064,257.187709
243,Random Forest,PCA,48,-4972.305221,275.673399
