In [1]:
import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostClassifier
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMClassifier
import numpy as np
import optuna

import pandas as pd

import random

from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

import time
from tqdm.notebook import tqdm

from xgboost import XGBClassifier

# Never truncate columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = None

# Label for this run; used as the prefix of the results file written later.
experiment_name = 'baseline'

In [2]:
# Load the train and test splits from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# original = pd.read_csv('original.csv', sep=';')

# Show both frame sizes as the cell output.
train.shape, test.shape

((76518, 38), (51012, 37))

In [3]:
# Name of the label column in the training frame; it is split off from the
# feature matrix before modelling.
TARGET = 'Target'

In [4]:
# # Change all values that appear in train but not in the original dataset to an arbitrary number (like -99999)
# train.loc[train['Application mode'].isin([4, 9, 12, 26]), 'Application mode'] = -99999
# train.loc[train['Course'].isin([39, 979]), 'Course'] = -99999
# train.loc[train['Previous qualification'].isin([36, 37]), 'Previous qualification'] = -99999
# train.loc[train["Mother's qualification"].isin([7, 8, 15, 28]), "Mother's qualification"] = -99999
# train.loc[train["Father's qualification"].isin([15, 23, 24]), "Father's qualification"] = -99999
# train.loc[train["Mother's occupation"].isin([11, 38, 101, 103, 127, 163, 172]), "Mother's occupation"] = -99999
# train.loc[train["Father's occupation"].isin([12, 13, 19, 22, 39, 96, 148, 191]), "Father's occupation"] = -99999
# train.shape

In [5]:
# Label vector, and the feature matrix with the row id and the label removed.
y = train[TARGET]
X = train.drop(columns=['id', TARGET])

# Shuffled, seeded stratified splitter shared by the model evaluations.
n_splits = 3
sk3 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [6]:
# Fit the encoder on the target and keep it around so predictions can be
# mapped back to the original labels with `le.inverse_transform`.
le = LabelEncoder().fit(y)
y_le = le.transform(y)
y_le

array([2, 0, 0, ..., 1, 0, 2])

In [7]:
# Define pipelines: both standardize the features before the estimator step.

# NOTE(review): the estimator step is keyed 'ridge' although it holds a
# LogisticRegression; the key is kept because it is part of the parameter
# names exposed by get_params().
linear_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ridge', LogisticRegression()),
])

# ridge_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('nystroem', Nystroem(n_components=500, random_state=5)),
#     ('ridge', Ridge())
# ])

knn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=50)),
])

# Attach display names; a Pipeline's class name alone would not tell the
# two pipelines apart.
linear_pipeline.name = 'LR Pipeline'
# ridge_pipeline.name = 'Nystroem Ridge'
knn_pipeline.name = 'KNN'

In [8]:
# Candidate estimators for the baseline comparison. The position in this list
# becomes the row index of the results table.
models = [
    LogisticRegression(),  # no scaling step, unlike linear_pipeline
    linear_pipeline,
    LGBMClassifier(n_jobs=-1, random_state=5),
    XGBClassifier(random_state=5),
    RandomForestClassifier(random_state=5),
    ExtraTreesClassifier(random_state=5),
    HistGradientBoostingClassifier(random_state=5),
    CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100),
    knn_pipeline,
    # ridge_pipeline,
]

In [9]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and rank the results.

    Every model is submitted to a thread pool and scored with
    ``cross_validate`` (accuracy, train and test). The per-model summaries are
    collected into one table, sorted by mean test accuracy (best first) and
    written to ``{experiment_name}_results.csv`` in the working directory.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``get_params()``. If an estimator has a ``name``
        attribute it is used as the model name, otherwise the class name is.
    X : DataFrame-like
        Feature data; it is indexed with a list of column names
        (``X[features]``).
    y : array-like
        Target passed unchanged to ``cross_validate``.
    important_features : dict
        Maps model name to the list of columns to train on. A model whose
        entry is missing or empty is skipped and reported with zeroed scores.
    cv_split
        Forwarded as the ``cv`` argument of ``cross_validate``.
    experiment_name : str
        Prefix of the CSV file the results are saved to.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns listed in ``result_columns``,
        sorted by 'Model Test Accuracy' in descending order. The frame is
        empty (but still has those columns) when ``models`` is empty.
    """
    result_columns = [
        'Model Name',
        'Model Parameters',
        'Model Train Accuracy',
        'Model Test Accuracy',
        'Model Test Accuracy Std',
        'Model Time',
    ]

    def evaluate_model(alg):
        """Score one estimator and return its summary row as a dict."""
        model_name = getattr(alg, 'name', alg.__class__.__name__)
        features = important_features.get(model_name, [])

        # Nothing to train on: report a zeroed row instead of failing.
        if len(features) == 0:
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train Accuracy': 0,
                'Model Test Accuracy': 0,
                'Model Test Accuracy Std': 0,
                'Model Time': "0 min 0.00 sec",
            }

        # NOTE(review): each worker thread requests n_jobs=-1 here, so models
        # evaluated concurrently presumably compete for the same cores and the
        # reported fit times are measured under that contention - confirm
        # before comparing 'Model Time' across models.
        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='accuracy',
                                    return_train_score=True,
                                    n_jobs=-1)

        # 'Model Time' is the mean fit time per fold, formatted as min/sec.
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train Accuracy': cv_results['train_score'].mean(),
            'Model Test Accuracy': cv_results['test_score'].mean(),
            'Model Test Accuracy Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in tqdm(models, desc='Models')]
        # Collect in submission order so the row index matches the position
        # of each model in `models`.
        results_list = [future.result()
                        for future in tqdm(futures, total=len(futures), desc='Progress')]

    # Passing the columns explicitly keeps the schema (and the sort key) in
    # place even when no model was evaluated.
    model_compare = (
        pd.DataFrame(results_list, columns=result_columns)
        .sort_values(by=['Model Test Accuracy'], ascending=False)
    )
    model_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return model_compare

In [10]:
# Baseline: every model trains on the full set of feature columns. Keys are
# the model's `name` attribute when it has one, otherwise its class name.
baseline_features = {}

for model in models:
    model_name = getattr(model, 'name', model.__class__.__name__)
    baseline_features[model_name] = X.columns.tolist()

In [11]:
%%time

baseline_models = evaluate_models(models, X, y_le, baseline_features, sk3, f'{experiment_name}')
baseline_models

Models:   0%|          | 0/9 [00:00<?, ?it/s]

Progress:   0%|          | 0/9 [00:00<?, ?it/s]

Done with LogisticRegression.
Done with ExtraTreesClassifier.
Done with LR Pipeline.
Done with XGBClassifier.
Done with HistGradientBoostingClassifier.
Done with LGBMClassifier.
Done with RandomForestClassifier.
Done with KNN.
Done with CatBoostClassifier.
CPU times: total: 719 ms
Wall time: 3min 23s


Unnamed: 0,Model Name,Model Parameters,Model Train Accuracy,Model Test Accuracy,Model Test Accuracy Std,Model Time
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.854564,0.830876,0.002428,0 min 9.51 sec
7,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.867861,0.830654,0.001682,1 min 24.56 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.85604,0.830602,0.002196,0 min 15.66 sec
3,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.888974,0.829504,0.001538,1 min 36.04 sec
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999987,0.826132,0.001643,0 min 30.78 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",1.0,0.8231,0.001469,0 min 34.61 sec
1,LR Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.817219,0.816514,0.001248,0 min 2.14 sec
8,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.798655,0.792389,0.00109,0 min 0.21 sec
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.742146,0.742178,0.008458,0 min 5.04 sec
