In [1]:
import warnings
warnings.filterwarnings('ignore')

import ast
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import os
from pprint import pprint
import random
import time

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import numpy as np
import optuna
import pandas as pd
from scipy import stats
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif, RFECV
from sklearn.inspection import permutation_importance
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm.notebook import tqdm
from xgboost import XGBClassifier

# Do not truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Base name of this experiment; passed (optionally with a suffix) to
# evaluate_models, which uses it to name the results CSV it writes.
experiment_name = 'baseline'

In [2]:
# Read the four competition files in one pass; the tuple order matches the names on the left.
train, full_train, validation, test = (
    pd.read_csv(csv_path)
    for csv_path in ('smaller_train.csv', 'train.csv', 'validation.csv', 'test.csv')
)

# Quick sanity check of the row/column counts.
train.shape, test.shape

((68866, 38), (51012, 37))

In [3]:
# Reference ordering of the input columns: 'id' followed by the predictor columns.
# Used to put a selected subset of features back into a stable order
# (see the RFECV fallback, which filters this list).
features_list = ['id', 'Marital status', 'Application mode', 'Application order', 'Course', 
                 'Daytime/evening attendance', 'Previous qualification', 'Previous qualification (grade)', 
                 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", 
                 "Father's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 
                 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 
                 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 
                 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 
                 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 
                 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 
                 'Unemployment rate', 'Inflation rate', 'GDP']

In [4]:
# Name of the label column in the training data.
TARGET = 'Target'

In [5]:
# Replace the target labels with integer codes.
# `le` is kept so the codes can be mapped back to the original labels later.
le = LabelEncoder()
train[TARGET] = le.fit_transform(train[TARGET])

In [6]:
# Predictors are every column except the row identifier and the label.
non_feature_columns = ['id', TARGET]
X = train.drop(columns=non_feature_columns)
X_full = full_train.drop(columns=non_feature_columns)
y = train[TARGET]

# Shared stratified 10-fold splitter (shuffled, fixed seed) used by the evaluations below.
n_splits = 10
sk10 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [7]:
# Pipelines that standardise the features before the estimator.
# Each one gets a `name` attribute, which the rest of the notebook uses as the
# model's display name and as its key in the per-model feature dictionaries.
knn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=50)),
])
knn_pipeline.name = 'KNN'

# NOTE: the final step is keyed 'ridge' although the estimator is a LogisticRegression.
linear_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ridge', LogisticRegression()),
])
linear_pipeline.name = 'LR Pipeline'

# Disabled experiment: Nystroem kernel approximation feeding a Ridge model.
# ridge_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('nystroem', Nystroem(n_components=500, random_state=5)),
#     ('ridge', Ridge())
# ])
# ridge_pipeline.name = 'Nystroem Ridge'

In [8]:
# Candidate estimators compared throughout the notebook.
# A model is identified by its `name` attribute when it has one (the two pipelines),
# otherwise by its class name; stochastic estimators are seeded with 5.
models = [
    LogisticRegression(),
    linear_pipeline,
    LGBMClassifier(n_jobs=-1, random_state=5),
    XGBClassifier(random_state=5),
    RandomForestClassifier(random_state=5),
    ExtraTreesClassifier(random_state=5),
    HistGradientBoostingClassifier(random_state=5),
    CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100),
    knn_pipeline,
    GaussianNB(),
]

In [9]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name, max_workers=50):
    """Cross-validate every model on its own feature subset and tabulate the results.

    Parameters
    ----------
    models : list
        Estimators to evaluate. A model is identified by its ``name`` attribute
        when present, otherwise by its class name.
    X : pandas.DataFrame
        Feature matrix; each model is evaluated on ``X[features]``.
    y : array-like
        Target values.
    important_features : dict
        Maps a model name to the list of feature names to use for that model.
        Models that are missing from the dict, or mapped to an empty list, are
        not fitted and are reported with zeroed scores.
    cv_split : cross-validation splitter
        Passed as ``cv`` to ``cross_validate``.
    experiment_name : str
        Results are written to ``f'{experiment_name}_results.csv'``.
    max_workers : int, default 50
        Number of threads used to run the per-model evaluations concurrently.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by 'Model Test Accuracy' in descending order.
        An empty ``models`` list yields an empty frame that still has all columns.
    """
    result_columns = ['Model Name',
                      'Model Parameters',
                      'Model Train Accuracy',
                      'Model Test Accuracy',
                      'Model Test Accuracy Std',
                      'Model Time']

    def evaluate_model(alg):
        """Return the result row (a dict keyed by ``result_columns``) for one model."""
        model_name = getattr(alg, 'name', alg.__class__.__name__)
        features = important_features.get(model_name, [])

        # Nothing to fit on: report a zeroed row instead of failing.
        if len(features) == 0:
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train Accuracy': 0,
                'Model Test Accuracy': 0,
                'Model Test Accuracy Std': 0,
                'Model Time': "0 min 0.00 sec",
            }

        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='accuracy',
                                    return_train_score=True,
                                    n_jobs=-1)

        # Report the mean fit time per fold as "<m> min <s> sec".
        minutes, seconds = divmod(cv_results['fit_time'].mean(), 60)

        print(f'Done with {model_name}.')
        return {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train Accuracy': cv_results['train_score'].mean(),
            'Model Test Accuracy': cv_results['test_score'].mean(),
            'Model Test Accuracy Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

    # Submit every model first, then collect the results in submission order.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in tqdm(models, desc='Models')]
        results_list = [future.result()
                        for future in tqdm(futures, total=len(futures), desc='Progress')]

    # Passing the columns explicitly keeps the frame well-formed (and sortable)
    # even when there are no results at all.
    model_compare = (
        pd.DataFrame(results_list, columns=result_columns)
        .sort_values(by=['Model Test Accuracy'], ascending=False)
    )
    model_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return model_compare

In [None]:
# from sklearn.base import clone
# from scipy.stats import mode

# def evaluate_model(alg, idx, X, y, important_features, skf, model_predictions):
#     if hasattr(alg, 'name'):
#         model_name = alg.name
#     else:
#         model_name = alg.__class__.__name__
    
#     features = important_features.get(model_name, [])

#     if len(features) == 0:
#         print(f'Skipping {model_name} due to no important features.')
#         return {
#             'Model Name': model_name,
#             'Model Parameters': str(alg.get_params()),
#             'Model Train Accuracy': 0,
#             'Model Test Accuracy': 0,
#             'Model Test Accuracy Std': 0,
#             'Model Time': "0 min 0.00 sec",
#         }
    
#     fold_predictions = []

#     for train_index, test_index in skf.split(X[features], y):
#         X_train, X_test = X.iloc[train_index][features], X.iloc[test_index][features]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#         model = clone(alg)
#         model.fit(X_train, y_train)
#         preds = model.predict(X_test)
#         fold_predictions.append((test_index, preds))
    
#     model_predictions[model_name] = fold_predictions

#     # Cross-validation scores
#     cv_results = cross_validate(alg, 
#                                 X[features], 
#                                 y, cv=skf, 
#                                 scoring='accuracy', 
#                                 return_train_score=True, 
#                                 n_jobs=-1)

#     mean_fit_time = cv_results['fit_time'].mean()
#     minutes, seconds = divmod(mean_fit_time, 60)

#     result = {
#         'Model Name': model_name,
#         'Model Parameters': str(alg.get_params()),
#         'Model Train Accuracy': cv_results['train_score'].mean(),
#         'Model Test Accuracy': cv_results['test_score'].mean(),
#         'Model Test Accuracy Std': cv_results['test_score'].std(),
#         'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
#     }

#     print(f'Done with {model_name}.')
#     return result

# def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
#     Model_compare = pd.DataFrame(columns=['Model Name', 
#                                           'Model Parameters', 
#                                           'Model Train Accuracy', 
#                                           'Model Test Accuracy', 
#                                           'Model Test Accuracy Std', 
#                                           'Model Time'])
    
#     model_predictions = {}

#     def evaluate_model_and_store(alg, idx):
#         return evaluate_model(alg, idx, X, y, important_features, cv_split, model_predictions)

#     results_list = []

#     with ThreadPoolExecutor(max_workers=50) as executor:
#         futures = [executor.submit(evaluate_model_and_store, alg, idx) for idx, alg in enumerate(tqdm(models, desc='Models'))]
#         for future in tqdm(futures, total=len(futures), desc='Progress'):
#             result = future.result()
#             results_list.append(result)

#     model_compare = pd.DataFrame(results_list)

#     # Evaluate the mode-based model
#     mode_test_accuracies = []

#     for fold in range(n_splits):
#         fold_preds = []

#         for model_name, predictions in model_predictions.items():
#             fold_test_index, fold_predictions = predictions[fold]
#             if len(fold_preds) == 0:
#                 fold_preds = np.zeros((len(fold_test_index), len(models)))
            
#             fold_preds[:, list(model_predictions.keys()).index(model_name)] = fold_predictions.ravel()
        
#         mode_preds = mode(fold_preds, axis=1)[0].ravel()
#         mode_accuracy = accuracy_score(y.iloc[fold_test_index], mode_preds)
#         mode_test_accuracies.append(mode_accuracy)

#     # Create a DataFrame for mode-based model results
#     mode_model_result = pd.DataFrame([{
#         'Model Name': 'Mode Model',
#         'Model Parameters': 'N/A',
#         'Model Train Accuracy': 'N/A',
#         'Model Test Accuracy': np.mean(mode_test_accuracies),
#         'Model Test Accuracy Std': np.std(mode_test_accuracies),
#         'Model Time': 'N/A'
#     }])

#     # Concatenate the mode-based model results with the model comparison DataFrame
#     model_compare = pd.concat([model_compare, mode_model_result], ignore_index=True)

#     model_compare.sort_values(by=['Model Test Accuracy'], ascending=False, inplace=True)
#     # model_compare.to_csv(f'{experiment_name}_results.csv', index=False)

#     return model_compare

In [19]:
# Baseline: every model is evaluated on all predictor columns.
# Each model gets its own list, keyed by its display name.
baseline_features = {
    getattr(model, 'name', model.__class__.__name__): list(X.columns)
    for model in models
}

In [20]:
# Persist the baseline feature mapping as a pretty-printed Python literal.
with open('baseline_features.txt', 'w') as features_file:
    pprint(baseline_features, stream=features_file)

In [None]:
# # Read and parse the text file containing the best features so far for each model

# with open('baseline_features.txt', 'r') as f:
#     file_content = f.read()

# # Parse the contents
# baseline_features = ast.literal_eval(file_content)

In [None]:
%%time

# Cross-validate every model on the full feature set; results go to '<experiment_name>_results.csv'.
baseline_models = evaluate_models(
    models=models,
    X=X,
    y=y,
    important_features=baseline_features,
    cv_split=sk10,
    experiment_name=experiment_name,
)
baseline_models

### Mutual Information

In [10]:
# Fixed seed so the random reference columns are identical on every run.
np.random.seed(5)

# Draw the two noise columns (continuous first, then categorical).
n_rows = X.shape[0]
continuous_noise = np.round(np.random.uniform(-2, 2, n_rows), 6)
categorical_noise = np.random.randint(1, 8, n_rows)

# Work on a copy of the training frame and append the noise columns as a reference.
X_mi = train.copy()
X_mi['random_feature_continous'] = continuous_noise
X_mi['random_feature_categorical'] = categorical_noise
X_mi.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,random_feature_continous,random_feature_categorical
0,34908,1,1,2,9853,1,1,123.0,1,37,37,0,0,122.6,1,0,0,1,0,0,18,0,0,6,6,6,13.666667,0,0,6,6,6,14.0,0,7.6,2.6,0.32,2,-1.112027,5
1,42347,1,1,1,9500,1,1,129.0,1,38,38,5,10,122.8,0,0,0,1,0,0,20,0,0,7,9,7,13.394286,0,0,8,12,8,12.48125,0,15.5,2.8,-4.06,2,1.482929,4
2,4198,1,1,1,9254,1,1,125.0,1,19,38,9,7,118.5,1,0,0,1,0,0,19,0,0,6,10,4,12.0,0,0,6,14,3,11.0,0,7.6,2.6,0.32,1,-1.173123,1
3,42080,1,1,1,9238,1,1,129.0,1,19,1,5,5,118.0,1,0,0,1,0,1,19,0,0,6,6,6,11.833333,0,0,6,7,6,12.428571,0,13.9,-0.3,0.79,2,1.674444,4
4,35876,1,39,1,9085,1,1,133.1,1,37,37,90,90,128.2,0,0,0,1,0,0,27,0,0,5,9,0,0.0,0,0,5,5,0,0.0,0,7.6,2.6,0.32,0,-0.046355,2


In [11]:
# Per-column accumulator: one MI score is appended for every accepted run.
results = defaultdict(list)

# Settings swept by the mutual-information estimate.
n_neighbors_list = [3, 5, 7, 10, 20]
random_states = [5, 42, 100, 500]

In [12]:
# Estimate MI once per (random_state, n_neighbors) pair, seeds varying slowest.
mi_runs = [(seed, k) for seed in random_states for k in n_neighbors_list]

for random_state, n_neighbors in mi_runs:
    mi = mutual_info_classif(X_mi, y, n_neighbors=n_neighbors, random_state=random_state)
    mi_dict = dict(zip(X_mi.columns, mi))

    # Sanity check: X_mi still contains the target column, which should score highest.
    # Runs where it does not are left out of the accumulated results.
    target_is_top = mi_dict[TARGET] == max(mi_dict.values())
    if target_is_top:
        for feature, score in mi_dict.items():
            results[feature].append(score)

    print(f'Done with Random State - {random_state} and N Neighbors - {n_neighbors}')

Done with Random State - 5 and N Neighbors - 3
Done with Random State - 5 and N Neighbors - 5
Done with Random State - 5 and N Neighbors - 7
Done with Random State - 5 and N Neighbors - 10
Done with Random State - 5 and N Neighbors - 20
Done with Random State - 42 and N Neighbors - 3
Done with Random State - 42 and N Neighbors - 5
Done with Random State - 42 and N Neighbors - 7
Done with Random State - 42 and N Neighbors - 10
Done with Random State - 42 and N Neighbors - 20
Done with Random State - 100 and N Neighbors - 3
Done with Random State - 100 and N Neighbors - 5
Done with Random State - 100 and N Neighbors - 7
Done with Random State - 100 and N Neighbors - 10
Done with Random State - 100 and N Neighbors - 20
Done with Random State - 500 and N Neighbors - 3
Done with Random State - 500 and N Neighbors - 5
Done with Random State - 500 and N Neighbors - 7
Done with Random State - 500 and N Neighbors - 10
Done with Random State - 500 and N Neighbors - 20


In [13]:
# Mean MI per column over the accepted runs; columns without any score are omitted.
average_mi = {}
for feature, scores in results.items():
    if scores:
        average_mi[feature] = np.mean(scores)
average_mi

{'id': 0.0,
 'Marital status': 0.013442262015576068,
 'Application mode': 0.08394191132854331,
 'Application order': 0.015462607233643243,
 'Course': 0.11665440095529958,
 'Daytime/evening attendance': 0.01076037194790661,
 'Previous qualification': 0.03646985974847376,
 'Previous qualification (grade)': 0.08942962554132458,
 'Nacionality': 0.003204782032390052,
 "Mother's qualification": 0.03045067701828962,
 "Father's qualification": 0.03000903225737874,
 "Mother's occupation": 0.029072816360056198,
 "Father's occupation": 0.025343657492436077,
 'Admission grade': 0.10623912325536103,
 'Displaced': 0.011713871025973377,
 'Educational special needs': 2.061585571597746e-05,
 'Debtor': 0.03252988206454059,
 'Tuition fees up to date': 0.10459860728978057,
 'Gender': 0.05450395642820528,
 'Scholarship holder': 0.09003079610183537,
 'Age at enrollment': 0.09573084138049673,
 'International': 0.001094084490046865,
 'Curricular units 1st sem (credited)': 0.0012714334851582799,
 'Curricular u

In [14]:
# Rank the columns from highest to lowest average MI and show the ranking.
sorted_mi = list(average_mi.items())
sorted_mi.sort(key=lambda pair: pair[1], reverse=True)
print("Average MI scores:", sorted_mi)

Average MI scores: [('Target', 1.0416595590929518), ('Curricular units 2nd sem (approved)', 0.47945964510743355), ('Curricular units 2nd sem (grade)', 0.41384653272045036), ('Curricular units 1st sem (approved)', 0.40569120684681137), ('Curricular units 1st sem (grade)', 0.3636648046656956), ('Curricular units 2nd sem (evaluations)', 0.19595784270808994), ('Curricular units 1st sem (evaluations)', 0.18718797661322817), ('Course', 0.11665440095529958), ('Admission grade', 0.10623912325536103), ('Tuition fees up to date', 0.10459860728978057), ('Age at enrollment', 0.09573084138049673), ('Scholarship holder', 0.09003079610183537), ('Previous qualification (grade)', 0.08942962554132458), ('Application mode', 0.08394191132854331), ('Curricular units 2nd sem (enrolled)', 0.08003984164419586), ('Curricular units 1st sem (enrolled)', 0.07689919980437801), ('Gender', 0.05450395642820528), ('Previous qualification', 0.03646985974847376), ('Debtor', 0.03252988206454059), ("Mother's qualification

*According to the mutual information scores, the most informative features on a model-agnostic basis are Curricular units 2nd sem (approved) (0.47945964510743355), Curricular units 2nd sem (grade) (0.41384653272045036) and Curricular units 1st sem (approved) (0.40569120684681137).*

*As expected, `id` carries no information about the target (its MI score is 0.0). The score of 'International' is also close to zero (about 0.001), which suggests it contributes very little.*

In [15]:
# Noise floor: the larger of 0 and the average MI of either random reference column.
# Both random columns are considered, matching the permutation-importance selection.
random_feature_names = ('random_feature_continous', 'random_feature_categorical')
higher_threshold = max([0] + [average_mi.get(name, 0) for name in random_feature_names])

# Keep the columns that beat the noise floor, in descending MI order.
# Restricting to X's columns drops the target, 'id' and the random reference
# columns, none of which can be selected from X during evaluation.
mi_features_list = [
    feature
    for feature, score in sorted_mi
    if feature in X.columns and score > higher_threshold
]
mi_features_list

['Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 1st sem (evaluations)',
 'Course',
 'Admission grade',
 'Tuition fees up to date',
 'Age at enrollment',
 'Scholarship holder',
 'Previous qualification (grade)',
 'Application mode',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 1st sem (enrolled)',
 'Gender',
 'Previous qualification',
 'Debtor',
 "Mother's qualification",
 "Father's qualification",
 "Mother's occupation",
 "Father's occupation",
 'Application order',
 'GDP',
 'Unemployment rate',
 'Marital status',
 'Inflation rate',
 'Displaced',
 'Daytime/evening attendance',
 'Curricular units 2nd sem (without evaluations)',
 'Curricular units 1st sem (without evaluations)',
 'Nacionality',
 'Curricular units 1st sem (credited)',
 'Curricular units 2nd sem (credited)',
 'International']

In [16]:
# Every model uses the MI-selected features.
# Each model receives its own copy of the list (as in the baseline mapping), so
# editing one model's features later cannot silently change the others.
mi_features = {
    getattr(model, 'name', model.__class__.__name__): list(mi_features_list)
    for model in models
}

In [17]:
# Persist the MI feature mapping as a pretty-printed Python literal.
with open('mi_features.txt', 'w') as features_file:
    pprint(mi_features, stream=features_file)

In [18]:
%%time

# Cross-validate every model on the MI-selected features.
mi_models = evaluate_models(
    models=models,
    X=X,
    y=y,
    important_features=mi_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_mi',
)
mi_models

Models:   0%|          | 0/10 [00:00<?, ?it/s]

Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Done with RandomForestClassifier.
Done with LogisticRegression.
Done with KNN.
Done with ExtraTreesClassifier.
Done with GaussianNB.
Done with HistGradientBoostingClassifier.
Done with LR Pipeline.
Done with LGBMClassifier.
Done with XGBClassifier.
Done with CatBoostClassifier.
CPU times: total: 1.2 s
Wall time: 10min 19s


Unnamed: 0,Model Name,Model Parameters,Model Train Accuracy,Model Test Accuracy,Model Test Accuracy Std,Model Time
7,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.864085,0.831804,0.002995,3 min 1.60 sec
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.850381,0.830177,0.003783,0 min 6.81 sec
3,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.881312,0.829902,0.004177,1 min 41.24 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.853868,0.8298,0.003406,0 min 13.04 sec
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999979,0.825647,0.004533,0 min 26.15 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",1.0,0.822568,0.004865,0 min 27.94 sec
1,LR Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.817138,0.816687,0.00338,0 min 2.08 sec
8,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.800351,0.794107,0.003777,0 min 0.18 sec
9,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}",0.765632,0.765458,0.005129,0 min 0.16 sec
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.731853,0.731391,0.005357,0 min 3.76 sec


In [21]:
# Load the per-model "best features so far" mapping, stored on disk as a Python literal.
with open('feats_so_far_baseline.txt', 'r') as features_file:
    feats_so_far = ast.literal_eval(features_file.read())

In [22]:
%%time

# Cross-validate every model on its best feature set found so far.
best_models = evaluate_models(
    models=models,
    X=X,
    y=y,
    important_features=feats_so_far,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_best',
)
best_models

Models:   0%|          | 0/10 [00:00<?, ?it/s]

Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Done with LR Pipeline.
Done with GaussianNB.
Done with KNN.
Done with RandomForestClassifier.
Done with LGBMClassifier.
Done with HistGradientBoostingClassifier.
Done with ExtraTreesClassifier.
Done with XGBClassifier.
Done with LogisticRegression.
Done with CatBoostClassifier.
CPU times: total: 1.73 s
Wall time: 15min 9s


Unnamed: 0,Model Name,Model Parameters,Model Train Accuracy,Model Test Accuracy,Model Test Accuracy Std,Model Time
7,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.863816,0.831818,0.003082,3 min 6.66 sec
3,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.88106,0.830468,0.003754,2 min 27.61 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.853209,0.830381,0.003155,0 min 28.38 sec
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.850381,0.830177,0.003783,0 min 13.85 sec
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999979,0.825647,0.004533,0 min 51.58 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",1.0,0.822568,0.004865,0 min 55.14 sec
1,LR Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.817172,0.816702,0.003463,0 min 2.52 sec
8,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.800351,0.794107,0.003777,0 min 0.32 sec
9,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}",0.765632,0.765458,0.005129,0 min 0.32 sec
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.731853,0.731391,0.005357,0 min 6.25 sec


### Permutation Importance

In [23]:
# Fixed seed so the random reference columns are identical on every run.
np.random.seed(5)

# Draw the two noise columns (continuous first, then categorical).
n_rows = X.shape[0]
continuous_noise = np.round(np.random.uniform(-2, 2, n_rows), 6)
categorical_noise = np.random.randint(1, 8, n_rows)

# Copy of the predictors with the noise columns appended as a reference.
X_pi = X.copy()
X_pi['random_feature_continous'] = continuous_noise
X_pi['random_feature_categorical'] = categorical_noise
X_pi.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,random_feature_continous,random_feature_categorical
0,1,1,2,9853,1,1,123.0,1,37,37,0,0,122.6,1,0,0,1,0,0,18,0,0,6,6,6,13.666667,0,0,6,6,6,14.0,0,7.6,2.6,0.32,-1.112027,5
1,1,1,1,9500,1,1,129.0,1,38,38,5,10,122.8,0,0,0,1,0,0,20,0,0,7,9,7,13.394286,0,0,8,12,8,12.48125,0,15.5,2.8,-4.06,1.482929,4
2,1,1,1,9254,1,1,125.0,1,19,38,9,7,118.5,1,0,0,1,0,0,19,0,0,6,10,4,12.0,0,0,6,14,3,11.0,0,7.6,2.6,0.32,-1.173123,1
3,1,1,1,9238,1,1,129.0,1,19,1,5,5,118.0,1,0,0,1,0,1,19,0,0,6,6,6,11.833333,0,0,6,7,6,12.428571,0,13.9,-0.3,0.79,1.674444,4
4,1,39,1,9085,1,1,133.1,1,37,37,90,90,128.2,0,0,0,1,0,0,27,0,0,5,9,0,0.0,0,0,5,5,0,0.0,0,7.6,2.6,0.32,-0.046355,2


In [26]:
%%time

# Same fold configuration as the main evaluation splitter.
perm_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

# One list per model; each entry is that model's mean permutation importances for one fold.
perm_importances = {getattr(model, 'name', model.__class__.__name__): [] for model in models}

for i, (train_idx, test_idx) in enumerate(perm_cv.split(X_pi, y)):
    X_train, X_test = X_pi.iloc[train_idx], X_pi.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    for model in models:
        model_name = getattr(model, 'name', model.__class__.__name__)

        # Fit a fresh clone so the shared estimators in `models` stay unfitted.
        # Fitting them in place leaks state into later cells (for example, the
        # parameters reported by get_params() can change once a model is fitted).
        fold_model = clone(model)
        fold_model.fit(X_train, y_train)

        # Importance = drop in held-out accuracy when a column is shuffled.
        result = permutation_importance(fold_model, X_test, y_test, n_repeats=5, random_state=5, scoring='accuracy')
        perm_importances[model_name].append(result.importances_mean)
        print(f'Done with {model_name}.')

    print(f'Done with Fold {i+1}', end='\n\n')

Done with LogisticRegression.
Done with LR Pipeline.
Done with LGBMClassifier.
Done with XGBClassifier.
Done with RandomForestClassifier.
Done with ExtraTreesClassifier.
Done with HistGradientBoostingClassifier.
Done with CatBoostClassifier.
Done with KNN.
Done with GaussianNB.
Done with Fold 1

Done with LogisticRegression.
Done with LR Pipeline.
Done with LGBMClassifier.
Done with XGBClassifier.
Done with RandomForestClassifier.
Done with ExtraTreesClassifier.
Done with HistGradientBoostingClassifier.
Done with CatBoostClassifier.
Done with KNN.
Done with GaussianNB.
Done with Fold 2

Done with LogisticRegression.
Done with LR Pipeline.
Done with LGBMClassifier.
Done with XGBClassifier.
Done with RandomForestClassifier.
Done with ExtraTreesClassifier.
Done with HistGradientBoostingClassifier.
Done with CatBoostClassifier.
Done with KNN.
Done with GaussianNB.
Done with Fold 3

Done with LogisticRegression.
Done with LR Pipeline.
Done with LGBMClassifier.
Done with XGBClassifier.
Done 

In [27]:
%%time

# Average the importances across folds and export one CSV per model.
# The path is built with os.path.join (portable, and identical to the path the
# next cell reads from); the output directory is created when it is missing.
directory = 'permutation_importances'
os.makedirs(directory, exist_ok=True)

for model_name, importances in perm_importances.items():
    avg_importance = np.mean(importances, axis=0)
    importance_df = (
        pd.DataFrame({'Feature': X_pi.columns, 'Importance': avg_importance})
        .sort_values(by='Importance', ascending=False)
    )
    importance_df.to_csv(os.path.join(directory, f'{model_name}_permutation_importance.csv'), index=False)

print('Done with Permuation Importances', end='\n\n')

Done with Permuation Importances

CPU times: total: 0 ns
Wall time: 31.1 ms


In [28]:
directory = 'permutation_importances'

# Random reference columns; a real feature must beat both of them to be kept.
random_feature_names = ('random_feature_continous', 'random_feature_categorical')

# Maps each model name to the features whose importance exceeds the noise threshold.
perm_important_features = {}

for model in models:
    model_name = getattr(model, 'name', model.__class__.__name__)
    print(f'Model: {model_name}')

    csv_path = os.path.join(directory, f'{model_name}_permutation_importance.csv')
    if not os.path.exists(csv_path):
        print(f'CSV file for {model_name} not found.')
        continue

    df = pd.read_csv(csv_path)

    # Importance of each random reference column that is present in the file.
    random_importances = []
    for random_feature in random_feature_names:
        matches = df.loc[df['Feature'] == random_feature, 'Importance']
        if not matches.empty:
            print(matches.iloc[0])
            random_importances.append(matches.iloc[0])

    # Threshold: the larger of 0 and the most important random column.
    threshold = max([0] + random_importances)
    print(f'Threshold: {threshold}')

    # Keep only the features that are strictly more important than the threshold.
    important_feats_filtered = df[df['Importance'] > threshold]['Feature'].tolist()

    # # Reorder important_feats based on the predefined features_list
    # important_feats_ordered = [feat for feat in features_list if feat in important_feats_filtered]

    perm_important_features[model_name] = important_feats_filtered

print('Done getting important features dictionary')

Model: LogisticRegression
2.9050763151339257e-06
5.518379905271598e-05
Threshold: 5.518379905271598e-05
Model: LR Pipeline
-0.0002555691233316
3.48451021131213e-05
Threshold: 3.48451021131213e-05
Model: LGBMClassifier
0.0001481099751239
0.0001626357783974
Threshold: 0.0001626357783974
Model: XGBClassifier
-0.0010164625454159
-3.48478431489907e-05
Threshold: 0
Model: RandomForestClassifier
-0.0002439547218406
8.6985718813093e-06
Threshold: 8.6985718813093e-06
Model: ExtraTreesClassifier
-0.0003572246552253
-3.1946562114266736e-05
Threshold: 0
Model: HistGradientBoostingClassifier
0.0002700938723605
-0.0001277917305288
Threshold: 0.0002700938723605
Model: CatBoostClassifier
0.000145206796449
9.2932712387479e-05
Threshold: 0.000145206796449
Model: KNN
-8.711918617465122e-05
-8.422022447799104e-05
Threshold: 0
Model: GaussianNB
6.679714629911793e-05
5.227408406149436e-05
Threshold: 6.679714629911793e-05
Done getting important features dictionary


In [29]:
# Persist the permutation-importance feature mapping as a pretty-printed Python literal.
with open('perm_important_features.txt', 'w') as features_file:
    pprint(perm_important_features, stream=features_file)

In [30]:
%%time

# Cross-validate every model on its permutation-importance-selected features.
pi_models = evaluate_models(
    models=models,
    X=X,
    y=y,
    important_features=perm_important_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_pi',
)
pi_models

Models:   0%|          | 0/10 [00:00<?, ?it/s]

Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Done with HistGradientBoostingClassifier.
Done with LGBMClassifier.
Done with XGBClassifier.
Done with LogisticRegression.
Done with LR Pipeline.
Done with GaussianNB.
Done with CatBoostClassifier.
Done with ExtraTreesClassifier.
Done with RandomForestClassifier.
Done with KNN.
CPU times: total: 1.48 s
Wall time: 11min 1s


Unnamed: 0,Model Name,Model Parameters,Model Train Accuracy,Model Test Accuracy,Model Test Accuracy Std,Model Time
7,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.865008,0.831368,0.003657,3 min 10.43 sec
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.850076,0.830918,0.004411,0 min 10.70 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.852324,0.830511,0.004106,0 min 12.61 sec
3,XGBClassifier,"{'objective': 'multi:softprob', 'use_label_enc...",0.879439,0.829887,0.003148,1 min 56.38 sec
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999792,0.824471,0.003817,0 min 33.40 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.99868,0.821116,0.003797,0 min 31.36 sec
1,LR Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.817109,0.816702,0.003474,0 min 2.53 sec
8,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.804162,0.798725,0.002983,0 min 0.20 sec
9,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}",0.776679,0.776581,0.003498,0 min 0.19 sec
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.73684,0.737723,0.006121,0 min 4.03 sec


- RFECV

In [None]:
%%time

# Recursive feature elimination with cross-validation (RFECV), run per model on
# the features that model has kept so far (feats_so_far).
# Result: rfecv_features maps model name -> list of selected feature names.
rfecv_features = {}

for model in models:
    # Use the model's custom `name` attribute when it has one, else its class name.
    model_name = getattr(model, 'name', model.__class__.__name__)

    features = feats_so_far[model_name]

    # In case no feature had any importance for this model, go to the next model.
    if len(features) == 0:
        continue

    X_rfecv = X[features]

    print(f'Starting with {model_name}')

    # Only the RFECV fit is guarded, and the real error text is reported, so an
    # unrelated ValueError is no longer mislabelled as a missing
    # coef_ / feature_importances_ attribute.
    try:
        # Create the RFECV object and rank each feature
        selector = RFECV(model, cv=sk10, step=1, scoring='accuracy', verbose=2)
        selector = selector.fit(X_rfecv, y)
    except ValueError as err:
        # Fall back to the model's current features, ordered as in features_list.
        features_filtered = [feat for feat in features_list if feat in features]
        rfecv_features[model_name] = features_filtered
        print(f'RFECV skipped for {model_name}, keeping its current features: {err}', end='\n\n')
        continue

    # support_ is the boolean mask of the columns RFECV decided to keep.
    rfecv_features[model_name] = list(X_rfecv.columns[selector.support_])

    print(f'Done with {model_name}', end='\n\n')

In [None]:
# Write the pretty-printed RFECV feature dictionary to a text file.
with open('rfecv_features.txt', 'w') as features_file:
    pprint(rfecv_features, stream=features_file)

In [None]:
%%time

# Score every model on its RFECV-selected features; the run is tagged
# '<experiment_name>_rfecv'. evaluate_models is defined earlier in the notebook
# (not shown in this section).
# NOTE(review): this uses the sk3 splitter while the permutation-importance
# evaluation uses sk10 -- confirm the difference is intentional, since the two
# result tables are then not scored on the same folds.
rfecv_models = evaluate_models(models, X, y, rfecv_features, sk3, f'{experiment_name}_rfecv')
rfecv_models

### SFS

In [None]:
%%time

# Backward floating sequential feature selection (mlxtend SFS), run per model on
# the features kept by RFECV.
# Result: sfs_features maps model name -> list of selected feature names.
sfs_features = {}

for model in models:
    # Use the model's custom `name` attribute when it has one, else its class name.
    model_name = getattr(model, 'name', model.__class__.__name__)

    # Only the dictionary lookup is guarded: a KeyError raised later (for example
    # by X[features] or inside the SFS fit) must surface instead of being
    # reported as a missing dictionary entry.
    if model_name not in rfecv_features:
        print(f'{model_name} not in the dictionary.')
        continue

    # features = feats_so_far[model_name]
    features = rfecv_features[model_name]

    # In case no feature had any importance for this model, go to the next model.
    if len(features) == 0:
        continue

    X_sfs = X[features]

    print(f'Running backward feature selection with {model_name}')

    sfs = SFS(model,
        k_features='best',
        forward=False,
        floating=True,
        scoring='accuracy',
        verbose=2,
        n_jobs=-1,
        cv=sk3)

    sfs = sfs.fit(X_sfs, y)

    # k_feature_idx_ holds the positional indices of the chosen columns of X_sfs;
    # translate them back to column names.
    selected_sfs_idx = list(sfs.k_feature_idx_)
    sfs_features[model_name] = list(X_sfs.columns[selected_sfs_idx])

    print(f'Done with {model_name}', end='\n\n')

# LGBM 

In [None]:
# Write the pretty-printed SFS feature dictionary to a text file.
with open('sfs_features_gb.txt', 'w') as features_file:
    pprint(sfs_features, stream=features_file)

In [None]:
%%time

# Score every model on its SFS-selected features with the sk3 splitter; the run
# is tagged '<experiment_name>_sfs'. evaluate_models is defined earlier in the
# notebook (not shown in this section).
sfs_models = evaluate_models(models, X, y, sfs_features, sk3, f'{experiment_name}_sfs')
sfs_models

- Single Best Model

In [None]:
# Fit the chosen single model on X_full with the TARGET column of full_train as
# labels (X_full and TARGET are defined earlier in the notebook).
# Alternative candidate kept for reference:
# model = LGBMClassifier(n_jobs=-1, random_state=5)
# NOTE(review): fit() receives no eval_set here, so early_stopping_rounds
# presumably has no effect -- confirm against the CatBoost documentation.
model = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100)

model.fit(X_full, full_train[TARGET])

In [None]:
# Predict on the validation set, using the columns of X as the feature set.
preds = model.predict(validation[X.columns])
# Label decoding is disabled here; presumably the model was fitted on the raw
# target labels -- confirm.
# preds = le.inverse_transform(preds)
# Wrap the predictions in a one-column frame for inspection.
preds_df = pd.DataFrame(preds, columns=['Target_pred'])
preds_df.head()

In [None]:
# Accuracy of the single model's predictions against the validation 'Target' column.
validation_score = accuracy_score(validation['Target'], preds)
validation_score

In [None]:
# Predict on the test set with the same feature columns that are used for the
# validation predictions (X.columns). Selecting "every column except id" only
# matches the model's inputs while X happens to contain exactly those columns.
pred = model.predict(test[X.columns])

# pred = le.inverse_transform(pred)

pred_df = pd.DataFrame(pred, columns=['Target'])

# Pair each prediction with its test id; both frames share the same positional index.
submission_df = pd.concat([test['id'], pred_df], axis=1)

submission_df.to_csv('cat_10cv_baseline_0.831818.csv', index=False)

- Take mode of all predictions

In [None]:
# Members of the voting ensemble. linear_pipeline and knn_pipeline are defined
# earlier in the notebook; the seeded estimators all use random_state=5.
model1 = LogisticRegression()
model2 = linear_pipeline
model3 = LGBMClassifier(n_jobs=-1, random_state=5)
model4 = XGBClassifier(random_state=5)
model5 = RandomForestClassifier(random_state=5)
model6 = ExtraTreesClassifier(random_state=5)
model7 = HistGradientBoostingClassifier(random_state=5)
model8 = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100)
model9 = knn_pipeline
model10 = GaussianNB()

In [None]:
# Names of the per-model prediction files written by get_preds, in first-write order.
csv_files = []

def get_preds(model):
    """Fit `model` on (X, y) and save its decoded validation predictions to CSV.

    The predictions for ``validation[X.columns]`` are decoded with
    ``le.inverse_transform`` and written as a single ``Target_pred`` column to
    ``'<model name>_preds.csv'``. The file name is recorded in the module-level
    ``csv_files`` list once, so re-running this function for the same model does
    not make that model count twice when the files are combined later.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict``
        Its ``name`` attribute is used for the file name when present,
        otherwise its class name.

    Returns
    -------
    None
    """
    # Get model name
    model_name = getattr(model, 'name', model.__class__.__name__)

    model.fit(X, y)

    # Flatten first: some estimators return predictions as an (n, 1) column
    # vector, while the label encoder expects a 1-D array of codes.
    preds = np.ravel(model.predict(validation[X.columns]))
    preds = le.inverse_transform(preds)
    preds_df = pd.DataFrame(preds, columns=['Target_pred'])

    file_name = f'{model_name}_preds.csv'
    preds_df.to_csv(file_name, index=False)

    # Record the file only after it was written, and only once.
    if file_name not in csv_files:
        csv_files.append(file_name)

    return None

In [None]:
# Fit every ensemble member in turn and save its validation predictions.
for member in (model1, model2, model3, model4, model5,
               model6, model7, model8, model9, model10):
    get_preds(member)

In [None]:
# Show which prediction files have been recorded so far.
csv_files

In [None]:
# Majority vote: for every validation row take the most frequent label across
# the saved per-model prediction files.
# The import is kept in case cells outside this section still use `mode`;
# the vote itself no longer relies on it.
from scipy.stats import mode
# csv_files = ['LR Pipeline_preds.csv', 'LGBMClassifier_preds.csv', 'GaussianNB_preds.csv']

# Drop any repeated file names (keeping first-seen order) so no model votes twice.
vote_files = list(dict.fromkeys(csv_files))

# One uniquely named column of predictions per model file.
dfs = [pd.read_csv(file)['Target_pred'].rename(file) for file in vote_files]

combined_df = pd.concat(dfs, axis=1)

# pandas' row-wise mode is used because the saved predictions are decoded class
# labels (presumably strings), and recent SciPy releases only accept numeric
# input in scipy.stats.mode. Each row's modes come back sorted, so column 0 is
# the smallest of the most frequent labels -- the same tie-break scipy applied.
rows_modes = combined_df.mode(axis=1)[0]

result_array = rows_modes.to_numpy()

In [None]:
# Accuracy of the majority-vote predictions against the validation 'Target' column.
# Note that this reassigns validation_score, which the single-model cell also sets.
validation_score = accuracy_score(validation['Target'], result_array)
validation_score