In [1]:
import warnings
warnings.filterwarnings('ignore')

import ast
from catboost import CatBoostClassifier
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMClassifier
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import optuna

import pandas as pd
from pprint import pprint

import random

from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.inspection import permutation_importance
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

import time
from tqdm.notebook import tqdm

from xgboost import XGBClassifier

# Show every column when a wide DataFrame is displayed
pd.set_option('display.max_columns', None)

# Tag used to build the experiment names passed to evaluate_models (prefix of the results CSV files)
experiment_name = 'baseline'

In [2]:
# Load the train and test CSV files from the working directory
train = pd.read_csv('train.csv')
# original = pd.read_csv('original.csv', sep=';')
test = pd.read_csv('test.csv')

# Display the (rows, columns) shape of each frame
train.shape, test.shape

((76518, 38), (51012, 37))

In [None]:
# Column names of the raw data (row id first, label excluded), one entry per line
features_list = [
    'id',
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Previous qualification (grade)',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Admission grade',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'Age at enrollment',
    'International',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

In [3]:
# Name of the label column in the training frame
TARGET = 'Target'

In [4]:
# Label Encode the target

# Fit the encoder on the raw labels, then overwrite the column with integer codes.
# `le` stays in scope so predictions can be decoded with inverse_transform later.
le = LabelEncoder().fit(train[TARGET])
train[TARGET] = le.transform(train[TARGET])

In [5]:
# Label vector and predictor matrix (the `id` column and the label are removed from X)
y = train[TARGET]
X = train.drop(columns=['id', TARGET])

# Seeded, shuffled stratified K-fold splitter passed to evaluate_models
n_splits = 3
sk3 = StratifiedKFold(shuffle=True, random_state=5, n_splits=n_splits)

In [6]:
# Define pipelines that standardize the features before fitting the estimator
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=50))
])

# ridge_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('nystroem', Nystroem(n_components=500, random_state=5)),
#     ('ridge', Ridge())
# ])

# NOTE(review): the final step is named 'ridge' but wraps a LogisticRegression.
# The name is left as-is because pipeline parameter keys ('ridge__*') derive from it.
linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', LogisticRegression()),
])

# Manually set pipeline names: evaluate_models reads `.name` when present and
# otherwise falls back to the class name, which would be 'Pipeline' for both objects
knn_pipeline.name = 'KNN'
# ridge_pipeline.name = 'Nystroem Ridge'
linear_pipeline.name = 'LR Pipeline'

In [7]:
# Estimators compared by evaluate_models; random_state=5 is set wherever it is passed explicitly.
# Plain LogisticRegression is fitted on unscaled features, `linear_pipeline` is its standardized variant.
models = [
    LogisticRegression(),
    linear_pipeline,
    LGBMClassifier(n_jobs=-1, random_state=5),
    XGBClassifier(random_state=5),
    RandomForestClassifier(random_state=5),
    ExtraTreesClassifier(random_state=5),
    HistGradientBoostingClassifier(random_state=5),
    CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100),
    knn_pipeline,
    # ridge_pipeline,
]

In [8]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and rank the results.

    Parameters
    ----------
    models : list of estimators
        Each estimator is identified by its ``name`` attribute when it has one,
        otherwise by its class name.
    X : pandas.DataFrame
        Predictor matrix; it is indexed with the feature list of each model.
    y : array-like
        Target vector passed to ``cross_validate``.
    important_features : dict
        Maps a model name to the list of column names to use for that model.
        A model with a missing or empty entry is skipped and reported with zeros.
    cv_split : cross-validation splitter
        Passed as ``cv`` to ``cross_validate``.
    experiment_name : str
        The ranked table is written to ``f'{experiment_name}_results.csv'``.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by 'Model Test Accuracy' in descending order.
        When ``models`` is empty the frame is empty but still has all columns.
    """
    # Fixed schema of the result table; passing it to the DataFrame constructor
    # keeps the sort below valid even when no model was evaluated.
    result_columns = ['Model Name',
                      'Model Parameters',
                      'Model Train Accuracy',
                      'Model Test Accuracy',
                      'Model Test Accuracy Std',
                      'Model Time']

    def evaluate_model(alg):
        """Return the result row (a dict keyed by `result_columns`) for one estimator."""
        model_name = getattr(alg, 'name', alg.__class__.__name__)
        features = important_features.get(model_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train Accuracy': 0,
                'Model Test Accuracy': 0,
                'Model Test Accuracy Std': 0,
                'Model Time': "0 min 0.00 sec",
            }

        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='accuracy',
                                    return_train_score=True,
                                    n_jobs=-1)

        # 'Model Time' is the mean fit time per fold, formatted as minutes and seconds
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train Accuracy': cv_results['train_score'].mean(),
            'Model Test Accuracy': cv_results['test_score'].mean(),
            'Model Test Accuracy Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    # Every model is submitted to a worker thread; note that each thread itself
    # calls cross_validate with n_jobs=-1. Results are collected in submission order.
    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in tqdm(models, desc='Models')]
        results_list = [future.result() for future in tqdm(futures, total=len(futures), desc='Progress')]

    model_compare = pd.DataFrame(results_list, columns=result_columns)

    model_compare = model_compare.sort_values(by=['Model Test Accuracy'], ascending=False)
    model_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return model_compare

In [9]:
# Baseline experiment: every model gets its own copy of the full column list of X.
# A model is keyed by its `name` attribute when set, otherwise by its class name.
baseline_features = {
    (model.name if hasattr(model, 'name') else model.__class__.__name__): list(X.columns)
    for model in models
}

In [21]:
# Persist the per-model feature lists as a pretty-printed Python dict literal
with open('baseline_features.txt', mode='w') as f:
    pprint(baseline_features, stream=f)

In [22]:
# Read and parse the text file containing the best features so far for each model
# (here: the baseline_features.txt file written by the previous cell)

with open('baseline_features.txt', 'r') as f:
    file_content = f.read()

# Parse the contents; ast.literal_eval only accepts Python literals, so no code in the file is executed
baseline_features = ast.literal_eval(file_content)

In [23]:
%%time

# Cross-validate every model on its baseline feature list; the ranked table is
# also written to f'{experiment_name}_test_results.csv' by evaluate_models
baseline_models = evaluate_models(models, X, y, baseline_features, sk3, f'{experiment_name}_test')
baseline_models

Models:   0%|          | 0/9 [00:00<?, ?it/s]

Progress:   0%|          | 0/9 [00:00<?, ?it/s]

Done with ExtraTreesClassifier.
Done with LR Pipeline.
Done with LGBMClassifier.
Done with LogisticRegression.
Done with RandomForestClassifier.
Done with HistGradientBoostingClassifier.
Done with XGBClassifier.
Done with KNN.
Done with CatBoostClassifier.
CPU times: total: 891 ms
Wall time: 3min 21s


Unnamed: 0,Model Name,Model Parameters,Model Train Accuracy,Model Test Accuracy,Model Test Accuracy Std,Model Time
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.854564,0.830876,0.002428,0 min 8.45 sec
7,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.868345,0.830863,0.001553,1 min 38.49 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.85604,0.830602,0.002196,0 min 18.11 sec
3,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.888556,0.829478,0.001629,1 min 37.97 sec
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999987,0.826132,0.001643,0 min 36.37 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",1.0,0.8231,0.001469,0 min 36.29 sec
1,LR Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.817082,0.816527,0.001217,0 min 3.24 sec
8,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.798956,0.792729,0.001035,0 min 0.23 sec
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.742146,0.742178,0.008458,0 min 4.54 sec


### Mutual Information

In [10]:
# Seed NumPy's global RNG so the random columns below are identical on every run
np.random.seed(5)

# Work on a copy of the full training frame (it still holds `id` and the target)
X_mi = train.copy()

# Append two random columns: a uniform draw from [-2, 2) rounded to 6 decimals,
# and a random integer between 1 and 7
X_mi['random_feature_continous'] = np.random.uniform(-2, 2, len(X)).round(6)
X_mi['random_feature_categorical'] = np.random.randint(1, 8, len(X))
X_mi.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,random_feature_continous,random_feature_categorical
0,0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.5,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,2,-1.112027,7
1,1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.6,0,0,6,9,0,0.0,0,11.1,0.6,2.02,0,1.482929,1
2,2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,16.2,0.3,-0.92,0,-1.173123,4
3,3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.59125,0,0,8,11,7,12.82,0,11.1,0.6,2.02,1,1.674444,2
4,4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,2,-0.046355,7


In [11]:
# Initialize parameters: mutual information is estimated once for every
# (random_state, n_neighbors) combination of the two lists below
random_states = [5, 42, 100, 500]
n_neighbors_list = [3, 5, 7, 10, 20]
# Maps a column name to the list of MI scores collected across those runs
results = defaultdict(list)

In [12]:
# Estimate mutual information once per (random_state, n_neighbors) pair
for random_state in random_states:
    for n_neighbors in n_neighbors_list:
        mi = mutual_info_classif(X_mi, y, random_state=random_state, n_neighbors=n_neighbors)
        mi_dict = dict(zip(X_mi.columns, mi))

        # Keep the scores of this run only when the target column itself has the
        # highest score; runs where it does not are discarded
        if max(mi_dict.values()) == mi_dict[TARGET]:
            for feature, score in mi_dict.items():
                results[feature].append(score)

        print(f'Done with Random State - {random_state} and N Neighbors - {n_neighbors}')

Done with Random State - 5 and N Neighbors - 3
Done with Random State - 5 and N Neighbors - 5
Done with Random State - 5 and N Neighbors - 7
Done with Random State - 5 and N Neighbors - 10
Done with Random State - 5 and N Neighbors - 20
Done with Random State - 42 and N Neighbors - 3
Done with Random State - 42 and N Neighbors - 5
Done with Random State - 42 and N Neighbors - 7
Done with Random State - 42 and N Neighbors - 10
Done with Random State - 42 and N Neighbors - 20
Done with Random State - 100 and N Neighbors - 3
Done with Random State - 100 and N Neighbors - 5
Done with Random State - 100 and N Neighbors - 7
Done with Random State - 100 and N Neighbors - 10
Done with Random State - 100 and N Neighbors - 20
Done with Random State - 500 and N Neighbors - 3
Done with Random State - 500 and N Neighbors - 5
Done with Random State - 500 and N Neighbors - 7
Done with Random State - 500 and N Neighbors - 10
Done with Random State - 500 and N Neighbors - 20


In [13]:
# Mean MI score per column over the runs that were kept; columns with no scores are left out
average_mi = {}
for feature, scores in results.items():
    if scores:
        average_mi[feature] = np.mean(scores)
average_mi

{'id': 7.679849504738102e-06,
 'Marital status': 0.012155729135268268,
 'Application mode': 0.08470962258196191,
 'Application order': 0.01547560178027565,
 'Course': 0.11611352967755037,
 'Daytime/evening attendance': 0.010976289200434652,
 'Previous qualification': 0.03631704209021589,
 'Previous qualification (grade)': 0.08941972794988672,
 'Nacionality': 0.002653404963438399,
 "Mother's qualification": 0.030461537361534043,
 "Father's qualification": 0.030216884804976883,
 "Mother's occupation": 0.028756029877175182,
 "Father's occupation": 0.02514503223812784,
 'Admission grade': 0.10609902918453513,
 'Displaced': 0.012521772564679002,
 'Educational special needs': 0.0008790835560824606,
 'Debtor': 0.032282490712448375,
 'Tuition fees up to date': 0.10519539217958995,
 'Gender': 0.05686732025162259,
 'Scholarship holder': 0.08967825584897862,
 'Age at enrollment': 0.09629672195737235,
 'International': 0.0006932129544406473,
 'Curricular units 1st sem (credited)': 0.00134637160274

In [14]:
# Rank the (column, average MI) pairs from highest to lowest score and print them
sorted_mi = list(average_mi.items())
sorted_mi.sort(key=lambda item: item[1], reverse=True)
print("Average MI scores:", sorted_mi)

Average MI scores: [('Target', 1.0420658859330165), ('Curricular units 2nd sem (approved)', 0.4784230420154224), ('Curricular units 2nd sem (grade)', 0.41468714721720107), ('Curricular units 1st sem (approved)', 0.4054222826092996), ('Curricular units 1st sem (grade)', 0.3625405765405194), ('Curricular units 2nd sem (evaluations)', 0.19494224180305916), ('Curricular units 1st sem (evaluations)', 0.18586588342161375), ('Course', 0.11611352967755037), ('Admission grade', 0.10609902918453513), ('Tuition fees up to date', 0.10519539217958995), ('Age at enrollment', 0.09629672195737235), ('Scholarship holder', 0.08967825584897862), ('Previous qualification (grade)', 0.08941972794988672), ('Application mode', 0.08470962258196191), ('Curricular units 2nd sem (enrolled)', 0.07903347111907727), ('Curricular units 1st sem (enrolled)', 0.07636127564918703), ('Gender', 0.05686732025162259), ('Previous qualification', 0.03631704209021589), ('Debtor', 0.032282490712448375), ("Mother's qualification"

*Based on the averaged mutual information scores, the most important features on a model-agnostic basis are Curricular units 2nd sem (approved) (0.4784230420154224), Curricular units 2nd sem (grade) (0.41468714721720107), and Curricular units 1st sem (approved) (0.4054222826092996).*

*As expected, `id` is not an important feature; the scores also show that 'International' is not important.*

In [15]:
# Noise baseline: a real feature must score above 0 and above BOTH random probe
# columns added to X_mi (previously only the categorical probe was considered)
noise_probes = ('random_feature_continous', 'random_feature_categorical')
higher_threshold = max(0, *(average_mi.get(probe, 0) for probe in noise_probes))

# Columns that can never be model inputs: the target, the row id (dropped from X)
# and the probes themselves. Leaving any of them in would make X[features] fail.
non_features = {TARGET, 'id', *noise_probes}

# List features with MI higher than the threshold, in descending MI order
mi_features_list = [
    feature
    for feature, score in sorted_mi
    if feature not in non_features and score > higher_threshold
]
mi_features_list

['Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 1st sem (evaluations)',
 'Course',
 'Admission grade',
 'Tuition fees up to date',
 'Age at enrollment',
 'Scholarship holder',
 'Previous qualification (grade)',
 'Application mode',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 1st sem (enrolled)',
 'Gender',
 'Previous qualification',
 'Debtor',
 "Mother's qualification",
 "Father's qualification",
 "Mother's occupation",
 "Father's occupation",
 'Application order',
 'Unemployment rate',
 'GDP',
 'Inflation rate',
 'Displaced',
 'Marital status',
 'Daytime/evening attendance',
 'Curricular units 2nd sem (without evaluations)',
 'Curricular units 1st sem (without evaluations)',
 'Nacionality',
 'Curricular units 2nd sem (credited)',
 'Curricular units 1st sem (credited)',
 'Educational special needs',
 'Inte

In [16]:
# Assign the same MI-selected feature list (one shared list object) to every model,
# keyed by the model's `name` attribute when set, otherwise by its class name
mi_features = {}

for model in models:
    mi_features[getattr(model, 'name', model.__class__.__name__)] = mi_features_list

In [17]:
# Persist the MI-selected feature lists as a pretty-printed Python dict literal
with open('mi_features.txt', mode='w') as f:
    pprint(mi_features, stream=f)

In [28]:
# Read and parse the text file containing the best features so far for each model
# NOTE(review): 'feats_so_far_baseline.txt' is not written by any cell visible in this
# notebook (the previous cell writes 'mi_features.txt') — confirm where this file
# comes from, otherwise this cell fails on a fresh checkout.

with open('feats_so_far_baseline.txt', 'r') as f:
    file_content = f.read()

# Parse the contents; ast.literal_eval only accepts Python literals, so no code in the file is executed
feats_so_far = ast.literal_eval(file_content)

In [29]:
%%time

# Cross-validate every model on the feature lists loaded into `feats_so_far`
# (note: not the `mi_features` dict built above); the ranked table is also
# written to f'{experiment_name}_mi_test_results.csv' by evaluate_models
mi_models = evaluate_models(models, X, y, feats_so_far, sk3, f'{experiment_name}_mi_test')
mi_models

Models:   0%|          | 0/9 [00:00<?, ?it/s]

Progress:   0%|          | 0/9 [00:00<?, ?it/s]

Done with LGBMClassifier.
Done with LR Pipeline.
Done with RandomForestClassifier.
Done with LogisticRegression.
Done with HistGradientBoostingClassifier.
Done with ExtraTreesClassifier.
Done with XGBClassifier.
Done with KNN.
Done with CatBoostClassifier.
CPU times: total: 547 ms
Wall time: 3min 1s


Unnamed: 0,Model Name,Model Parameters,Model Train Accuracy,Model Test Accuracy,Model Test Accuracy Std,Model Time
7,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.868044,0.831059,0.001719,1 min 35.88 sec
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.854564,0.830876,0.002428,0 min 7.04 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.85604,0.830602,0.002196,0 min 14.69 sec
3,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.887608,0.829792,0.001422,1 min 27.60 sec
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999987,0.826132,0.001643,0 min 31.91 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",1.0,0.823414,0.00109,0 min 30.79 sec
1,LR Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.817219,0.816514,0.001248,0 min 2.03 sec
8,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.798956,0.792729,0.001035,0 min 0.21 sec
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.739545,0.739382,0.005111,0 min 4.93 sec


- Single Best Model

In [None]:
# Refit a LightGBM classifier with the same settings as in `models` on the full training set
model = LGBMClassifier(n_jobs=-1, random_state=5)

# Last expression of the cell, so the fitted estimator is displayed
model.fit(X, y)

In [None]:
# Predict on the test predictors (everything except `id`) and decode the
# integer classes back to the original labels with the fitted encoder
test_features = test.drop(columns='id')
pred = le.inverse_transform(model.predict(test_features))

# Pair each test id with its predicted label (both frames share the same row index)
pred_df = pd.DataFrame({'Target': pred})
submission_df = test[['id']].join(pred_df)

submission_df.to_csv('lgbm_baseline_0.830876.csv', index=False)