In [None]:
!pip install catboost colorama openpyxl

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor, LGBMClassifier, log_evaluation, early_stopping
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.ensemble import AdaBoostRegressor, VotingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix
import numpy as np
from tqdm import tqdm
from copy import deepcopy
from sklearn.model_selection import LeaveOneOut
from tqdm import tqdm
from colorama import Fore, Back, Style
from fasteda import fast_eda
import shap

from sklearn.metrics import log_loss
pd.set_option('display.max_columns', 500)

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="threadpoolctl")

from openTSNE import TSNE

In [None]:
# colorama styling shortcuts for console output.
BOLD_TXT = Style.BRIGHT
GREEN_TXT = Style.BRIGHT + Fore.GREEN  # bright style combined with green foreground
RESET_TXT = Style.RESET_ALL

In [None]:
# Name of the label column that the classifiers predict.
TARGET = "AD_dx"

# Model input columns, grouped by column-name prefix. The order is kept stable
# because it defines the column order of the scaled frames.
FEATURES = [
    # norm_sums_* columns
    "norm_sums_6100",
    "norm_sums_6950",
    "norm_sums_8800",
    # norm_silence_count_* columns
    "norm_silence_count_1250",
    "norm_silence_count_6100",
    "norm_silence_count_7050",
    # lng_short_rats_* columns
    "lng_short_rats_250",
    "lng_short_rats_400",
    "lng_short_rats_1050",
    # demographics
    "age",
    "gender",
    # kw_* filler / IFD columns
    "kw_filler_rates",
    "kw_mean_IFD_imp",
    "kw_std_IFD_imp",
    # unia_* repetition-rate columns (overall, then one per tag)
    "unia_repetition_rate",
    "unia_ADJ_rep_rate",
    "unia_ADP_rep_rate",
    "unia_ADV_rep_rate",
    "unia_AUX_rep_rate",
    "unia_CCONJ_rep_rate",
    # "unia_DET_rep_rate",  (currently disabled)
    "unia_INTJ_rep_rate",
    "unia_NOUN_rep_rate",
    "unia_NUM_rep_rate",
    "unia_PART_rep_rate",
    "unia_PRON_rep_rate",
    "unia_PROPN_rep_rate",
    "unia_SCONJ_rep_rate",
    "unia_VERB_rep_rate",
    "unia_X_rep_rate",
    # mlm_* columns
    "mlm_sub_error_rate",
    "mlm_mean_ISED_imp",
    "mlm_std_ISED_imp",
    # kw_vague_* columns
    "kw_vague_term_rate",
    "kw_vague_utt_ratio",
]

In [None]:
def scale_data(train_data, test_data):
    """
    Min-max scale the FEATURES columns of both frames using statistics fitted on the train frame only.

    Parameters:
    - train_data: DataFrame holding the FEATURES columns and the TARGET column; the scaler is fitted on it.
    - test_data: DataFrame with the same columns; it is transformed with the train-fitted scaler.

    Returns:
    - (scaled_train_df, scaled_test_df): new DataFrames with the scaled FEATURES columns followed by
      the untouched TARGET column, keeping each input's original index.
    """
    scaler = MinMaxScaler()  # alternative noted by the author: StandardScaler

    def _reassemble(scaled_values, source_df):
        # Wrap the scaled array back into a frame aligned with the source index,
        # then re-attach the target column.
        feature_frame = pd.DataFrame(scaled_values, columns=FEATURES, index=source_df.index)
        return pd.concat([feature_frame, source_df[TARGET]], axis=1)

    train_values = scaler.fit_transform(train_data[FEATURES])
    test_values = scaler.transform(test_data[FEATURES])

    return _reassemble(train_values, train_data), _reassemble(test_values, test_data)

# NOTE(review): `train` and `test` are not defined in any cell visible here; presumably they come from
# an earlier load/split step. Confirm the notebook survives Restart & Run All.
# The unscaled frames are overwritten by their scaled versions under the same names.
train, test = scale_data(train, test)

In [None]:
def skf_cv(train_data, model, features, n_splits, n_repeats, kfold_seed, test=test):
    """
    Train a classifier with repeated stratified k-fold CV and score it on the folds and on `test`.

    Parameters:
    - train_data: DataFrame containing the feature columns and the TARGET column.
    - model: Unfitted classifier exposing fit / predict / predict_proba. A fresh deepcopy is
      fitted for every fold, so the object passed in is never fitted.
    - features: List of feature column names.
    - n_splits: Number of folds per repetition.
    - n_repeats: Number of times the k-fold split is repeated.
    - kfold_seed: random_state handed to RepeatedStratifiedKFold.
    - test: Hold-out DataFrame with the same columns. Defaults to the module-level `test`
      as it existed when this function was defined.

    Returns:
    - metrics: dict with the model class name, the label "RSKF", fold-averaged CV
      F1 / precision / recall / accuracy, and the same metrics on `test`. Test labels are a
      vote over all fitted models (mean of binary predictions strictly greater than 0.5).
    - oof_probas: second predict_proba column for every training row. Each repetition overwrites
      the previous one, so the values come from the last repetition only.
    - test_probas: second predict_proba column for every test row, averaged over all fits.
    """
    # Use the class name rather than parsing str(model): estimators without a
    # sklearn-style repr would otherwise never match the branches below.
    model_name = type(model).__name__
    n_fits = n_splits * n_repeats

    test_pred_binary_all = np.zeros((len(test), n_fits))
    oof_probas = np.zeros(len(train_data))
    test_probas = np.zeros(len(test))
    f1_scores, prec_scores, rec_scores, acc_scores = [], [], [], []

    print(model_name)

    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=kfold_seed)

    # Loop-invariant column selections.
    X_all, y_all = train_data[features], train_data[TARGET]
    X_test = test[features]

    for i, (train_idx, val_idx) in enumerate(tqdm(rskf.split(X_all, y_all), total=n_fits)):
        # split() yields positional indices, so select with .iloc. Label-based .loc is only
        # correct when the frame happens to carry a default 0..n-1 index.
        X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
        y_train, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]

        current_model = deepcopy(model)

        # NOTE(review): the boosted models use the validation fold as their eval/early-stopping set,
        # so their CV scores are measured on data that influenced training length.
        if model_name in ["LGBMRegressor", "LGBMClassifier"]:
            callbacks = [early_stopping(stopping_rounds=50)]
            current_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)
        elif model_name in ["XGBClassifier", "CatBoostClassifier"]:
            current_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=10000)
        else:
            current_model.fit(X_train, y_train)

        # numpy arrays are indexed positionally, matching the .iloc selection above.
        oof_probas[val_idx] = current_model.predict_proba(X_val)[:, 1]
        oof_pred_binary = current_model.predict(X_val)

        test_pred_binary_all[:, i] = current_model.predict(X_test)
        test_probas += current_model.predict_proba(X_test)[:, 1] / n_fits

        # zero_division=0 matches the test metrics below and silences the undefined-metric warning.
        f1_scores.append(f1_score(y_val, oof_pred_binary, zero_division=0))
        prec_scores.append(precision_score(y_val, oof_pred_binary, zero_division=0))
        rec_scores.append(recall_score(y_val, oof_pred_binary, zero_division=0))
        acc_scores.append(accuracy_score(y_val, oof_pred_binary))

    # Vote across all fitted models; an exact 50/50 tie resolves to class 0.
    test_pred_binary = (np.mean(test_pred_binary_all, axis=1) > 0.5).astype(int)

    metrics = {
        'model': model_name,
        "cv": "RSKF",
        'cv_f1': np.mean(f1_scores),
        'test_f1': f1_score(test[TARGET], test_pred_binary, average='binary', zero_division=0),
        'cv_precision': np.mean(prec_scores),
        'test_precision': precision_score(test[TARGET], test_pred_binary, average='binary', zero_division=0),
        'cv_recall': np.mean(rec_scores),
        'test_recall': recall_score(test[TARGET], test_pred_binary, average='binary', zero_division=0),
        'cv_accuracy': np.mean(acc_scores),
        'test_accuracy': accuracy_score(test[TARGET], test_pred_binary)
    }

    return metrics, oof_probas, test_probas



def loo_cv(train_data, model, features, test=test):
    """
    Train a model using LOO CV and compute F1, precision, recall, and accuracy metrics.

    Parameters:
    - train_data: DataFrame containing features and target.
    - model: Classifier object (e.g., LGBMClassifier, XGBClassifier, etc.). A fresh deepcopy is
      fitted for every left-out row, so the object passed in is never fitted.
    - features: List of feature column names.
    - test: Hold-out DataFrame with the same columns. Defaults to the module-level `test`
      as it existed when this function was defined.

    Returns:
    - metrics: dict with the model class name, the label "leave-one-out", CV metrics computed
      from the out-of-fold predictions on `train_data`, and the same metrics on `test`.
    - cm: confusion matrix of `train_data[TARGET]` against the out-of-fold binary predictions.
    - oof_probas: second predict_proba column for every training row.
    - test_probas: second predict_proba column for every test row, averaged over all fits.
    - test_pred_binary: test labels from a vote over all fitted models
      (mean of binary predictions strictly greater than 0.5).
    """
    # Use the class name rather than parsing str(model): estimators without a
    # sklearn-style repr would otherwise never match the branches below.
    model_name = type(model).__name__
    n_train = len(train_data)

    oof_full = np.zeros(n_train)
    oof_probas = np.zeros(n_train)
    test_probas = np.zeros(len(test))
    test_pred_binary_all = np.zeros((len(test), n_train))

    # Loop-invariant column selections.
    X_all, y_all = train_data[features], train_data[TARGET]
    X_test = test[features]

    loo = LeaveOneOut()

    for i, (train_idx, val_idx) in enumerate(tqdm(loo.split(X_all), total=n_train)):
        # split() yields positional indices, so select with .iloc. Label-based .loc is only
        # correct when the frame happens to carry a default 0..n-1 index.
        X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
        y_train, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]

        current_model = deepcopy(model)

        # NOTE(review): the boosted models use the single left-out row as their
        # eval/early-stopping set, so that row influences training length.
        if model_name == "LGBMClassifier":
            callbacks = [early_stopping(stopping_rounds=50, verbose=False)]
            current_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)
        elif model_name in ["XGBClassifier", "CatBoostClassifier"]:
            current_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        else:
            current_model.fit(X_train, y_train)

        # numpy arrays are indexed positionally, matching the .iloc selection above.
        oof_probas[val_idx] = current_model.predict_proba(X_val)[:, 1]
        oof_full[val_idx] = current_model.predict(X_val)

        test_pred_binary_all[:, i] = current_model.predict(X_test)
        test_probas += current_model.predict_proba(X_test)[:, 1] / n_train

    cm = confusion_matrix(y_all, oof_full)
    # Vote across all fitted models; an exact 50/50 tie resolves to class 0.
    test_pred_binary = (np.mean(test_pred_binary_all, axis=1) > 0.5).astype(int)

    # CV metrics are scored against the `train_data` argument, not the global `train`,
    # so the function is correct for any frame passed in.
    metrics = {
        'model': model_name,
        "cv": "leave-one-out",
        'cv_f1': f1_score(y_all, oof_full, average='binary', zero_division=0),
        'test_f1': f1_score(test[TARGET], test_pred_binary, average='binary', zero_division=0),
        'cv_precision': precision_score(y_all, oof_full, average='binary', zero_division=0),
        'test_precision': precision_score(test[TARGET], test_pred_binary, average='binary', zero_division=0),
        'cv_recall': recall_score(y_all, oof_full, average='binary', zero_division=0),
        'test_recall': recall_score(test[TARGET], test_pred_binary, average='binary', zero_division=0),
        'cv_accuracy': accuracy_score(y_all, oof_full),
        'test_accuracy': accuracy_score(test[TARGET], test_pred_binary)
    }

    return metrics, cm, oof_probas, test_probas, test_pred_binary

In [None]:
# XGBoost settings: depth-1 trees with a large estimator budget that
# early stopping (50 rounds) is expected to cut short.
xgb_params = {
    "objective": "binary:logistic",
    "max_depth": 1,
    "n_estimators": 10000,
    "early_stopping_rounds": 50,
    "colsample_bytree": 0.4,
    "random_state": 42,
}

# LightGBM settings: depth-1 trees; early stopping is supplied as a fit callback
# by the CV helpers rather than through these parameters.
lgb_params = {
    "lambda_l1": 0.001,
    "max_depth": 1,
    "verbose": -1,
    "colsample_bytree": 0.2,
    "min_data_in_leaf": 12,
    "n_estimators": 10000,
    "random_state": 42,
    "objective": "binary",
}

In [None]:
# Candidate classifiers, evaluated in this order by the CV loop.
models = [
    LogisticRegression(solver="lbfgs", tol=1e-4, C=0.35),
    SVC(C=0.6, probability=True),  # probability=True is required for predict_proba
    HistGradientBoostingClassifier(random_state=1, min_samples_leaf=12, max_depth=3),
    LGBMClassifier(**lgb_params),
    XGBClassifier(**xgb_params),
    RandomForestClassifier(random_state=1, max_depth=3),
    KNeighborsClassifier(n_neighbors=5),
]

In [None]:
# Evaluate every candidate model under both CV schemes and collect one metrics row per
# (model, scheme) pair. Rows are gathered in a list and turned into a DataFrame once,
# instead of growing the frame with pd.concat inside the loop.
metric_rows = []
cms = []  # leave-one-out confusion matrices, in the same order as `models`

for model in models:

    # loo_cv returns five values (metrics, cm, oof_probas, test_probas, test_pred_binary);
    # only the metrics and the confusion matrix are used here.
    loo_metrics, cm, _, _, _ = loo_cv(
        train_data=train,
        model=model,
        features=FEATURES,
    )
    cms.append(cm)
    metric_rows.append(loo_metrics)

    skf_metrics, _, _ = skf_cv(
        train_data=train,
        model=model,
        features=FEATURES,
        n_splits=10,
        n_repeats=10,
        kfold_seed=0,
    )
    metric_rows.append(skf_metrics)

result_df = pd.DataFrame(metric_rows)

In [None]:
# Leaderboard: highest CV F1 first, values rounded to three decimals, fresh 0..n-1 index.
(
    result_df
    .sort_values("cv_f1", ascending=False)
    .round(3)
    .reset_index(drop=True)
)