In [None]:
import logging
from datetime import datetime

import os

# Name of this notebook; used as the sub-folder for its log files.
current_file_name = "14_Mouse_Model"

# Run timestamp; also reused further down to name the artefacts written by this run.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# logging.basicConfig opens the log file right away and does not create missing
# parent folders, so make sure the directory exists before configuring logging.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file,filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

from scipy.spatial import distance
import plotly.express as px
from sklearn.cluster import KMeans
from umap import UMAP

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn import svm

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.linear_model import LogisticRegression

import xgboost as xgb

from imblearn.ensemble import BalancedRandomForestClassifier

from imblearn.ensemble import BalancedBaggingClassifier

import random

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

from numpy import mean, std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.stats as sm_stats

import textwrap
import shap

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.questions import *
from helpers.utils import *
from helpers.machine_learning import *

In [None]:
# Let pandas render up to 500 columns when a frame is displayed.
pd.set_option('display.max_columns', 500)

In [None]:
# Input metrics table, relative to the working directory. Assembled with
# os.path.join so the notebook does not depend on Windows path separators.
path_to_data = os.path.join("data", "13_Mouse_Data_Preparation", "metrics_data.csv")

In [None]:
# Load the metrics table and preview its first rows.
df = pd.read_csv(path_to_data)
df.head()

In [None]:
# Every column that is not one of the listed identifier / grouping / label columns
# is treated as a model feature.
non_feature_columns = {"respondent", "page_name", "variant", "respondent_num", "female", "indicator_fg"}
features = [column for column in df.columns if column not in non_feature_columns]

## NaN handling

In [None]:
# Fill the gaps in max_deviation with that column's median.
# NOTE(review): the median is computed on the whole frame, i.e. before the
# train/test split performed later in the notebook.
max_deviation_median = df["max_deviation"].median()
df["max_deviation"] = df["max_deviation"].fillna(max_deviation_median)

In [None]:
# Number of missing values per column, keeping only columns that have at least one.
null_counts = df.isnull().sum()
missing_values = null_counts[null_counts > 0]
missing_values

## Random state

In [None]:
# Seed must be between 0 and 2**32 - 1
# Draw a fresh seed (random.randint includes both endpoints), then print and log it.
# NOTE: the next cell overwrites random_state with a fixed value, so the number
# reported here is not the seed the rest of the notebook ends up using.
random_state = random.randint(0, 2**32 - 1)

print(random_state)

logging.info(f"random_state={random_state}")

In [None]:
# Fixed seed used for the rest of the notebook. It replaces the randomly drawn
# value from the previous cell, so log it as well to keep the run log truthful.
random_state = 2516557290

logging.info(f"random_state={random_state} (fixed override)")

In [None]:
# Seed both global generators: Python's `random` and NumPy's legacy global RNG.
# pandas' DataFrame.sample draws from the NumPy global RNG whenever it is called
# without an explicit random_state (as the shuffle cell further down does).
random.seed(random_state)
np.random.seed(random_state)

## Advanced analytics

In [None]:
# Display the frame (Jupyter renders the value of a cell's last expression).
df

In [None]:
# Non-null count of every column, per value of the label indicator_fg.
df.groupby("indicator_fg").count()

In [None]:
# Split the columns into categorical and continuous ones with the project helper
# (the detection criteria live in the helper and are not visible here), then
# display the categorical ones.
categorical_cols, continuous_cols = detect_categorical_columns(df)
categorical_cols

In [None]:
# Column roles for the advanced descriptive statistics:
# label, categorical features, columns to leave out, and the remaining continuous features.
aa_target = "indicator_fg"
aa_categorical_features = ['page_name', 'variant', 'female']
aa_remove = ['respondent', 'respondent_num', aa_target] + aa_categorical_features
aa_continuous_features = [column for column in df.columns if column not in aa_remove]

In [None]:
# Output workbook for the advanced descriptive statistics. Kept relative to the
# working directory, like the other data paths in this notebook, instead of
# pointing into one user's home folder; the target folder is created if missing.
aa_path = os.path.join("data", "14_Mouse_Model", "stats", "aa_mouse_df.xlsx")
os.makedirs(os.path.dirname(aa_path), exist_ok=True)

In [None]:
# Project helper: receives the label name, the continuous and categorical feature
# lists, the frame and aa_path.
# NOTE(review): presumably writes the statistics workbook to aa_path - confirm in helpers.
calculate_advanced_descriptive_stats(aa_target, aa_continuous_features, aa_categorical_features, df, aa_path)

## Train-Test Split

In [None]:
# Create test and train datasets, but keep all elaborations of the same respondent of the same variant in the same dataset

# Distinct respondent ids seen in each variant
is_fg_variant = df["variant"] == "FG"
is_h_variant = df["variant"] == "H"
unique_fg_respondents = df.loc[is_fg_variant, "respondent"].unique()
unique_h_respondents = df.loc[is_h_variant, "respondent"].unique()

print(len(unique_fg_respondents), len(unique_h_respondents))

# Hard-coded respondent ids forming the training part of each variant.
# Within one variant the train and test lists do not overlap; the same id may,
# however, appear in train for one variant and in test for the other
# (e.g. respondent_45 is in train_h_respondents and test_fg_respondents).
train_fg_respondents = ['respondent_43', 'respondent_26', 'respondent_35', 'respondent_31', 'respondent_53', 'respondent_21', 'respondent_22', 'respondent_50', 'respondent_42', 'respondent_55', 'respondent_54', 'respondent_16', 'respondent_9', 'respondent_105', 'respondent_37', 'respondent_58', 'respondent_38', 'respondent_51', 'respondent_106', 'respondent_15', 'respondent_52', 'respondent_25', 'respondent_12', 'respondent_56', 'respondent_46', 'respondent_36']
train_h_respondents = ['respondent_8', 'respondent_24', 'respondent_42', 'respondent_17', 'respondent_29', 'respondent_108', 'respondent_30', 'respondent_39', 'respondent_58', 'respondent_10', 'respondent_19', 'respondent_53', 'respondent_45', 'respondent_52', 'respondent_33', 'respondent_16', 'respondent_21', 'respondent_32', 'respondent_23', 'respondent_35', 'respondent_47', 'respondent_48', 'respondent_31', 'respondent_20']

# Echo the training lists to the cell output and to the run log.
print("train_fg_respondents:", train_fg_respondents)
print("train_h_respondents:", train_h_respondents)
logging.info(f"train_fg_respondents: {train_fg_respondents}")
logging.info(f"train_h_respondents: {train_h_respondents}")

# Hard-coded respondent ids forming the test part of each variant.
test_fg_respondents = ['respondent_104', 'respondent_18', 'respondent_34', 'respondent_40', 'respondent_45', 'respondent_48', 'respondent_49']
test_h_respondents = ['respondent_107', 'respondent_110', 'respondent_22', 'respondent_27', 'respondent_50', 'respondent_57', 'respondent_9']

# Echo the test lists to the cell output and to the run log.
print("test_fg_respondents:", test_fg_respondents)
print("test_h_respondents:", test_h_respondents)
logging.info(f"test_fg_respondents: {test_fg_respondents}")
logging.info(f"test_h_respondents: {test_h_respondents}")

# Save this split to file
# The four lists are written as plain Python assignments, one per line, into a
# file named after the run timestamp. os.makedirs(exist_ok=True) replaces the
# separate existence check, and os.path.join keeps the path portable.
split_dir = os.path.join("data", "14_Mouse_Model", "train_test_split")
os.makedirs(split_dir, exist_ok=True)
with open(os.path.join(split_dir, f"{dt_string}.py"), "w") as f:
    f.write("\n".join([f"train_fg_respondents = {train_fg_respondents}", f"train_h_respondents = {train_h_respondents}", f"test_fg_respondents = {test_fg_respondents}", f"test_h_respondents = {test_h_respondents}"]))

# Row subsets per (variant, split): a row is kept when its variant matches and
# its respondent is on the corresponding list.
fg_rows = df["variant"] == "FG"
h_rows = df["variant"] == "H"
train_fg = df[fg_rows & df["respondent"].isin(train_fg_respondents)]
train_h = df[h_rows & df["respondent"].isin(train_h_respondents)]
test_fg = df[fg_rows & df["respondent"].isin(test_fg_respondents)]
test_h = df[h_rows & df["respondent"].isin(test_h_respondents)]

# Stack both variants into the final train and test frames
df_to_train = pd.concat([train_fg, train_h])
df_to_test = pd.concat([test_fg, test_h])

print(len(df_to_train), len(df_to_test))

In [None]:
# Share of positive rows (indicator_fg == 1) in each split, printed as
# "<positives>/<rows> <ratio>" - train first, then test.
for split_frame in (df_to_train, df_to_test):
    positives = len(split_frame[split_frame['indicator_fg'] == 1])
    print(f"{positives}/{len(split_frame)} {positives / len(split_frame)}")

## Plots before preprocessing

In [None]:
# Descriptive statistics of the training features before preprocessing.
# The output path is relative to the working directory (like the other data
# paths of this notebook) and assembled with os.path.join, which also avoids
# backslash escape sequences inside the string literal.
# NOTE(review): calculate_descriptive_stats is a project helper; presumably it
# writes the workbook to the given path - confirm in helpers.
stats_before_path = os.path.join("data", "14_Mouse_Model", "stats", "mouse_before_preprocessing_df_to_train.xlsx")
os.makedirs(os.path.dirname(stats_before_path), exist_ok=True)
calculate_descriptive_stats('indicator_fg', features, df_to_train, stats_before_path)

In [None]:
# Project plotting helper, called with the label name, the feature list and the training frame.
# NOTE(review): judging by its name it draws box plots and histograms per feature;
# the meaning of the trailing True is defined in the helper - confirm there.
show_box_boxwithout_hist('indicator_fg', features, df_to_train, True)

In [None]:
# Correlation matrix (DataFrame.corr defaults) of all features plus the label,
# drawn as an annotated heatmap on a 120x96 inch figure.
corr_columns = features + ['indicator_fg']
df_corr = df_to_train[corr_columns].corr()

fig, ax = plt.subplots(figsize=(120, 96))
sns.heatmap(df_corr, ax=ax, annot=True, fmt=".3f")

## Undersampling / Oversampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Label vector and feature frame (every column except the label)
y_train = df_to_train['indicator_fg']
X_train = df_to_train.drop(columns='indicator_fg')

# Random undersampling with the notebook seed
rus = RandomUnderSampler(random_state=random_state)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Re-attach the label to the resampled features
df_random_underresampled = pd.DataFrame(X_resampled, columns=X_train.columns)
df_random_underresampled['indicator_fg'] = y_resampled

# Class counts before and after resampling
print("Original Distribution of Classes: ", df_to_train['indicator_fg'].value_counts())
print("New Distribution of Classes: ", df_random_underresampled['indicator_fg'].value_counts())

In [None]:
from imblearn.under_sampling import NearMiss

# Defining the NearMiss strategy (Version 3 is commonly used)
nm = NearMiss(version=3)

# NearMiss is distance based, so the identifier/text columns are dropped.
# The label is dropped as well: it is passed as y only and must not take part
# in the distance computation as a feature.
X_train = df_to_train.drop(["respondent", "page_name", "variant", "respondent_num", "indicator_fg"], axis=1)
y_train = df_to_train['indicator_fg']

# Applying NearMiss
X_resampled, y_resampled = nm.fit_resample(X_train, y_train)

# Creating a new DataFrame from the resampled data
df_nearmiss_undersampled = pd.DataFrame(X_resampled, columns=X_train.columns)
df_nearmiss_undersampled['indicator_fg'] = y_resampled

# NOTE(review): this frame no longer contains respondent/page_name/variant, which
# the "Select statistically significant features" cell indexes - confirm that
# path before switching `sampling` to "NearMiss".

# Now df_nearmiss_undersampled has a balanced indicator_fg
print("Original Distribution of Classes: ", df_to_train['indicator_fg'].value_counts())
print("New Distribution of Classes after NearMiss: ", df_nearmiss_undersampled['indicator_fg'].value_counts())

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Label vector and feature frame (every column except the label)
y_train = df_to_train['indicator_fg']
X_train = df_to_train.drop(columns='indicator_fg')

# Random oversampling with the notebook seed
ros = RandomOverSampler(random_state=random_state)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Re-attach the label to the resampled features
df_random_oversampled = pd.DataFrame(X_resampled, columns=X_train.columns)
df_random_oversampled['indicator_fg'] = y_resampled

# Class counts before and after resampling
print("Original Distribution of Classes: ", df_to_train['indicator_fg'].value_counts())
print("New Distribution of Classes: ", df_random_oversampled['indicator_fg'].value_counts())

In [None]:
from imblearn.over_sampling import SMOTE

# Defining the SMOTE strategy
smote = SMOTE(random_state=random_state)

# SMOTE interpolates between numeric neighbours, so the identifier/text columns
# are dropped. The label is dropped as well: it is passed as y only and must not
# be treated (or interpolated) as a feature.
X_train = df_to_train.drop(["respondent", "page_name", "variant", "respondent_num", "indicator_fg"], axis=1)
y_train = df_to_train['indicator_fg']

# Applying SMOTE to your training data
X_smoted, y_smoted = smote.fit_resample(X_train, y_train)

# Create a DataFrame from the SMOTEd data
df_smote_oversampled = pd.DataFrame(X_smoted, columns=X_train.columns)
df_smote_oversampled['indicator_fg'] = y_smoted

# NOTE(review): this frame no longer contains respondent/page_name/variant, which
# the "Select statistically significant features" cell indexes - confirm that
# path before switching `sampling` to "SMOTE".

# Now df_smote_oversampled has a balanced indicator_fg
print("Original Distribution of Classes: ", df_to_train['indicator_fg'].value_counts())
print("New Distribution of Classes with SMOTE: ", df_smote_oversampled['indicator_fg'].value_counts())


In [None]:
# Pick the resampling strategy applied to the training data (one active line).
# sampling = "RandomUnderSampler" # Accuracy around 0.6, but recall for class 1 is decent, often above 0.6; precision, however, is very low, below 0.1
# sampling = "NearMiss" # Accuracy below 0.5, but recall for class 1 is decent, often above 0.6; precision, however, is very low, below 0.2 but above 0.1
sampling = "RandomOverSampler" # High accuracy, even around 0.7-0.8; recall for class 1 is very low, often below 0.2 but above 0.1; precision is very low, below 0.2 but above 0.1
# sampling = "SMOTE" # Probably the best: accuracy around 80, precision and recall for class 1 both around 0.3

In [None]:
# Swap the training frame for the resampled variant named by `sampling`.
# Any other value leaves df_to_train untouched.
if sampling == "RandomUnderSampler":
    df_to_train = df_random_underresampled
elif sampling == "NearMiss":
    df_to_train = df_nearmiss_undersampled
elif sampling == "RandomOverSampler":
    df_to_train = df_random_oversampled
elif sampling == "SMOTE":
    df_to_train = df_smote_oversampled

## Normalize features

In [None]:
# Build a transformer that standardizes the feature columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

cols_to_transform = [name for name in features if name != "female"]

# Scale the listed columns, pass every other column through unchanged,
# and return pandas frames instead of arrays.
ct = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), cols_to_transform)],
    remainder='passthrough',
)
ct.set_output(transform="pandas")

print(df_to_train.shape, df_to_test.shape)

# The scaler is fitted on the training frame only; the test frame is
# transformed with the statistics learned from train.
df_to_train = ct.fit_transform(df_to_train)
df_to_test = ct.transform(df_to_test)

# Strip the "scaler__" / "remainder__" prefixes from the output column names
for scaled_frame in (df_to_train, df_to_test):
    scaled_frame.columns = (
        scaled_frame.columns
        .str.replace('scaler__', '')
        .str.replace('remainder__', '')
    )

print(df_to_train.shape, df_to_test.shape)


## Save preprocessed datasets

In [None]:
# Save datasets
# Both preprocessed frames go into one folder, named after the run timestamp.
# os.makedirs(exist_ok=True) replaces the separate existence check, and
# os.path.join keeps the paths portable.
datasets_dir = os.path.join("data", "14_Mouse_Model", "datasets")
os.makedirs(datasets_dir, exist_ok=True)
df_to_train.to_csv(os.path.join(datasets_dir, f"{dt_string}_train.csv"), index=False)
df_to_test.to_csv(os.path.join(datasets_dir, f"{dt_string}_test.csv"), index=False)

## Plots after preprocessing

In [None]:
# Descriptive statistics of the training features after preprocessing.
# The output path is relative to the working directory (like the other data
# paths of this notebook) and assembled with os.path.join, which also avoids
# backslash escape sequences inside the string literal.
# NOTE(review): calculate_descriptive_stats is a project helper; presumably it
# writes the workbook to the given path - confirm in helpers.
stats_after_path = os.path.join("data", "14_Mouse_Model", "stats", "mouse_after_preprocessing_df_to_train.xlsx")
os.makedirs(os.path.dirname(stats_after_path), exist_ok=True)
calculate_descriptive_stats('indicator_fg', features, df_to_train, stats_after_path)

In [None]:
# Same project plotting helper as before preprocessing, now on the resampled and scaled training frame.
# NOTE(review): the meaning of the trailing True is defined in the helper - confirm there.
show_box_boxwithout_hist('indicator_fg', features, df_to_train, True)

In [None]:
# Correlation matrix (DataFrame.corr defaults) of all features plus the label
# after preprocessing, drawn as an annotated heatmap on a 120x96 inch figure.
corr_columns = features + ['indicator_fg']
df_corr = df_to_train[corr_columns].corr()

fig, ax = plt.subplots(figsize=(120, 96))
sns.heatmap(df_corr, ax=ax, annot=True, fmt=".3f")

## T-Test and U-Test

In [None]:
# Column names of the training frame minus the listed identifier/label names.
# NOTE(review): in the visible notebook this list is only printed; the tests
# in the next cell iterate over `features` instead.
excluded_names = ("respondent", "elaboration", "variant", "indicator_fg")
feature_names = [column for column in df_to_train.columns if column not in excluded_names]
print(len(feature_names))
print(feature_names)

In [None]:
# `results` is handed to test_feature on every call (the next cell turns it into
# a table); features for which test_feature returns a truthy value are kept.
results = []
statistical_tests_selected_features = []

for feature_name in features:
    logging.info(f'++++++++++Test for {feature_name}++++++++++')
    is_selected = test_feature(df_to_train, feature_name, results, logging, ignore_power=False)
    if is_selected:
        statistical_tests_selected_features.append(feature_name)

print(statistical_tests_selected_features)

In [None]:
# Tabulate the collected test results and number the rows from 1.
test_result_columns = ['Feature', 'T-test statistic', 'T-test p-value', 'U-test statistic', 'U-test p-value', 'Power', 'Selected']
test_results = pd.DataFrame(results, columns=test_result_columns)
relevant_test_results = test_results[test_result_columns]
relevant_test_results.index = np.arange(1, len(relevant_test_results) + 1)
relevant_test_results

In [None]:
# Only the rows whose Selected flag is True.
relevant_test_results[relevant_test_results["Selected"] == True]

## Select statistically significant features

In [None]:
# Display the training frame before it is narrowed to the selected features.
df_to_train

In [None]:
# Keep the identifier/label columns plus the features kept by the statistical tests.
kept_columns = ["respondent", "page_name", "variant", "indicator_fg"] + statistical_tests_selected_features
df_to_train = df_to_train[kept_columns]

## Feature selection

The following code is inspired by the official documentation.

In [None]:
# Toggle for the L1-based (LinearSVC) feature selection below; when False the
# features chosen by the statistical tests are used instead.
lasso = True

In [None]:
if lasso:
    # Prefer the full feature list. If the statistical selection above removed
    # some of those columns, the lookup raises KeyError and every remaining
    # non-identifier column is used instead. Only KeyError is caught, so
    # unrelated failures are no longer silently swallowed.
    try:
        X_train_lasso = df_to_train[features]
    except KeyError:
        X_train_lasso = df_to_train.drop(["respondent", "page_name", "variant", "indicator_fg"], axis=1)
    y_train_lasso = df_to_train['indicator_fg']

In [None]:
if lasso:
    # L1-penalised linear SVM; SelectFromModel wraps the already fitted
    # estimator (prefit=True) and reduces X to the columns it keeps.
    lsvc = LinearSVC(C=0.03, penalty="l1", dual=False)
    lsvc.fit(X_train_lasso, y_train_lasso)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X_train_lasso)
    X_new.shape

In [None]:
# Start from an empty list so the cells below still work when `lasso` is False.
lasso_selected_features = []

In [None]:
if lasso:
    # Boolean support mask of the selector -> names of the kept columns
    support_mask = model.get_support()
    lasso_selected_features = list(X_train_lasso.columns[support_mask])
lasso_selected_features

In [None]:
# Number of features kept by the L1 selection (0 when `lasso` is False).
len(lasso_selected_features)

In [None]:
if lasso:
    # One row per candidate feature: its coefficient in the fitted LinearSVC
    # (first row of coef_) and whether the selector kept it. Rows are numbered from 1.
    export_lasso_df = pd.DataFrame({
        'Feature': list(X_train_lasso.columns),
        'Weight': lsvc.coef_[0].tolist(),
    })
    export_lasso_df['Selected'] = export_lasso_df['Feature'].isin(lasso_selected_features)
    export_lasso_df.index = np.arange(1, len(export_lasso_df) + 1)
    export_lasso_df

## Use selected features only

In [None]:
# Columns kept in both splits: the chosen features followed by the label.
if lasso:
    final_columns = lasso_selected_features + ["indicator_fg"]
else:
    final_columns = statistical_tests_selected_features + ["indicator_fg"]

df_to_test = df_to_test[final_columns]
df_to_train = df_to_train[final_columns]

print(len(df_to_train), len(df_to_test))

In [None]:
# String form of the final column list, as it is written to file in the next section.
str(df_to_test.columns.to_list())

## Save selected features

In [None]:
# Save selected columns to file
# The column lists are written as plain Python assignments into a file named
# after the run timestamp. os.makedirs(exist_ok=True) replaces the separate
# existence check, and os.path.join keeps the path portable.
selected_columns_dir = os.path.join("data", "14_Mouse_Model", "selected_columns")
os.makedirs(selected_columns_dir, exist_ok=True)
with open(os.path.join(selected_columns_dir, f"{dt_string}.py"), "w") as f:
    f.write("\n".join([f"df_to_test_cols = {str(df_to_test.columns.to_list())}", f"df_to_train_cols = {str(df_to_train.columns.to_list())}"]))

## Shuffle

In [None]:
# Shuffle the data
# The notebook seed is passed explicitly so the row order - and everything
# trained on it - is the same on every run.
df_to_train = df_to_train.sample(frac=1, random_state=random_state).reset_index(drop=True)
df_to_test = df_to_test.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
# Separate the feature matrices from the integer label vectors.
label_column = "indicator_fg"
X_train = df_to_train.drop(columns=[label_column]).reset_index(drop=True)
X_test = df_to_test.drop(columns=[label_column]).reset_index(drop=True)
y_train = df_to_train[label_column].astype(int).reset_index(drop=True)
y_test = df_to_test[label_column].astype(int).reset_index(drop=True)

In [None]:
# Number of indicators with value 1 in each dataset
# (value_counts lists the count of every label value, train first, then test).
print(y_train.value_counts())
print(y_test.value_counts())

## Controlling

In [None]:
# Toggle for the calculate_shap cells that follow the model sections.
shap_plots = True

In [None]:
def path_generator(model, timestamp=None):
    """Return the .joblib path for ``model`` and make sure its folder exists.

    Parameters
    ----------
    model : str
        Name of the sub-folder under ``data/14_Mouse_Model/models``
        (e.g. ``"decision_tree"``).
    timestamp : str, optional
        File stem of the returned path. Defaults to the notebook-level
        ``dt_string``, which is the behaviour existing callers rely on.

    Returns
    -------
    str
        ``data/14_Mouse_Model/models/<model>/<timestamp>.joblib``, joined with
        the platform's path separator.
    """
    if timestamp is None:
        timestamp = dt_string

    # makedirs creates all intermediate folders in one call; exist_ok makes
    # repeated calls for the same model harmless.
    dir_path = os.path.join("data", "14_Mouse_Model", "models", str(model))
    os.makedirs(dir_path, exist_ok=True)

    return os.path.join(dir_path, f"{timestamp}.joblib")

In [None]:
# Accumulator for the per-model reports; every model section below replaces it
# with the value returned by add_to_global_report.
global_report = None

## Decision Tree

The following function is taken from my project developed for the course Intelligent Data Analysis 2021/2022.

In [None]:
# Hyper-parameter candidates for the decision tree
decision_tree_param_grid = dict(
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

clf = DecisionTreeClassifier(random_state=random_state)

# model_training / add_to_global_report are project helpers; the call hands over
# the estimator, both splits, the logger, the model file path and the grid.
clf1, best_params1, train_report1, test_report1 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("decision_tree"), decision_tree_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(global_report, train_report1, test_report1, "decision_tree", best_params1)

In [None]:
# Rank the features by the importance reported by the decision tree (highest first)
importances = clf1.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for rank in range(1, X_train.shape[1] + 1):
    column_position = indices[rank - 1]
    print(f"{rank}. feature {X_train.columns[column_position]} ({importances[column_position]})")

In [None]:
# Optional SHAP analysis of the decision tree returned by model_training.
# calculate_shap is a project helper; the tree / pos_class flags are interpreted there.
if shap_plots:
    calculate_shap(clf1, X_train, X_test, tree=True, pos_class=True)

## Random Forest

The following function is taken from my project developed for the course Intelligent Data Analysis 2021/2022.

In [None]:
# Hyper-parameter candidates for the random forest
random_forest_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_features=[None, 'sqrt', 'log2'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

clf = RandomForestClassifier(random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf2, best_params2, train_report2, test_report2 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("random_forest"), random_forest_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(global_report, train_report2, test_report2, "random_forest", best_params2)

In [None]:
# Rank the features by the importance reported by the random forest (highest first)
importances = clf2.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for rank in range(1, X_train.shape[1] + 1):
    column_position = indices[rank - 1]
    print(f"{rank}. feature {X_train.columns[column_position]} ({importances[column_position]})")

In [None]:
# Optional SHAP analysis of the random forest returned by model_training.
# calculate_shap is a project helper; the tree / pos_class flags are interpreted there.
if shap_plots:
    calculate_shap(clf2, X_train, X_test, tree=True, pos_class=True)

## SVM

In [None]:
# Hyper-parameter candidates for the linear-kernel SVM
linear_svm_param_grid = dict(C=[0.1, 1, 10, 100])

clf = svm.SVC(kernel='linear', random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf3_a, best_params3_a, train_report3_a, test_report3_a = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("linear_svm"), linear_svm_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4, zero_division=0,
)
global_report = add_to_global_report(global_report, train_report3_a, test_report3_a, "linear_svm", best_params3_a)

In [None]:
# Optional SHAP analysis of the linear SVM returned by model_training
# (calculate_shap is a project helper, called here with its default flags).
if shap_plots:
    calculate_shap(clf3_a, X_train, X_test)

In [None]:
# Hyper-parameter candidates for the polynomial-kernel SVM
poly_svm_param_grid = dict(
    degree=[2, 3, 4],
    coef0=[0, 1, 10],
)

clf = svm.SVC(kernel='poly', random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf3_b, best_params3_b, train_report3_b, test_report3_b = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("poly_svm"), poly_svm_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4, zero_division=0,
)
global_report = add_to_global_report(global_report, train_report3_b, test_report3_b, "poly_svm", best_params3_b)

In [None]:
# Hyper-parameter candidates for the RBF-kernel SVM
rbf_svm_param_grid = dict(gamma=['scale', 'auto', 0.01, 0.1, 1])

clf = svm.SVC(kernel='rbf', random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf3_c, best_params3_c, train_report3_c, test_report3_c = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("rbf_svm"), rbf_svm_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4, zero_division=0,
)
global_report = add_to_global_report(global_report, train_report3_c, test_report3_c, "rbf_svm", best_params3_c)

In [None]:
# Hyper-parameter candidates for the sigmoid-kernel SVM
sigmoid_svm_param_grid = dict(
    gamma=['scale', 'auto', 0.01, 0.1, 1],
    coef0=[0, 1, 10],
)

clf = svm.SVC(kernel='sigmoid', random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf3_d, best_params3_d, train_report3_d, test_report3_d = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("sigmoid_svm"), sigmoid_svm_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4, zero_division=0,
)
global_report = add_to_global_report(global_report, train_report3_d, test_report3_d, "sigmoid_svm", best_params3_d)

## Gradient Boosting

In [None]:
# Hyper-parameter candidates for gradient boosting
gradient_boosting_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

clf = GradientBoostingClassifier(random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf4, best_params4, train_report4, test_report4 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("gradient_boosting"), gradient_boosting_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(global_report, train_report4, test_report4, "gradient_boosting", best_params4)

In [None]:
# Rank the features by the importance reported by gradient boosting (highest first)
importances = clf4.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for rank in range(1, X_train.shape[1] + 1):
    column_position = indices[rank - 1]
    print(f"{rank}. feature {X_train.columns[column_position]} ({importances[column_position]})")

In [None]:
# Optional SHAP analysis of the gradient boosting model returned by model_training.
# calculate_shap is a project helper; the tree flag is interpreted there.
if shap_plots:
    calculate_shap(clf4, X_train, X_test, tree=True)

## Logistic Regression

In [None]:
# Hyper-parameter candidates for logistic regression.
# "No regularisation" is spelled as the Python value None: the string 'None' is
# not an accepted `penalty` and would make every such candidate fail to fit.
# NOTE(review): not every solver/penalty pair is supported by scikit-learn
# (e.g. lbfgs with l1, or elasticnet without l1_ratio); how such candidates are
# treated depends on model_training - confirm there.
logistic_regression_param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': [None, 'l2', 'l1', 'elasticnet']
}

clf = LogisticRegression(max_iter=2000000, random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf5, best_params5, train_report5, test_report5 = model_training(clf, X_train, X_test, y_train, y_test, logging, path_generator("logistic_regression"), logistic_regression_param_grid, driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4)
global_report = add_to_global_report(global_report, train_report5, test_report5, "logistic_regression", best_params5)

In [None]:
# Optional SHAP analysis of the logistic regression returned by model_training
# (calculate_shap is a project helper, called here with its default flags).
if shap_plots:
    calculate_shap(clf5, X_train, X_test)

## XGBoost 

In [None]:
# Hyper-parameter candidates for XGBoost
xgboost_param_grid = dict(
    n_estimators=[100, 200, 300, 800],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    objective=['binary:hinge', 'binary:logistic', 'binary:logitraw'],
)

clf = xgb.XGBClassifier(objective='binary:hinge', random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf6, best_params6, train_report6, test_report6 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("xgboost"), xgboost_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(global_report, train_report6, test_report6, "xgboost", best_params6)

In [None]:
# Optional SHAP analysis of the XGBoost model returned by model_training.
# calculate_shap is a project helper; the tree flag is interpreted there.
if shap_plots:
    calculate_shap(clf6, X_train, X_test, tree=True)

## Balanced Random Forest

In [None]:
# Hyper-parameter candidates for the balanced random forest
balanced_random_forest_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

clf = BalancedRandomForestClassifier(random_state=random_state)

# Train through the project helper and append the reports to the global report.
clf7, best_params7, train_report7, test_report7 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("balanced_random_forest"), balanced_random_forest_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(global_report, train_report7, test_report7, "balanced_random_forest", best_params7)

In [None]:
# Rank the features by the importance reported by the balanced random forest (highest first)
importances = clf7.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for rank in range(1, X_train.shape[1] + 1):
    column_position = indices[rank - 1]
    print(f"{rank}. feature {X_train.columns[column_position]} ({importances[column_position]})")

In [None]:
# Optional SHAP analysis of the balanced random forest returned by model_training.
# calculate_shap is a project helper; the tree / pos_class flags are interpreted there.
if shap_plots:
    calculate_shap(clf7, X_train, X_test, tree=True, pos_class=True)

## Balanced Bagging Classifier

In [None]:
# Hyper-parameter candidates for the balanced bagging classifier
balanced_bagging_classifier_param_grid = dict(
    n_estimators=[10, 50, 100],
    max_samples=[0.5, 0.7, 1.0],
    max_features=[0.5, 0.7, 1.0],
)

# Every constructor argument is spelled out explicitly.
clf = BalancedBaggingClassifier(
    random_state=random_state,
    estimator=None,
    n_estimators=10,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    oob_score=False,
    warm_start=False,
    sampling_strategy='auto',
    replacement=False,
    n_jobs=None,
    verbose=0,
    sampler=None,
)

# Train through the project helper and append the reports to the global report.
clf8, best_params8, train_report8, test_report8 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("balanced_bagging_classifier"), balanced_bagging_classifier_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(global_report, train_report8, test_report8, "balanced_bagging_classifier", best_params8)

In [None]:
# Feature names recorded on the bagging model returned by model_training.
clf8.feature_names_in_

## Report

In [None]:
# Display the report accumulated over all model sections.
global_report

In [None]:
# Save global report
# os.makedirs(exist_ok=True) replaces the separate existence check, and
# os.path.join keeps the path portable.
report_dir = os.path.join("data", "14_Mouse_Model", "report")
os.makedirs(report_dir, exist_ok=True)
path_to_save = os.path.join(report_dir, f"{dt_string}.csv")
# The index is copied into a regular "metric" column because the file is
# written with index=False; the separator is a semicolon.
global_report["metric"] = global_report.index
global_report.to_csv(path_to_save, index=False, sep=";")