# Libraries

In [None]:
# Importing relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Sampler
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler

# modelling packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# model evaluation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

# Suppress warnings for cleaner output
# NOTE(review): a blanket "ignore" filter silences every warning category for
# the rest of the session, including any convergence or deprecation warnings
# raised by the libraries imported above. Consider narrowing it to specific
# categories once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")

# EDA 

## Explore Categorical Features

## Explore Numerical Features

## Explore Target Variable

## Relationship between Features and Target

# Data Engineering

## Explore and manage missing values

## Train Test Split

## Data Preprocessing

### Sampling Method 1

### Sampling Method 2

### Sampling Method 3

### Sampling Method 4

# Modelling

## Model Building

### Common functions for reproducibility

### Base model

In [None]:
# Baseline estimators; both use the same fixed seed so reruns are repeatable.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)

### Models with sampling method 1

### Models with sampling method 2

### Models with sampling method 3

### Models with sampling method 4

This final sampling method is a hybrid approach: it first oversamples the minority class with SMOTE-NC and then applies random undersampling (RUS) to the majority class.

Reference: https://www.mdpi.com/2078-2489/14/1/54

#### SMOTE-NC

In [None]:
# SMOTE-NC must be told which columns are categorical, by positional index.
# NOTE(review): X and y are defined outside this cell; confirm they hold only
# the training split so synthetic samples cannot leak into evaluation data.
categorical_features = X.select_dtypes(
    include=['object', 'category']
).columns
categorical_indices = list(map(X.columns.get_loc, categorical_features))

# Target minority-to-majority ratio after oversampling
# (0.8 -> minority class ends up at 80% of the majority class count).
desired_ratio = 0.8

# Configure the oversampler; the fixed seed keeps the synthetic rows repeatable.
smote_nc = SMOTENC(
    categorical_features=categorical_indices,
    sampling_strategy=desired_ratio,
    k_neighbors=5,
    random_state=42,
)

# Generate the oversampled dataset
X_resampled, y_resampled = smote_nc.fit_resample(X, y)

#### RUS

In [None]:
# Random undersampling: with the 'auto' strategy the majority class is cut
# down until it matches the minority class count.
rus = RandomUnderSampler(
    random_state=42,
    sampling_strategy='auto',
)

# Undersample the SMOTE-NC output to obtain the final hybrid dataset
X_hybrid, y_hybrid = rus.fit_resample(X_resampled, y_resampled)


### Visualize the differences

In [None]:
# One panel per stage of the hybrid resampling pipeline, drawn left to right.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))

# (target series, panel title) for each stage, in plotting order
_stage_plots = (
    (y, 'Original y Distribution'),
    (y_resampled, 'y Distribution after SMOTE-NC'),
    (y_hybrid, 'y Distribution after Undersampling'),
)

for _panel, (_target, _heading) in zip(ax, _stage_plots):
    sns.countplot(x=_target, ax=_panel)
    _panel.set_title(_heading)

plt.tight_layout()
plt.show()


## Model Validation and Hyperparameter Tuning

## Model Evaluation

# Implications

# 