# Libraries

In [2]:
# Importing relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Sampler
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler

# modelling packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# model evaluation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# EDA 

## Explore Categorical Features

## Explore Numerical Features

## Explore Target Variable

## Relationship between Features and Target

# Data Engineering

## Explore and manage missing values

## Train Test Split

In [None]:
# Split the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_clean, y_clean, stratify=y_clean, test_size=0.2, random_state=42
)


## Data Preprocessing

In [None]:
# Identify categorical columns (indices)
categorical_features = X.select_dtypes(include=['object', 'category']).columns
categorical_indices = [X.columns.get_loc(col) for col in categorical_features]

In [None]:
preprocessing = ColumnTransformer(transformers=[
    ('num', StandardScaler(), NumCols), # Numerical columns
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), CatCols) # Categorical columns
], remainder='passthrough') # Keep any other columns unchanged

# Fit-transform training data (transform validation data later)
X_train_processed = preprocessor.fit_transform(X_train)

### Sampling Method 1 - SMOTE-NC

In [None]:
# Initialize SMOTE-NC
smote_nc = SMOTENC(categorical_features=categorical_indices, sampling_strategy='auto', k_neighbors=5, random_state=42)

# Fit and resample
X_train_smote_nc, y_smote_nc = smote_nc.fit_resample(X_train, y_train)

### Sampling Method 2 - SMOTE

In [None]:
# Initialize SMOTE
smote = SMOTENC( sampling_strategy='auto', k_neighbors=5, random_state=42)

# Apply SMOTE on processed numeric + one-hot encoded data
X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y_train)

### Sampling Method 3 - Random Under Sampler

In [None]:
# Initialize RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# Apply under-sampling on processed numeric + one-hot encoded data
X_train_rus, y_train_rus = rus.fit_resample(X_train_processed, y_train)

In [3]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

def plot_2d_pca(X_list, y_list, titles, sample_size=1000):
    plt.figure(figsize=(16, 4 * len(X_list)))

    for i, (X, y, title) in enumerate(zip(X_list, y_list, titles), 1):
        # Convert sparse to dense if needed
        if hasattr(X, "toarray"):
            X = X.toarray()

        # Subsample for speed
        if X.shape[0] > sample_size:
            idx = np.random.choice(X.shape[0], sample_size, replace=False)
            X = X[idx]
            y = y[idx]

        # Scale before PCA
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        pca = PCA(n_components=2, random_state=42)
        X_pca = pca.fit_transform(X_scaled)

        plt.subplot(len(X_list), 1, i)
        for cls in np.unique(y):
            plt.scatter(X_pca[y == cls, 0], X_pca[y == cls, 1], label=f"Class {cls}", alpha=0.6, edgecolors='w', s=30)
        plt.title(title)
        plt.xlabel("PCA Component 1")
        plt.ylabel("PCA Component 2")
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

# Extract numeric columns only for PCA on raw data (for smote_nc and original)
X_train_numeric = X_train[NumCols].values

# For smote_nc output (raw data), select numeric cols for PCA
X_train_smote_nc_numeric = X_train_smote_nc[NumCols].values if hasattr(X_train_smote_nc, 'iloc') else X_train_smote_nc[:, :len(NumCols)]

# Prepare data for plotting
X_list = [
    X_train_numeric,
    X_train_processed,
    X_train_smote_nc_numeric,
    X_train_rus
]

y_list = [
    y_train,
    y_train,
    y_smote_nc,
    y_train_rus
]

titles = [
    "Original Training Data (Numeric Features Only)",
    "Processed Training Data (Numeric + One-Hot)",
    "After SMOTENC (Raw Data, Numeric Features Only for PCA)",
    "After RandomUnderSampler (Processed Numeric + One-Hot)"
]

plot_2d_pca(X_list, y_list, titles)


NameError: name 'X_train' is not defined

### Sampling Method 4

# Modelling

## Model Building

### Common functions for reproducibility

### Base model

In [None]:
dt_model = DecisionTreeClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42, max_iter=1000)

### Models with sampling method 1

### Models with sampling method 2

### Models with sampling method 3

### Models with sampling method 4

This final sampling method is a hybrid method of sampling which combines SMOTE-NC and RUS

https://www.mdpi.com/2078-2489/14/1/54

#### SMOTE-NC

In [None]:
# Identify categorical columns (indices)
categorical_features = X.select_dtypes(include=['object', 'category']).columns
categorical_indices = [X.columns.get_loc(col) for col in categorical_features]

# Initialize SMOTE-NC
desired_ratio = 0.8  # example: make minority class 80% of majority
smote_nc = SMOTENC(categorical_features=categorical_indices, sampling_strategy=desired_ratio, k_neighbors=5, random_state=42)

# Fit and resample
X_resampled, y_resampled = smote_nc.fit_resample(X, y)

#### RUS

In [None]:
# Initialize RUS to match minority class count
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# Apply undersampling
X_hybrid, y_hybrid = rus.fit_resample(X_resampled, y_resampled)


### Visualize the differences

In [None]:
fig, ax = plt.subplots(figsize=(10, 6), ncols=3, nrows=1)

# Original y distribution
sns.countplot(x=y, ax=ax[0])
ax[0].set_title('Original y Distribution')

# Resampled y distribution after SMOTE-NC
sns.countplot(x=y_resampled, ax=ax[1])
ax[1].set_title('y Distribution after SMOTE-NC')

# Final y distribution after undersampling
sns.countplot(x=y_hybrid, ax=ax[2])
ax[2].set_title('y Distribution after Undersampling')
plt.tight_layout()
plt.show()


## Model Validation and Hyperparameter Tuning

## Model Evaluation

# Implications

In [1]:
test

NameError: name 'test' is not defined

# 