# Data Preparation and Preprocessing

## 1. Import Dependencies

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

## 2. Load Raw Data

In [None]:
# Read the raw CSV into a DataFrame, then report its shape and column names.
raw_data_path = 'data/raw/dataset.csv'
df = pd.read_csv(raw_data_path)

print("Successfully loaded dataset. Shape: ", df.shape)
print("Columns: ", list(df.columns))

## 3. Feature Engineering and Preprocessing

### 3.1. Define Feature Categories

In [None]:
# Column groups, listed in the order the ColumnTransformer consumes them.

# Scaled after median imputation.
numerical_features = ['Age', 'Tenure', 'Balance', 'EstimatedSalary']

# One-hot encoded.
nominal_features = ['Gender', 'Geography']

# Ordinal encoded.
# NOTE(review): 'CreditScoreBins' is presumably produced by an earlier binning
# step; confirm it is present in data/raw/dataset.csv.
ordinal_features = ['CreditScoreBins']

# Not passed through the preprocessor; re-attached unchanged to the final frame.
remainder_features = ['NumOfProducts', 'HasCrCard', 'IsActiveMember']

### 3.2. Create Preprocessing Pipelines

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])


def _impute_then_encode(encoder):
    """Return a pipeline that fills gaps with the literal 'missing', then applies *encoder*."""
    constant_imputer = SimpleImputer(strategy='constant', fill_value='missing')
    return Pipeline([
        ('imputer', constant_imputer),
        ('encoder', encoder),
    ])


# Nominal columns: one-hot encode; categories unseen at fit time are ignored.
nominal_transformer = _impute_then_encode(OneHotEncoder(handle_unknown='ignore'))

# Ordinal columns: map each category to an integer code.
# NOTE(review): no explicit `categories=` order is given, so the codes follow
# whatever order the encoder infers from the data -- confirm that this matches
# the intended ranking of the bins.
ordinal_transformer = _impute_then_encode(OrdinalEncoder())

### 3.3. Combine Pipelines with ColumnTransformer

In [None]:
# Route each column group to its pipeline. Output columns are stacked in the
# order listed here: numeric, then one-hot nominal, then ordinal.
column_pipelines = [
    ('num', numerical_transformer, numerical_features),
    ('nom', nominal_transformer, nominal_features),
    ('ord', ordinal_transformer, ordinal_features),
]

# Columns not named above are dropped from the transformer output.
preprocessor = ColumnTransformer(transformers=column_pipelines, remainder='drop')

### 3.4. Apply Transformations and Create Final DataFrame

In [None]:
# Fit the preprocessor on a copy of the raw frame and transform it in one pass.
# NOTE(review): the fit uses every row, i.e. it happens before the train/test
# split in section 4, so imputation and scaling statistics include test rows.
df_cp = df.copy()
transformed_data = preprocessor.fit_transform(df_cp)

# ColumnTransformer can return a scipy sparse matrix when the stacked output is
# sparse enough; pd.DataFrame below needs a dense 2-D array.
if hasattr(transformed_data, 'toarray'):
    transformed_data = transformed_data.toarray()

# Get feature names after one-hot encoding
nominal_feature_name = preprocessor.named_transformers_['nom']['encoder'].get_feature_names_out(nominal_features)

# Column order mirrors the transformer order: numeric, one-hot nominal, ordinal.
# Reusing the source index keeps the axis=1 concat below row-aligned even if
# `df` does not carry a default RangeIndex.
df_transformed = pd.DataFrame(
    transformed_data,
    columns=numerical_features + list(nominal_feature_name) + ordinal_features,
    index=df_cp.index,
)

# Columns that bypass the preprocessor are attached unchanged.
df_remainder = df_cp[remainder_features]

# Final layout: transformed features, untouched features, then the target.
df_final = pd.concat(
    [df_transformed, df_remainder, df_cp['Exited']],
    axis=1
)

print("Final preprocessed DataFrame head:")
display(df_final.head())

### 3.5. Save Preprocessed Data

In [None]:
# Persist the preprocessed frame (features plus the 'Exited' target) as CSV.
processed_csv_path = Path('data/processed/x_transformed.csv')

# to_csv does not create missing directories, so make sure the folder exists.
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)

df_final.to_csv(processed_csv_path, index=False)
print("Preprocessed data saved to 'data/processed/x_transformed.csv'")

## 4. Handle Class Imbalance using SMOTE

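First, we split the data into training and testing sets. Then, we apply SMOTE only to the training data, so that no synthetic samples end up in the test set. Note that the preprocessor in Section 3.4 was fitted on the full dataset before this split, so the imputation and scaling statistics still include information from the test rows.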

In [None]:
# Separate the target column from the features.
Y = df_final['Exited']
X = df_final.drop('Exited', axis=1)

# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class ratio the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [None]:
# Report the class balance of the training split before resampling.
original_counts = Y_train.value_counts()
print("Original class distribution in training data:")
print(original_counts)

# Oversample the training split only; the test split is left untouched.
# NOTE(review): SMOTE interpolates between neighbours on every column, so the
# one-hot and binary columns can take fractional values in synthetic rows --
# confirm this is acceptable, or consider imblearn's SMOTENC for mixed data.
smote = SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(X=X_train, y=Y_train)

# Report the class balance after resampling.
resampled_counts = pd.Series(Y_train_resampled).value_counts()
print("\nResampled class distribution in training data:")
print(resampled_counts)

### 4.1. Visualize Class Distribution

In [None]:
# One bar chart per label set, side by side: training labels before SMOTE,
# training labels after SMOTE, and the untouched test labels.
panels = [
    (Y_train, ['skyblue', 'salmon'], 'Class Distribution Before SMOTE'),
    (pd.Series(Y_train_resampled), ['skyblue', 'salmon'], 'Class Distribution After SMOTE'),
    (Y_test, ['lightgreen', 'gold'], 'Class Distribution in Test Set'),
]

plt.figure(figsize=(18, 5))

for position, (labels, palette, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    labels.value_counts().plot(kind='bar', color=palette)
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.title(heading)
    # Keep the class labels horizontal.
    plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

## 5. Save Processed Data Artifacts

In [None]:
# Write the resampled training split and the untouched test split to .npz
# files. Each file holds one positional array, stored under numpy's default
# key 'arr_0'.

# np.savez does not create missing directories, so make sure the folder exists.
Path('artifacts').mkdir(parents=True, exist_ok=True)

# Convert everything to plain ndarrays the same way, whether the object is a
# DataFrame, a Series, or already an array.
np.savez('artifacts/X_train.npz', np.asarray(X_train_resampled))
np.savez('artifacts/Y_train.npz', np.asarray(Y_train_resampled))
np.savez('artifacts/X_test.npz', np.asarray(X_test))
np.savez('artifacts/Y_test.npz', np.asarray(Y_test))

print("Saved processed data artifacts to the 'artifacts/' directory.")
print(f"X_train_resampled shape: {X_train_resampled.shape}")
print(f"Y_train_resampled shape: {Y_train_resampled.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_test shape: {Y_test.shape}")