# Step 0: Data Preparation and Preprocessing (Fixed)

## 1. Import Dependencies

First, we need to load all the tools (libraries) we need for our project.

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

## 2. Load Raw Data

Here, we load our customer dataset from the file.

In [None]:
# Read the raw customer table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/raw/dataset.csv')

# Confirm the load, report the frame's dimensions, then preview the top rows.
print(
    "Successfully loaded dataset.",
    f"Shape: {df.shape}",
    "First 5 rows:",
    sep="\n",
)
display(df.head())

## 3. Feature Engineering and Preprocessing

This is where we clean and prepare our data for the model.

### Mnemonic: **P-P-T** (Pipelines, Preprocessor, Transform)
*   **P** - **Pipelines**: Create separate cleaning steps for different types of data (numbers, categories).
*   **P** - **Preprocessor**: Combine all the pipelines into one single tool.
*   **T** - **Transform**: Use the preprocessor to clean the entire dataset.

### 3.1. Define Feature Categories

We group our columns based on their data type.

In [None]:
# Column groups, as specified by the product requirements.
# Each group is routed to a different preprocessing step further down.
numerical_features = ['Age', 'Tenure', 'Balance', 'EstimatedSalary']   # imputed + scaled
nominal_features = ['Gender', 'Geography']                             # one-hot encoded
ordinal_features = ['CreditScoreBins']                                 # ordinal encoded
remainder_features = ['NumOfProducts', 'HasCrCard', 'IsActiveMember']  # passed through unchanged
target_variable = 'Exited'                                             # label column

# Echo the grouping so the cell output documents the configuration.
print("\n".join(
    f"{label}: {value}"
    for label, value in (
        ("Numerical features", numerical_features),
        ("Nominal features", nominal_features),
        ("Ordinal features", ordinal_features),
        ("Remainder features", remainder_features),
        ("Target variable", target_variable),
    )
))

### 3.2. Create Preprocessing Pipelines

We create a set of steps (a "pipeline") for each data type to handle missing values and scale the data correctly.

In [None]:
# One preprocessing pipeline per feature type.
# The step names ('imputer', 'encoder', 'scaler') are looked up by name later
# in the notebook, so they must stay stable.

# Numbers: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Unordered categories: fill gaps with the literal 'missing', then one-hot
# encode, dropping the first level of each feature.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Ordered categories: fill gaps with the literal 'missing', then map each level
# to an integer; levels not seen during fit are encoded as -1.
# NOTE(review): no `categories=` order is given, so the encoder derives the
# level order itself rather than from the bins' intended ranking -- confirm
# that this matches the meaning of CreditScoreBins.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

print("✅ Preprocessing pipelines created successfully")

### 3.3. Combine Pipelines with ColumnTransformer

Now, we combine all our pipelines into a single, powerful preprocessor.

In [None]:
# Create the main preprocessor using ColumnTransformer.
#
# Every column group is routed explicitly, including the pass-through group.
# `remainder='drop'` discards anything that is not listed, so columns that are
# not declared as features -- most importantly the target variable, but also
# any other column present in the raw frame -- can never end up in the
# transformed feature matrix.
#
# Output column order follows the transformer list:
#   numerical -> nominal (one-hot) -> ordinal -> pass-through
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('nom', nominal_transformer, nominal_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('rem', 'passthrough', remainder_features),  # Keep these features as-is
    ],
    remainder='drop'  # Never forward undeclared columns (e.g. the target)
)

print("✅ ColumnTransformer created successfully")

### 3.4. Apply Transformations and Create Final DataFrame

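We use our preprocessor to transform the data and create a final, clean DataFrame. Note that the preprocessor is fitted on the full dataset here, before the train/test split in Section 4, so the imputation and scaling statistics are computed from rows that later end up in the test set.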

In [None]:
# 🛡️ Step 1: Create safe copy of original data
print("📋 Creating safe copy of data...")
df_copy = df.copy()

# Only the declared feature columns are handed to the preprocessor. Passing the
# whole frame would allow a pass-through remainder to forward every undeclared
# column -- including the target -- into the feature matrix.
feature_columns = (
    numerical_features + nominal_features + ordinal_features + remainder_features
)

# 🔧 Step 2: Apply all preprocessing transformations
# NOTE(review): the preprocessor is fitted on every row, i.e. before the
# train/test split performed later in the notebook, so imputation/scaling
# statistics include future test rows. Fitting on the training split only
# would avoid this -- confirm whether that matters for this project.
print("⚙️ Applying preprocessing pipeline...")
transformed_array = preprocessor.fit_transform(df_copy[feature_columns])
# ColumnTransformer can return a sparse matrix when the stacked output is
# sparse enough; densify so the DataFrame construction below always works.
if hasattr(transformed_array, 'toarray'):
    transformed_array = transformed_array.toarray()
print(f"✅ Transformed shape: {transformed_array.shape}")

# 🏷️ Step 3: Reconstruct column names (sklearn loses them)
print("🏷️ Reconstructing column names...")

# Positional names used whenever the real names cannot be reconstructed.
generic_feature_names = [f'feature_{i}' for i in range(transformed_array.shape[1])]

try:
    # Get expanded names for one-hot encoded features
    # (one name per encoded level that the encoder kept).
    nominal_expanded_names = list(
        preprocessor.named_transformers_['nom']['encoder'].get_feature_names_out(nominal_features)
    )

    # The ColumnTransformer outputs columns in the order they were defined:
    # 1. numerical features
    # 2. nominal features (one-hot encoded, variable number)
    # 3. ordinal features
    # 4. pass-through (remainder) features
    final_feature_names = (
        numerical_features +      # Age, Tenure, Balance, EstimatedSalary
        nominal_expanded_names +  # <feature>_<level> for each kept level
        ordinal_features +        # CreditScoreBins
        remainder_features        # NumOfProducts, HasCrCard, IsActiveMember
    )

    print(f"📊 Transformed array shape: {transformed_array.shape}")
    print(f"📊 Total feature names: {len(final_feature_names)}")
    print(f"   Numerical: {len(numerical_features)} features")
    print(f"   Nominal (expanded): {len(nominal_expanded_names)} features")
    print(f"   Ordinal: {len(ordinal_features)} features")
    print(f"   Remainder: {len(remainder_features)} features")

    # Check if shapes match; a mismatch means the array holds columns that the
    # name list does not account for, so the real names cannot be trusted.
    if transformed_array.shape[1] != len(final_feature_names):
        print(f"⚠️  Shape mismatch detected!")
        print(f"   Array columns: {transformed_array.shape[1]}")
        print(f"   Feature names: {len(final_feature_names)}")

        # Use generic column names as fallback
        final_feature_names = generic_feature_names
        print(f"   Using generic names: {len(final_feature_names)} columns")

except (AttributeError, KeyError, TypeError, ValueError) as e:
    # Name lookup is best-effort (e.g. the installed sklearn may lack
    # get_feature_names_out); anything else is a real error and should surface.
    print(f"⚠️  Error in column naming: {e}")
    # Fallback to generic column names
    final_feature_names = generic_feature_names
    print(f"   Using generic names: {len(final_feature_names)} columns")

# 📊 Step 4: Convert back to DataFrame with proper column names
df_features = pd.DataFrame(transformed_array, columns=final_feature_names)

# 🎯 Step 5: Add target variable back
# The target index is reset so it aligns row-by-row with df_features' fresh
# 0..n-1 index instead of being matched on the original index labels.
df_final = pd.concat([
    df_features,
    df[target_variable].reset_index(drop=True)
], axis=1)

print("🎉 Preprocessing complete!")
print(f"📏 Final dataset shape: {df_final.shape}")
display(df_final.head())

## 4. Handle Class Imbalance using SMOTE

Our data has many more 'Not Churn' customers than 'Churn' customers. This can bias our model. We use **SMOTE** to create new, synthetic 'Churn' data points to balance things out.

### Mnemonic: **S-B-V** (Split, Balance, Visualize)
*   **S** - **Split**: First, split data into training and testing sets.
*   **B** - **Balance**: Apply SMOTE *only* to the training data.
*   **V** - **Visualize**: Create charts to see the data before and after balancing.

In [None]:
# Split the processed frame into the label column (Y) and everything else (X).
Y = df_final[target_variable]
X = df_final.drop(target_variable, axis=1)

# Report dimensions and how the label values are distributed.
print(f"Features shape: {X.shape}")
print(f"Target shape: {Y.shape}")
print("Target distribution:")
print(Y.value_counts())

In [None]:
# Hold out 20% of the rows for testing. `stratify=Y` keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print()
print("Class distribution before SMOTE:")
print(Y_train.value_counts())

In [None]:
# Oversample the minority class on the training split only; the test split is
# left untouched so evaluation reflects the real class balance.
smote = SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)

# Show the class counts and the training-set size after resampling.
print()
print("Class distribution after SMOTE:")
print(pd.Series(Y_train_resampled).value_counts())
print()
print(f"Resampled training set shape: {X_train_resampled.shape}")

### 4.1. Visualize Class Distribution

In [None]:
# Create visualization of class distribution before and after SMOTE
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('Class Distribution Comparison', fontsize=16)

# One (title, labels) pair per panel, left to right.
panels = [
    ('Before SMOTE (Training Set)', Y_train),
    ('After SMOTE (Training Set)', pd.Series(Y_train_resampled)),
]

for ax, (title, labels) in zip(axes, panels):
    # value_counts() orders by frequency; after SMOTE the counts tie, so the
    # class order (and therefore which class gets which colour) could differ
    # between panels. Sorting by class label pins class 0 to the first bar /
    # 'skyblue' and class 1 to the second bar / 'salmon' in both panels.
    class_counts = labels.value_counts().sort_index()
    class_counts.plot(kind='bar', ax=ax, color=['skyblue', 'salmon'])
    ax.set_title(title)
    ax.set_ylabel('Count')
    ax.set_xlabel('Class (0=Not Churned, 1=Churned)')
    ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

## 5. Save Processed Data

Let's save our processed data for use in subsequent notebooks.

In [None]:
# Make sure the output folder exists (no error if it is already there).
import os
os.makedirs('artifacts', exist_ok=True)

# Write each split to its own .npz archive. The data is passed positionally,
# so no explicit array name is chosen for it inside the archive.
for archive_path, payload in (
    ('artifacts/X_train.npz', X_train_resampled),
    ('artifacts/Y_train.npz', Y_train_resampled),
    ('artifacts/X_test.npz', X_test),
    ('artifacts/Y_test.npz', Y_test),
):
    np.savez(archive_path, payload)

# Summarise what was written.
print(
    "✅ Processed data saved to artifacts/",
    f"   X_train_resampled: {X_train_resampled.shape}",
    f"   Y_train_resampled: {Y_train_resampled.shape}",
    f"   X_test: {X_test.shape}",
    f"   Y_test: {Y_test.shape}",
    sep="\n",
)