In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so stochastic steps repeat across runs
np.random.seed(seed=42)

# Notebook-wide display defaults: never truncate DataFrame columns, use
# seaborn's white-grid theme, and default every figure to a 12x6 canvas
pd.options.display.max_columns = None
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("âœ… Libraries imported successfully!")


âœ… Libraries imported successfully!


In [20]:
# Read the raw Telco churn CSV into the working DataFrame `df`
raw_csv_path = '/Users/kashishpatel/Desktop/customer-churn-project/data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(raw_csv_path)

# Quick sanity report: dimensions and column names
print("âœ… Dataset loaded!")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:")

# Bare last expression so the notebook renders the preview as a rich table
df.head(n=5)


âœ… Dataset loaded!
Shape: (7043, 21)

Columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

First few rows:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [22]:
# TotalCharges is read as object (string) because some rows hold a blank
# instead of a number, so it must be converted before any arithmetic.
print("Checking TotalCharges column:")
print(f"Data type: {df['TotalCharges'].dtype}")
print(f"\nSample values:")
print(df['TotalCharges'].head(10))

# Count the blank entries first so the NaN count after conversion can be cross-checked
print(f"\nChecking for spaces or empty strings...")
print(f"Rows with spaces: {(df['TotalCharges'] == ' ').sum()}")

# Convert to numeric; anything that cannot be parsed becomes NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check missing values
print(f"\nMissing values after conversion: {df['TotalCharges'].isnull().sum()}")

# Customers with 0 tenure have not accumulated any charges yet
df.loc[df['tenure'] == 0, 'TotalCharges'] = 0

# Any remaining gap is estimated as MonthlyCharges * tenure.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['TotalCharges']: that chained in-place form
# is deprecated in pandas 2.x and does not update df under Copy-on-Write.
df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure'])

print(f"\nâœ… TotalCharges cleaned! Missing values: {df['TotalCharges'].isnull().sum()}")

Checking TotalCharges column:
Data type: object

Sample values:
0      29.85
1     1889.5
2     108.15
3    1840.75
4     151.65
5      820.5
6     1949.4
7      301.9
8    3046.05
9    3487.95
Name: TotalCharges, dtype: object

Checking for spaces or empty strings...
Rows with spaces: 11

Missing values after conversion: 11

âœ… TotalCharges cleaned! Missing values: 0


In [26]:
print("Creating basic engineered features...")

# 1. Tenure grouping (categorical from continuous)
# include_lowest=True closes the first bin on the left so that tenure == 0
# lands in '0-1 year'; with the default (0, 12] interval those rows get NaN.
df['tenure_group'] = pd.cut(df['tenure'],
                             bins=[0, 12, 24, 48, 72],
                             labels=['0-1 year', '1-2 years', '2-4 years', '4-6 years'],
                             include_lowest=True)

# 2. Average monthly spend
df['AvgMonthlySpend'] = df['TotalCharges'] / (df['tenure'] + 1)  # +1 to avoid division by zero

# 3. Charge ratio (how much of total is monthly vs accumulated)
df['ChargeRatio'] = df['MonthlyCharges'] / (df['TotalCharges'] + 1)  # +1 to avoid division by zero

# 4. Service count (how many services customer has)
service_cols = ['PhoneService', 'InternetService', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport',
                'StreamingTV', 'StreamingMovies']

# A service counts when its column says 'Yes'. InternetService stores the plan
# name instead of 'Yes', so a DSL or Fiber optic plan counts as one service.
internet_plans = ['DSL', 'Fiber optic']
df['TotalServices'] = (
    df[service_cols].eq('Yes').sum(axis=1)
    + df['InternetService'].isin(internet_plans).astype(int)
)

# 5. Has internet service (binary)
df['HasInternetService'] = (df['InternetService'] != 'No').astype(int)

# 6. Has phone service (binary)
df['HasPhoneService'] = (df['PhoneService'] == 'Yes').astype(int)

# 7. Paperless billing (binary)
df['IsPaperlessBilling'] = (df['PaperlessBilling'] == 'Yes').astype(int)

# 8. Senior citizen already binary (0/1)

# 9. Has partner or dependents
df['HasPartner'] = (df['Partner'] == 'Yes').astype(int)
df['HasDependents'] = (df['Dependents'] == 'Yes').astype(int)
df['FamilySize'] = df['HasPartner'] + df['HasDependents']

# 10. Contract type risk (month-to-month is risky)
df['IsMonthToMonth'] = (df['Contract'] == 'Month-to-month').astype(int)

# Columns added by this cell, in creation order. Listing them explicitly avoids
# re-reading the raw CSV from disk just to diff the column names.
basic_feature_cols = ['tenure_group', 'AvgMonthlySpend', 'ChargeRatio', 'TotalServices',
                      'HasInternetService', 'HasPhoneService', 'IsPaperlessBilling',
                      'HasPartner', 'HasDependents', 'FamilySize', 'IsMonthToMonth']

print("âœ… Basic features created!")
print(f"\nNew features: {basic_feature_cols}")
print(f"\nDataset shape now: {df.shape}")

Creating basic engineered features...
âœ… Basic features created!

New features: ['tenure_group', 'AvgMonthlySpend', 'ChargeRatio', 'TotalServices', 'HasInternetService', 'HasPhoneService', 'IsPaperlessBilling', 'HasPartner', 'HasDependents', 'FamilySize', 'IsMonthToMonth']

Dataset shape now: (7043, 32)


In [28]:
print("Creating ADVANCED RFM-style features...")

# RFM (Recency / Frequency / Monetary) adapted to telco subscription data.
# Columns are created in a fixed order because the model matrix built later
# inherits its column order from df.

# Shared denominator: tenure shifted by one so zero-tenure rows never divide by zero
tenure_plus_one = df['tenure'] + 1

# 1. RECENCY proxy: inverse of tenure (newer customers score higher)
df['RecencyScore'] = 1 / tenure_plus_one

# 2. MONETARY: total spend, plus a quartile tier derived from it
df['MonetaryValue'] = df['TotalCharges']
tier_labels = ['Low', 'Medium', 'High', 'VIP']
df['MonetaryTier'] = pd.qcut(df['MonetaryValue'], q=4, labels=tier_labels, duplicates='drop')

# 3. FREQUENCY proxy: services held per month of tenure
df['ServiceFrequency'] = df['TotalServices'] / tenure_plus_one

# 4. Customer Lifetime Value (CLV) estimate:
#    monthly charges times the months left until an assumed 30-month average
#    lifetime, floored at zero for customers already past that point
avg_tenure = 30  # months
months_remaining = (avg_tenure - df['tenure']).clip(lower=0)
df['EstimatedCLV'] = df['MonthlyCharges'] * months_remaining

# 5. Payment method risk flag (electronic check)
df['IsElectronicCheck'] = df['PaymentMethod'].eq('Electronic check').astype(int)

# 6. Service adoption rate: share of the 8 countable services the customer holds
max_services = 8  # total possible services
df['ServiceAdoptionRate'] = df['TotalServices'] / max_services

# 7. Monthly charges per service (+1 keeps zero-service rows finite)
df['ChargesPerService'] = df['MonthlyCharges'] / (df['TotalServices'] + 1)

# 8. Fiber optic flag
df['HasFiberOptic'] = df['InternetService'].eq('Fiber optic').astype(int)

# 9. No internet service flag
df['NoInternetService'] = df['InternetService'].eq('No').astype(int)

# 10. Full protection package: security, backup and device protection all 'Yes'
protection_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection']
df['HasFullProtection'] = df[protection_cols].eq('Yes').all(axis=1).astype(int)

print("âœ… Advanced RFM features created!")
print(f"\nTotal features now: {df.shape[1]}")

Creating ADVANCED RFM-style features...
âœ… Advanced RFM features created!

Total features now: 43


In [34]:
print("Creating SYNTHETIC TEMPORAL DATA for real-time simulation...")
print("This simulates customer behavior events over time.\n")

# We'll create a time-series of events for each customer
# This allows us to build "real-time" features later

# NOTE: `random`, `datetime` and `timedelta` are not used in this cell; the
# imports are kept in case other cells rely on them.
import random
from datetime import datetime, timedelta
import os

# Re-seed NumPy's global generator so re-running this cell on its own
# regenerates exactly the same events. The visible cells between the setup
# cell's seed(42) and this one draw no random numbers, so a top-to-bottom run
# sees the same random stream as before.
np.random.seed(42)

# For each customer, generate monthly event data
event_data = []

# Only four columns drive the simulation; itertuples over that slice avoids
# building a full row Series per customer the way iterrows does.
event_inputs = df[['customerID', 'tenure', 'MonthlyCharges', 'HasInternetService']]

for customer_id, tenure, monthly_charges, has_internet in event_inputs.itertuples(index=False, name=None):
    tenure_months = int(tenure)

    # Generate events for each month of tenure (zero-tenure customers still get one row)
    for month in range(max(1, tenure_months)):
        # Simulate monthly activity.
        # The order of the random draws below is significant: it fixes which
        # random numbers each field receives.
        month_events = {
            'customerID': customer_id,
            'month': month,

            # Usage patterns: login rate is higher in the first half of the tenure.
            # Normal draws are clamped at 0 because negative usage is not meaningful.
            'login_count': np.random.poisson(10 if month < tenure_months/2 else 5),
            'data_usage_gb': max(0.0, np.random.normal(50, 20)) if has_internet else 0,
            'call_minutes': max(0.0, np.random.normal(300, 100)),

            # Support interactions: the rate rises in the final 3 months of
            # tenure for EVERY customer, whether or not they churned, so these
            # fields track tenure position rather than churn.
            'support_tickets': np.random.poisson(0.5 if month < tenure_months - 3 else 2),
            'support_calls': np.random.poisson(0.3 if month < tenure_months - 3 else 1.5),

            # Payment behavior: 5% chance of a late payment each month
            'late_payment': 1 if np.random.random() < 0.05 else 0,
            'payment_amount': monthly_charges + np.random.normal(0, 5),

            # Feature usage
            'feature_usage_score': np.random.beta(2, 2) * 100,  # 0-100 score
        }
        event_data.append(month_events)

# Convert to DataFrame (built once from the list of dicts)
events_df = pd.DataFrame(event_data)

print(f"âœ… Generated {len(events_df):,} synthetic event records!")
print(f"Events per customer (avg): {len(events_df) / len(df):.1f}")
print(f"\nSample events:")
print(events_df.head(10))

# Create directory if it doesn't exist
processed_dir = '/Users/kashishpatel/Desktop/customer-churn-project/data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Save events data
events_df.to_csv(f"{processed_dir}/customer_events.csv", index=False)
print("\nâœ… Events data saved to: data/processed/customer_events.csv")

Creating SYNTHETIC TEMPORAL DATA for real-time simulation...
This simulates customer behavior events over time.

âœ… Generated 228,001 synthetic event records!
Events per customer (avg): 32.4

Sample events:
   customerID  month  login_count  data_usage_gb  call_minutes  \
0  7590-VHVEG      0           12      21.354965    320.183645   
1  5575-GNVDE      0           10      50.207464    287.920529   
2  5575-GNVDE      1           13      50.900032    384.322833   
3  5575-GNVDE      2           10      62.955763    278.500267   
4  5575-GNVDE      3           10      30.923753    360.786282   
5  5575-GNVDE      4           11      46.697912    187.259456   
6  5575-GNVDE      5            8      12.484288    224.198214   
7  5575-GNVDE      6           16      57.523707    418.878702   
8  5575-GNVDE      7           12      54.604706    260.517373   
9  5575-GNVDE      8           16      20.087504    127.119065   

   support_tickets  support_calls  late_payment  payment_amount  

In [36]:
print("Aggregating temporal features from event data...")

# Per-customer summary statistics over the monthly event rows.
# 'std' is NaN for customers that have a single event row; those NaNs are
# carried into df by the merge at the end of this cell.
temporal_features = events_df.groupby('customerID').agg({
    'login_count': ['mean', 'std', 'sum'],
    'data_usage_gb': ['mean', 'max'],
    'call_minutes': ['mean', 'sum'],
    'support_tickets': ['sum', 'mean'],
    'support_calls': ['sum'],
    'late_payment': ['sum'],
    'feature_usage_score': ['mean', 'std', 'min']
}).reset_index()

# Flatten the two-level (column, statistic) header into 'column_statistic'.
# strip('_') turns the ('customerID', '') key back into plain 'customerID'.
temporal_features.columns = ['_'.join(col).strip('_') for col in temporal_features.columns]

# Trend features: mean logins over each customer's last 3 vs first 3 event rows.
# head()/tail() take rows in events_df order, so this relies on the events
# being stored in ascending month order per customer.
recent_activity = events_df.groupby('customerID').tail(3).groupby('customerID')['login_count'].mean().reset_index()
recent_activity.columns = ['customerID', 'recent_login_avg']

early_activity = events_df.groupby('customerID').head(3).groupby('customerID')['login_count'].mean().reset_index()
early_activity.columns = ['customerID', 'early_login_avg']

# Merge trend data
temporal_features = temporal_features.merge(recent_activity, on='customerID', how='left')
temporal_features = temporal_features.merge(early_activity, on='customerID', how='left')

# Calculate engagement trend (positive = increasing engagement, negative = declining)
temporal_features['engagement_trend'] = (temporal_features['recent_login_avg'] -
                                         temporal_features['early_login_avg'])

print(f"âœ… Temporal features created!")
print(f"Shape: {temporal_features.shape}")
print(f"\nTemporal feature columns:")
print(temporal_features.columns.tolist())

# Merge with main dataset.
# Temporal columns left in df by an earlier run of this cell are dropped first;
# otherwise a re-run would produce duplicated '_x'/'_y' columns.
# validate='many_to_one' asserts one feature row per customerID so the merge
# cannot multiply rows of df.
temporal_cols = [col for col in temporal_features.columns if col != 'customerID']
df = (
    df.drop(columns=temporal_cols, errors='ignore')
      .merge(temporal_features, on='customerID', how='left', validate='many_to_one')
)
print(f"\nâœ… Merged with main dataset! New shape: {df.shape}")

Aggregating temporal features from event data...
âœ… Temporal features created!
Shape: (7043, 18)

Temporal feature columns:
['customerID', 'login_count_mean', 'login_count_std', 'login_count_sum', 'data_usage_gb_mean', 'data_usage_gb_max', 'call_minutes_mean', 'call_minutes_sum', 'support_tickets_sum', 'support_tickets_mean', 'support_calls_sum', 'late_payment_sum', 'feature_usage_score_mean', 'feature_usage_score_std', 'feature_usage_score_min', 'recent_login_avg', 'early_login_avg', 'engagement_trend']

âœ… Merged with main dataset! New shape: (7043, 60)


In [38]:
print("Encoding categorical variables...")

# Object-dtype columns are the encoding candidates. The ID is never encoded
# and the target is converted separately at the end of this cell.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('customerID')
categorical_cols = [col for col in categorical_cols if col != 'Churn']

print(f"\nCategorical columns to encode: {categorical_cols}")

# Encode on a copy so df keeps the original string values
df_encoded = df.copy()

# Yes/No columns become 1/0
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
present_binary_cols = [col for col in binary_cols if col in df_encoded.columns]
for col in present_binary_cols:
    df_encoded[col] = df_encoded[col].eq('Yes').astype(int)

# Contract is ordinal (Month-to-month < One year < Two year), so it maps to 0/1/2
contract_mapping = dict(zip(['Month-to-month', 'One year', 'Two year'], range(3)))
df_encoded['Contract'] = df_encoded['Contract'].map(contract_mapping)

# Nominal features are one-hot encoded; names missing from the frame are skipped
nominal_cols = ['gender', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                'StreamingMovies', 'PaymentMethod', 'tenure_group', 'MonetaryTier']
available_cols = set(df_encoded.columns)
nominal_cols = [col for col in nominal_cols if col in available_cols]

print(f"\nOne-hot encoding: {nominal_cols}")
# drop_first=True drops the first level of each feature from the dummy columns
df_encoded = pd.get_dummies(df_encoded, columns=nominal_cols, drop_first=True, dtype=int)

# Target: 'Yes' -> 1, anything else -> 0
df_encoded['Churn'] = df_encoded['Churn'].eq('Yes').astype(int)

print(f"\nâœ… Encoding complete!")
print(f"Final dataset shape: {df_encoded.shape}")
# customerID and Churn are not features, hence the -2
print(f"Total features (including one-hot): {df_encoded.shape[1] - 2}")

Encoding categorical variables...

Categorical columns to encode: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

One-hot encoding: ['gender', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaymentMethod', 'tenure_group', 'MonetaryTier']

âœ… Encoding complete!
Final dataset shape: (7043, 74)
Total features (including one-hot): 72


In [40]:
print("Preparing final dataset for modeling...")

# customerID is an identifier, not a predictor, so it is left out of the model frame
df_model = df_encoded.drop(columns=['customerID'])

# Feature matrix and target vector
y = df_model['Churn']
X = df_model.drop(columns=['Churn'])

print(f"âœ… Final dataset prepared!")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"\nClass distribution:")
print(y.value_counts())
print(f"\nChurn rate: {y.mean() * 100:.2f}%")

# Report missing values before they are filled, so the count stays visible
print(f"\nMissing values check:")
print(f"Features: {X.isna().sum().sum()}")
print(f"Target: {y.isna().sum()}")

# Any NaN still present in the features is replaced with 0
X = X.fillna(value=0)

print("\nâœ… All missing values handled!")

Preparing final dataset for modeling...
âœ… Final dataset prepared!
Features (X) shape: (7043, 72)
Target (y) shape: (7043,)

Class distribution:
Churn
0    5174
1    1869
Name: count, dtype: int64

Churn rate: 26.54%

Missing values check:
Features: 1248
Target: 0

âœ… All missing values handled!


In [42]:
print("Splitting data into train and test sets...")

# 80/20 split, stratified on the target so both parts keep the same churn ratio;
# the fixed random_state makes the split repeatable
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print(f"âœ… Data split complete!")
print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Same class-balance report for both parts of the split
for subset_name, subset_target in (("training", y_train), ("test", y_test)):
    print(f"\nChurn distribution in {subset_name} set:")
    print(subset_target.value_counts())
    print(f"Churn rate: {subset_target.mean() * 100:.2f}%")

Splitting data into train and test sets...
âœ… Data split complete!

Training set: 5634 samples
Test set: 1409 samples

Churn distribution in training set:
Churn
0    4139
1    1495
Name: count, dtype: int64
Churn rate: 26.54%

Churn distribution in test set:
Churn
0    1035
1     374
Name: count, dtype: int64
Churn rate: 26.54%


In [44]:
print("Scaling numerical features...")

# The scaler is fitted on the training split only; the test split is
# transformed with the training statistics.
scaler = StandardScaler()

train_scaled_values = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(train_scaled_values,
                              index=X_train.index,
                              columns=X_train.columns)

test_scaled_values = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(test_scaled_values,
                             index=X_test.index,
                             columns=X_test.columns)

print("âœ… Features scaled!")
# Averages of the per-column mean and std of the scaled training frame
print(f"\nScaled training set mean (should be ~0): {X_train_scaled.mean().mean():.6f}")
print(f"Scaled training set std (should be ~1): {X_train_scaled.std().mean():.6f}")

Scaling numerical features...
âœ… Features scaled!

Scaled training set mean (should be ~0): -0.000000
Scaled training set std (should be ~1): 1.000089


In [46]:
import os
import pickle

print("Saving processed data...")

# Single location for every artifact written by this cell
processed_dir = '/Users/kashishpatel/Desktop/customer-churn-project/data/processed'
os.makedirs(processed_dir, exist_ok=True)

# CSV artifacts, written in this order and without the index column
csv_outputs = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
    'X_train_scaled.csv': X_train_scaled,
    'X_test_scaled.csv': X_test_scaled,
    # Full encoded dataset (still contains customerID and Churn)
    'telco_churn_processed.csv': df_encoded,
}
for file_name, table in csv_outputs.items():
    table.to_csv(f"{processed_dir}/{file_name}", index=False)

# Pickled artifacts: the fitted scaler and the ordered training column names
feature_names = X_train.columns.tolist()
pickle_outputs = {
    'scaler.pkl': scaler,
    'feature_names.pkl': feature_names,
}
for file_name, payload in pickle_outputs.items():
    with open(f"{processed_dir}/{file_name}", 'wb') as f:
        pickle.dump(payload, f)

print("âœ… All processed data saved!")
print(f"\nSaved files:")
saved_file_lines = [
    "  - X_train.csv, X_test.csv",
    "  - y_train.csv, y_test.csv",
    "  - X_train_scaled.csv, X_test_scaled.csv",
    "  - telco_churn_processed.csv",
    "  - scaler.pkl",
    "  - feature_names.pkl",
    "  - customer_events.csv",
]
for line in saved_file_lines:
    print(line)

Saving processed data...
âœ… All processed data saved!

Saved files:
  - X_train.csv, X_test.csv
  - y_train.csv, y_test.csv
  - X_train_scaled.csv, X_test_scaled.csv
  - telco_churn_processed.csv
  - scaler.pkl
  - feature_names.pkl
  - customer_events.csv


In [48]:
# End-of-phase recap: prints a fixed checklist plus a few statistics read from
# the train/test objects created earlier in the notebook.
print("="*80)
print("PHASE 1 COMPLETE: DATA PREPROCESSING & FEATURE ENGINEERING")
print("="*80)

print("\nðŸ“Š WHAT WE ACCOMPLISHED:")
print("  âœ… Loaded and cleaned raw data")
print("  âœ… Handled missing values and data types")
print("  âœ… Created 20+ engineered features:")
print("     â€¢ Basic features (tenure groups, service counts)")
print("     â€¢ RFM-style metrics (recency, monetary, frequency)")
print("     â€¢ Synthetic temporal/behavioral data")
print("     â€¢ Engagement trends and patterns")
print("  âœ… Encoded categorical variables")
print("  âœ… Performed stratified train-test split")
print("  âœ… Scaled features for modeling")
print("  âœ… Saved all processed data")

print(f"\nðŸ“ˆ DATASET STATISTICS:")
# The raw file has 21 columns; customerID and Churn are not predictors, which leaves 19.
print(f"  â€¢ Original features: 19")
# X_train.shape[1] is the full width of the model matrix (original columns,
# engineered columns and one-hot dummies), so it is labelled as the total
# rather than as the number of engineered features.
print(f"  â€¢ Total model features (after engineering & encoding): {X_train.shape[1]}")
print(f"  â€¢ Training samples: {X_train.shape[0]:,}")
print(f"  â€¢ Test samples: {X_test.shape[0]:,}")
# Measured on the training split
print(f"  â€¢ Churn rate: {y_train.mean() * 100:.2f}%")

print("\nðŸŽ¯ NEXT PHASE:")
print("  PHASE 2: Model Development & Training")
print("  â€¢ Train multiple models (XGBoost, LightGBM, Neural Network)")
print("  â€¢ Handle class imbalance with SMOTE")
print("  â€¢ Hyperparameter tuning")
print("  â€¢ Model evaluation with multiple metrics")
print("  â€¢ Experiment tracking with MLflow")

print("\n" + "="*80)

PHASE 1 COMPLETE: DATA PREPROCESSING & FEATURE ENGINEERING

ðŸ“Š WHAT WE ACCOMPLISHED:
  âœ… Loaded and cleaned raw data
  âœ… Handled missing values and data types
  âœ… Created 20+ engineered features:
     â€¢ Basic features (tenure groups, service counts)
     â€¢ RFM-style metrics (recency, monetary, frequency)
     â€¢ Synthetic temporal/behavioral data
     â€¢ Engagement trends and patterns
  âœ… Encoded categorical variables
  âœ… Performed stratified train-test split
  âœ… Scaled features for modeling
  âœ… Saved all processed data

ðŸ“ˆ DATASET STATISTICS:
  â€¢ Original features: 20
  â€¢ Engineered features: 72
  â€¢ Training samples: 5,634
  â€¢ Test samples: 1,409
  â€¢ Churn rate: 26.54%

ðŸŽ¯ NEXT PHASE:
  PHASE 2: Model Development & Training
  â€¢ Train multiple models (XGBoost, LightGBM, Neural Network)
  â€¢ Handle class imbalance with SMOTE
  â€¢ Hyperparameter tuning
  â€¢ Model evaluation with multiple metrics
  â€¢ Experiment tracking with MLflow

