In [None]:
import numpy as np
import scipy.io as sio
import pickle

# Load MNIST dataset
mnist_data = sio.loadmat('C:/Users/kvkgi/Downloads/Project 2/Project 2/basecode/mnist_all.mat')

# Load CelebA dataset
with open('C:/Users/kvkgi/Downloads/Project 2/Project 2/basecode/face_all.pickle', 'rb') as f:
    celeb_data = pickle.load(f)

# MNIST data contains training and testing data matrices for each digit (0 to 9)
# CelebA data contains a data matrix and a corresponding labels vector


## 1.2 Splitting MNIST Training Data into Training and Validation Sets
We'll randomly split the 60,000 MNIST training samples into 50,000 for training and 10,000 for validation.

In [None]:
def split_mnist_data(mnist_data, validation_size=10000):
    train_data, train_labels = [], []
    test_data, test_labels = [], []

    # Concatenate data from all classes
    for i in range(10):
        train_data.append(mnist_data[f'train{i}'])
        test_data.append(mnist_data[f'test{i}'])
        train_labels.append(np.full(mnist_data[f'train{i}'].shape[0], i))
        test_labels.append(np.full(mnist_data[f'test{i}'].shape[0], i))
    
    train_data = np.vstack(train_data)
    train_labels = np.concatenate(train_labels)
    test_data = np.vstack(test_data)
    test_labels = np.concatenate(test_labels)
    
    # Shuffle and split for validation
    indices = np.arange(train_data.shape[0])
    np.random.shuffle(indices)
    validation_data = train_data[indices[:validation_size]]
    validation_labels = train_labels[indices[:validation_size]]
    train_data = train_data[indices[validation_size:]]
    train_labels = train_labels[indices[validation_size:]]

    return train_data, train_labels, validation_data, validation_labels, test_data, test_labels

train_data, train_labels, val_data, val_labels, test_data, test_labels = split_mnist_data(mnist_data)


## 1.3 Feature Selection
Remove features with the same value across all samples.

In [None]:
def feature_selection(data):
    # Remove features that have the same value across all samples
    feature_variability = np.std(data, axis=0) > 0
    return data[:, feature_variability], feature_variability

# Apply feature selection to train and validation data
train_data, selected_features = feature_selection(train_data)
val_data = val_data[:, selected_features]
test_data = test_data[:, selected_features]


In [None]:
def initialize_weights(in_units, out_units):
    epsilon = np.sqrt(2.0 / (in_units + out_units))  # He initialization for ReLU
    return np.random.randn(out_units, in_units + 1) * epsilon  # +1 for bias


## 2.2 Sigmoid Activation Function
The sigmoid function will be used as the activation function.

In [None]:
def sigmoid(z):
    # To avoid overflow, clip values of z within a range
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))



## 2.3 Neural Network Objective Function (nnObjFunction)
This function computes the error and its gradient using backpropagation.

In [None]:
# Define the ReLU activation function and its derivative
def relu(z):
    return np.maximum(0, z)

def relu_derivative(z):
    return (z > 0).astype(float)


def apply_dropout(layer_output, dropout_rate):
    dropout_mask = (np.random.rand(*layer_output.shape) < (1 - dropout_rate)).astype(float)
    return layer_output * dropout_mask

def nn_obj_function(params, *args, is_training=True, dropout_rate=0.5):
    n_input, n_hidden, n_class, train_data, train_labels, lambda_reg = args
    w1 = params[0:n_hidden * (n_input + 1)].reshape((n_hidden, (n_input + 1)))
    w2 = params[(n_hidden * (n_input + 1)):].reshape((n_class, (n_hidden + 1)))
    
    # Forward pass
    train_data = np.hstack((train_data, np.ones((train_data.shape[0], 1))))  # Add bias to input
    z = relu(np.dot(train_data, w1.T))  # Hidden layer activations
    
    # Apply dropout only during training
    if is_training:
        z = apply_dropout(z, dropout_rate)

    z = np.hstack((z, np.ones((z.shape[0], 1))))  # Add bias to hidden layer
    o = sigmoid(np.dot(z, w2.T))  # Output layer activations

    # Clip predictions to avoid log(0)
    epsilon = 1e-10
    o = np.clip(o, epsilon, 1 - epsilon)
    
    # Calculate error with regularization
    y = np.zeros((train_data.shape[0], n_class))
    y[np.arange(train_data.shape[0]), train_labels] = 1
    error = -np.sum(y * np.log(o) + (1 - y) * np.log(1 - o)) / train_data.shape[0]
    error += (lambda_reg / (2 * train_data.shape[0])) * (np.sum(w1 ** 2) + np.sum(w2 ** 2))
    
    # Backpropagation
    delta = o - y
    grad_w2 = np.dot(delta.T, z) / train_data.shape[0] + (lambda_reg * w2) / train_data.shape[0]
    grad_hidden = np.dot(delta, w2) * relu_derivative(z)
    grad_hidden = grad_hidden[:, :-1]  # Remove bias gradient
    grad_w1 = np.dot(grad_hidden.T, train_data) / train_data.shape[0] + (lambda_reg * w1) / train_data.shape[0]
    
    return error, np.concatenate((grad_w1.flatten(), grad_w2.flatten()))




In [3]:
# 2_Preprocessing_Modeling.ipynb

# 📌 Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

# 📌 Load Dataset
df = pd.read_csv('D:/projects/Customer-Churn-Prediction/data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# 📌 Drop Irrelevant Column
df.drop(['customerID'], axis=1, inplace=True)

# 📌 Handle TotalCharges: Convert to numeric (some blanks need fixing)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.loc[:, 'TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# 📌 Encode Categorical Variables
for col in df.select_dtypes(include=['object']).columns:
    if col != 'Churn':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# 📌 Encode Target Variable
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

# 📌 Feature Scaling
scaler = StandardScaler()
df[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(df[['tenure', 'MonthlyCharges', 'TotalCharges']])

# Save scaler object
joblib.dump(scaler, 'D:/projects/Customer-Churn-Prediction/deployment/scaler.pkl')
print("✅ Scaler saved successfully!")

# 📌 Split Dataset
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 📌 Handle Class Imbalance using SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("After SMOTE:", np.bincount(y_train_res))

# 📌 Hyperparameter Tuning for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

random_cv = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid,
                               n_iter=50, cv=3, verbose=2, scoring='accuracy', n_jobs=-1)

random_cv.fit(X_train_res, y_train_res)

print("🔎 Best Parameters:", random_cv.best_params_)
print("✅ Best Cross-Validation Accuracy:", random_cv.best_score_)

# 📌 Retrain best model
best_model = random_cv.best_estimator_
best_model.fit(X_train_res, y_train_res)

# 📌 Predictions
y_pred = best_model.predict(X_test)

# 📌 Evaluation
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_pred))

# 📌 Save the best model
joblib.dump(best_model, 'D:/projects/Customer-Churn-Prediction/deployment/churn_model.pkl')
print("\n✅ Updated model saved as churn_model.pkl")


✅ Scaler saved successfully!
After SMOTE: [4139 4139]
Fitting 3 folds for each of 50 candidates, totalling 150 fits
🔎 Best Parameters: {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.7}
✅ Best Cross-Validation Accuracy: 0.8166281016191191

Confusion Matrix:
 [[824 211]
 [115 259]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.80      0.83      1035
           1       0.55      0.69      0.61       374

    accuracy                           0.77      1409
   macro avg       0.71      0.74      0.72      1409
weighted avg       0.79      0.77      0.78      1409


ROC AUC Score: 0.74432431734222

✅ Updated model saved as churn_model.pkl
