In [24]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, classification_report
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
warnings.filterwarnings('ignore')


# **STEP 1:** DATASET PREPARATION

In [2]:
# Absolute path to the churn CSV (looks like a Colab /content upload --
# adjust when running in another environment).
DATA_PATH = "/content/Churn_Modelling.csv"
df = pd.read_csv(DATA_PATH)

In [22]:
# 1. Dataset dimensions: rows are instances, columns are attributes.
n_instances, n_attributes = df.shape
print(
    f"Number of Instances: {n_instances}",
    f"Number of Attributes: {n_attributes}",
    sep="\n",
)

# 2. Class distribution of the target, as absolute counts and as proportions.
print("\nClass Distribution (Exited):")
print(
    df['Exited'].value_counts(),
    df['Exited'].value_counts(normalize=True),
    sep="\n",
)

# 3. Preview of the first 10 rows.
print("\nFirst 10 instances of the dataset:", df.head(10), sep="\n")

Number of Instances: 10000
Number of Attributes: 14

Class Distribution (Exited):
Exited
0    7963
1    2037
Name: count, dtype: int64
Exited
0    0.7963
1    0.2037
Name: proportion, dtype: float64

First 10 instances of the dataset:
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   
5          6    15574012       Chu          645     Spain    Male   44   
6          7    15592531  Bartlett          822    France    Male   50   
7          8    15656148    Obinna          376   Germany  Female   29   
8          9    15792365        He          501    France    Male   44   
9         10    15592389 

In [25]:
# Remove the row counter, customer id and surname columns before modelling.
df_clean = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
print("Dropped columns: RowNumber, CustomerId, Surname")

# One-hot encode Geography; drop_first=True omits the first category's dummy
# column so the remaining dummies are not redundant with each other.
df_clean = pd.get_dummies(df_clean, columns=['Geography'], drop_first=True)

# Replace the Gender strings with integer codes.
le = LabelEncoder()
df_clean['Gender'] = le.fit_transform(df_clean['Gender'])
print("Encoded 'Geography' and 'Gender'.")

# Separate the feature matrix from the target column.
X = df_clean.drop('Exited', axis=1)
y = df_clean['Exited']

# 70/30 hold-out split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
print(f"Data split into Train ({X_train.shape[0]}) and Test ({X_test.shape[0]}) sets.")

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. The scaled arrays are wrapped back into DataFrames that keep the
# original row index, so they stay label-aligned with X_train/y_train and
# X_test/y_test (a fresh 0..n-1 index would silently misalign any
# index-based join, concat or boolean mask against the targets).
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

print("Feature scaling completed.")
print("\nPreprocessing complete. Data is ready for Model Training.")

Dropped columns: RowNumber, CustomerId, Surname
Encoded 'Geography' and 'Gender'.
Data split into Train (7000) and Test (3000) sets.
Feature scaling completed.

Preprocessing complete. Data is ready for Model Training.


# **STEP 2:** MANUAL ATTRIBUTE SELECTION CALCULATIONS

In [26]:
import pandas as pd
import numpy as np

# 1. Small hand-made dataset (10 rows) used for the manual calculations.
#    Each column is written as one space-separated string and split into a list.
data = {
    'Credit':  'High Low High Low Low High High Low High Low'.split(),
    'HasCard': 'Yes Yes No No Yes Yes No Yes Yes No'.split(),
    'Active':  'Yes No Yes No Yes No Yes Yes No Yes'.split(),
    'Exited':  'No Yes No Yes No No No Yes No Yes'.split(),
}
df_small = pd.DataFrame(data)

# 2. Define Helper Functions
def calculate_entropy(target_col):
    """Return the base-2 Shannon entropy of the labels in ``target_col``.

    Args:
        target_col: sequence of class labels (anything ``np.unique`` and
            ``len`` accept, e.g. a list or a pandas Series).

    Returns:
        Sum of ``-p * log2(p)`` over the distinct labels, where ``p`` is each
        label's relative frequency. An empty input yields ``0``.
    """
    _, class_counts = np.unique(target_col, return_counts=True)
    n_total = len(target_col)
    # Terms are accumulated in np.unique's (sorted-label) order.
    return sum(-(c / n_total) * np.log2(c / n_total) for c in class_counts)

def calculate_gini(target_col):
    """Return the Gini impurity of the labels in ``target_col``.

    Args:
        target_col: sequence of class labels (anything ``np.unique`` and
            ``len`` accept, e.g. a list or a pandas Series).

    Returns:
        ``1`` minus the squared relative frequency of every distinct label.
        An empty input yields ``1``.
    """
    _, class_counts = np.unique(target_col, return_counts=True)
    n_total = len(target_col)
    # Start from 1 and subtract each squared class share in turn.
    return sum((-((c / n_total) ** 2) for c in class_counts), 1)

# 3. Impurity of the whole synthetic dataset, before any split.
exited_labels = df_small['Exited']
total_entropy = calculate_entropy(exited_labels)
total_gini = calculate_gini(exited_labels)

print(f"System Entropy (S): {total_entropy:.4f}\nSystem Gini (S): {total_gini:.4f}")

# 4. Information gain, gain ratio and weighted Gini for each candidate attribute.
results = []
for col in ['Credit', 'HasCard', 'Active']:
    vals, counts = np.unique(df_small[col], return_counts=True)
    n_rows = np.sum(counts)

    # Accumulators over this attribute's distinct values.
    weighted_entropy = 0
    weighted_gini = 0
    split_info = 0

    for value, n_value in zip(vals, counts):
        p = n_value / n_rows  # share of rows that take this value
        branch_labels = df_small.loc[df_small[col] == value, 'Exited']

        weighted_entropy += p * calculate_entropy(branch_labels)
        weighted_gini += p * calculate_gini(branch_labels)
        split_info -= p * np.log2(p)

    info_gain = total_entropy - weighted_entropy
    # An attribute with a single value has zero split info; report 0 rather
    # than dividing by zero.
    if split_info != 0:
        gain_ratio = info_gain / split_info
    else:
        gain_ratio = 0

    row = {'Attribute': col, 'Info Gain': info_gain}
    row['Gain Ratio'] = gain_ratio
    row['Gini Index'] = weighted_gini
    results.append(row)

results_df = pd.DataFrame(results)
print("\nAttribute Selection Measures:", results_df, sep="\n")

System Entropy (S): 0.9710
System Gini (S): 0.4800

Attribute Selection Measures:
  Attribute  Info Gain  Gain Ratio  Gini Index
0    Credit   0.609987    0.609987    0.160000
1   HasCard   0.019973    0.020571    0.466667
2    Active   0.019973    0.020571    0.466667


# **STEP 3:** Implementing Classification Models

In [28]:
model_results = {}

# Decision tree (entropy criterion) trained on the unscaled features.
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Gaussian Naive Bayes, also on the unscaled features.
nb_model = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# kNN for several neighbourhood sizes, using the standardized features.
k_values = [3, 5, 11]
knn_results = {}

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    y_pred_knn = knn.predict(X_test_scaled)
    # Keep both the predictions and the hold-out accuracy for this k.
    knn_results[f'kNN (k={k})'] = dict(
        pred=y_pred_knn,
        acc=accuracy_score(y_test, y_pred_knn),
    )

def print_evaluation(model_name, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report for a model.

    Args:
        model_name: label shown in the section header.
        y_true: ground-truth labels.
        y_pred: labels predicted by the model for the same rows.
    """
    print(
        f"--- {model_name} ---",
        f"Accuracy: {accuracy_score(y_true, y_pred):.4f}",
        "Confusion Matrix:",
        confusion_matrix(y_true, y_pred),
        "\nClassification Report:",
        classification_report(y_true, y_pred),
        "-" * 50,
        sep="\n",
    )

# Hold-out reports: the tree and Naive Bayes first, then every kNN variant.
for label, preds in [("Decision Tree", y_pred_dt), ("Naïve Bayes", y_pred_nb)]:
    print_evaluation(label, y_test, preds)

for k_name, res in knn_results.items():
    print_evaluation(k_name, y_test, res['pred'])

--- Decision Tree ---
Accuracy: 0.7903
Confusion Matrix:
[[2062  327]
 [ 302  309]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      2389
           1       0.49      0.51      0.50       611

    accuracy                           0.79      3000
   macro avg       0.68      0.68      0.68      3000
weighted avg       0.79      0.79      0.79      3000

--------------------------------------------------
--- Naïve Bayes ---
Accuracy: 0.7893
Confusion Matrix:
[[2338   51]
 [ 581   30]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.88      2389
           1       0.37      0.05      0.09       611

    accuracy                           0.79      3000
   macro avg       0.59      0.51      0.48      3000
weighted avg       0.71      0.79      0.72      3000

--------------------------------------------------
--- kNN (k=3) ---
Accuracy: 0.

# **STEP 4:** Evaluation & Cross-Validation

In [32]:
# Candidate models. Each kNN variant is wrapped in a pipeline so the scaler is
# re-fit on the training portion of every split rather than on held-out rows.
models = {
    "Decision Tree": DecisionTreeClassifier(criterion='entropy', random_state=42),
    "Naive Bayes": GaussianNB(),
    "kNN (k=3)": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3)),
    "kNN (k=5)": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
    "kNN (k=11)": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=11))
}

results = []

print(f"{'Model':<20} | {'70-30 Split':<12} | {'10-Fold CV Mean':<15} | {'Difference'}")
print("-" * 65)

for name, model in models.items():
    # 1. Hold-out accuracy on the 70/30 split (replicates Step 3).
    model.fit(X_train, y_train)
    acc_holdout = model.score(X_test, y_test)

    # 2. 10-fold cross-validated accuracy over the full dataset.
    #    cross_val_score does the splitting and fits clones of `model`.
    cv_scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    acc_cv = cv_scores.mean()

    diff = acc_cv - acc_holdout

    results.append({
        'Model': name,
        'Holdout': acc_holdout,
        'CV_Mean': acc_cv,
        'CV_Std': cv_scores.std()
    })

    # Column widths mirror the header row (20 / 12 / 15) instead of relying on
    # hand-counted spaces.
    print(f"{name:<20} | {acc_holdout:<12.4f} | {acc_cv:<15.4f} | {diff:+.4f}")

# Spread of the per-fold scores, as a rough stability indicator.
print("\nStability (Standard Deviation of CV scores):")
for res in results:
    print(f"{res['Model']}: {res['CV_Std']:.4f}")

Model                | 70-30 Split  | 10-Fold CV Mean | Difference
-----------------------------------------------------------------
Decision Tree        | 0.7903       | 0.7985          | +0.0082
Naive Bayes          | 0.7893       | 0.7845          | -0.0048
kNN (k=3)            | 0.8147       | 0.8222          | +0.0075
kNN (k=5)            | 0.8213       | 0.8269          | +0.0056
kNN (k=11)           | 0.8377       | 0.8344          | -0.0033

Stability (Standard Deviation of CV scores):
Decision Tree: 0.0105
Naive Bayes: 0.0057
kNN (k=3): 0.0124
kNN (k=5): 0.0122
kNN (k=11): 0.0088
