In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output

# Relative path: pandas resolves it against the current working directory.
file_name = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    # Leave a None sentinel so the rest of the pipeline can be skipped
    # instead of failing on an undefined name.
    df = None
    print(f"❌ Error: The file '{file_name}' was not found. Please ensure it is in the same directory.")
else:
    print(f"✅ Data loaded successfully. Initial shape: {df.shape}")

if df is not None:
    # ------------------------------------------------------------------
    # Data cleaning and encoding.
    # Produces `df_encoded`: an all-numeric/boolean frame with a 0/1
    # 'Churn' target column, ready for splitting and modelling.
    # ------------------------------------------------------------------

    # customerID is a row identifier, not a predictive feature.
    df = df.drop(columns=['customerID'])

    # TotalCharges is parsed with errors='coerce', so any entry that is not
    # a valid number becomes NaN and is then imputed with the column median.
    # The result is assigned back to the column: calling
    # `df[col].fillna(..., inplace=True)` acts on an intermediate object and
    # does not update `df` under pandas Copy-on-Write.
    # NOTE(review): the median is computed on the full dataset, i.e. before
    # the train/test split further down; move the imputation after the split
    # if strict leakage avoidance is required.
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    median_charges = df['TotalCharges'].median()
    df['TotalCharges'] = df['TotalCharges'].fillna(median_charges)

    # Binary target: 1 = churned, 0 = retained.
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

    # Fold the "service not available" answers into a plain 'No' so these
    # columns become two-valued and are handled by the binary encoder below.
    cols_to_simplify = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                        'TechSupport', 'StreamingTV', 'StreamingMovies']
    df[cols_to_simplify] = df[cols_to_simplify].replace(
        {'No internet service': 'No', 'No phone service': 'No'}
    )

    # Two-valued text columns -> integer codes 0/1. The encoder is refitted
    # for every column, so no state is shared between columns.
    le = LabelEncoder()
    for col in df.select_dtypes(include='object').columns:
        if df[col].nunique() == 2:
            df[col] = le.fit_transform(df[col])

    # Remaining text columns (more than two categories) are one-hot encoded;
    # drop_first=True removes one redundant indicator per column.
    df_encoded = pd.get_dummies(df, drop_first=True)
    print(f"Data cleaned and encoded. Final shape: {df_encoded.shape}")


    # ------------------------------------------------------------------
    # Train/test split and feature scaling.
    # ------------------------------------------------------------------
    # Target is the 'Churn' column; every other column is a feature.
    y = df_encoded['Churn']
    X = df_encoded.drop(columns=['Churn'])

    # 70/30 split, stratified on the target so both partitions keep the
    # same churn ratio; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y,
    )

    # Scaling statistics are learned from the training partition only and
    # then applied unchanged to the test partition.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Wrap the scaled arrays in DataFrames to restore the feature names
    # (the row index is a fresh 0..n-1 range).
    feature_names = X_train.columns
    X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

    print("Data split and scaled successfully.")


    # ------------------------------------------------------------------
    # Model training and evaluation.
    # ------------------------------------------------------------------
    # Candidate classifiers, keyed by the display name used in the report.
    models = {
        "Logistic Regression": LogisticRegression(solver='liblinear', random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=5),
    }

    # Report column -> scoring function; each is called as fn(y_true, y_pred).
    metric_fns = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1-Score': f1_score,
    }

    # Column-oriented accumulator: one list per report column.
    results = {'Model': []}
    for metric_name in metric_fns:
        results[metric_name] = []
    # Test-set predictions per model, kept for the confusion matrix later.
    model_preds = {}

    print("\n=============================================")
    print("      MODEL TRAINING & EVALUATION")
    print("=============================================")

    # These two models are trained on the standardized features; the
    # tree-based models are trained on the unscaled ones.
    uses_scaled_features = ("Logistic Regression", "KNN")

    for name, model in models.items():
        if name in uses_scaled_features:
            X_train_data, X_test_data = X_train_scaled_df, X_test_scaled_df
        else:
            X_train_data, X_test_data = X_train, X_test

        # Fit on the training partition, predict on the held-out partition.
        y_pred = model.fit(X_train_data, y_train).predict(X_test_data)
        model_preds[name] = y_pred

        results['Model'].append(name)
        for metric_name, score in metric_fns.items():
            results[metric_name].append(score(y_test, y_pred))

        f1 = results['F1-Score'][-1]
        print(f"✅ {name} finished. F1-Score: {f1:.4f}")

    
    # ------------------------------------------------------------------
    # Reporting: ranked comparison table plus the confusion matrix of the
    # top-ranked model.
    # ------------------------------------------------------------------
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by='F1-Score', ascending=False)

    print("\n--- Final Model Comparison Table (Sorted by F1-Score) ---")
    comparison_table = results_df.to_markdown(index=False)
    print(comparison_table)

    # After the descending sort, the first row holds the best F1-Score.
    best_model_name = results_df['Model'].iloc[0]
    best_y_pred = model_preds[best_model_name]

    print(f"\n--- Confusion Matrix for Best Model ({best_model_name}) ---")
    # Rows are true classes, columns are predicted classes.
    best_confusion = confusion_matrix(y_test, best_y_pred)
    print(best_confusion)


✅ Data loaded successfully. Initial shape: (7043, 21)
Data cleaned and encoded. Final shape: (7043, 24)
Data split and scaled successfully.

      MODEL TRAINING & EVALUATION
✅ Logistic Regression finished. F1-Score: 0.6106
✅ Decision Tree finished. F1-Score: 0.5031
✅ Random Forest finished. F1-Score: 0.5422
✅ KNN finished. F1-Score: 0.5251

--- Final Model Comparison Table (Sorted by F1-Score) ---
| Model               |   Accuracy |   Precision |   Recall |   F1-Score |
|:--------------------|-----------:|------------:|---------:|-----------:|
| Logistic Regression |   0.809276 |    0.666667 | 0.56328  |   0.610628 |
| Random Forest       |   0.784193 |    0.62069  | 0.481283 |   0.542169 |
| KNN                 |   0.749172 |    0.527928 | 0.522282 |   0.52509  |
| Decision Tree       |   0.732608 |    0.496528 | 0.509804 |   0.503078 |

--- Confusion Matrix for Best Model (Logistic Regression) ---
[[1394  158]
 [ 245  316]]
