<a href="https://colab.research.google.com/github/kizons/DS/blob/main/Customer_churn_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression


In [None]:
# ---- Step 1: Load and prepare data ----
# Read the raw table; point the path at your own CSV if it lives elsewhere.
df = pd.read_csv('train.csv')

# Columns expanded into one indicator column per category.
categorical_cols = ['state', 'area_code', 'international_plan', 'voice_mail_plan']

# Dense output so the result can be wrapped directly in a DataFrame;
# handle_unknown='ignore' keeps a later transform from raising on categories
# that were not present when the encoder was fit.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(
    encoded,
    index=df.index,  # keep row alignment with df for the concat below
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Swap the original categorical columns for their encoded counterparts.
df = pd.concat(
    [df.drop(columns=categorical_cols), encoded_df],
    axis=1,
)


In [None]:
# ---- Step 2: Split into train, validation, test (60/20/20) ----
# Stratify on the target so every split keeps the same churn / no-churn ratio;
# with an imbalanced target an unstratified shuffle can skew the minority
# share of a split by chance and make the evaluation noisier.
temp_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, shuffle=True, stratify=df['churn']
)
# 0.25 of the remaining 80% = 20% of the full data for validation.
train_df, valid_df = train_test_split(
    temp_df, test_size=0.25, random_state=42, shuffle=True, stratify=temp_df['churn']
)


In [None]:
# ---- Step 3: Scale and oversample (only on training data) ----
def preprocess(dataframe, oversample=False):
  X = dataframe.drop(columns='churn').values
  y = dataframe['churn'].values

  scaler = StandardScaler()
  X = scaler.fit_transform(X)

  if oversample:
    sm = SMOTE(random_state=42)
    X, y = sm.fit_resample(X, y)

  return X, y



In [None]:
X_train, y_train = preprocess(train_df, oversample=True)
X_valid, y_valid = preprocess(valid_df, oversample=False)
X_test, y_test = preprocess(test_df, oversample=False)


In [None]:
# ---- Step 4: Train and evaluate models ----

# KNN
# NOTE(review): the models are scored on the test split, and X_valid/y_valid
# are not used in the visible cells — consider comparing models on the
# validation split and keeping the test split for the final check.
knn = KNeighborsClassifier(n_neighbors=19)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
# Concatenate rather than pass two print() arguments: the default separator
# put a space after the newline and shifted the report's header row by one.
print("KNN Results:\n" + classification_report(y_test, y_pred_knn))

KNN Results:
               precision    recall  f1-score   support

          no       0.89      0.68      0.77       721
         yes       0.23      0.53      0.32       129

    accuracy                           0.66       850
   macro avg       0.56      0.61      0.55       850
weighted avg       0.79      0.66      0.70       850



In [None]:
# Naive Bayes
# Gaussian Naive Bayes with default settings, fit on the training data and
# scored on the test split.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
# Concatenate rather than pass two print() arguments: the default separator
# put a space after the newline and shifted the report's header row by one.
print("Naive Bayes Results:\n" + classification_report(y_test, y_pred_nb))

Naive Bayes Results:
               precision    recall  f1-score   support

          no       0.88      0.46      0.60       721
         yes       0.17      0.64      0.27       129

    accuracy                           0.49       850
   macro avg       0.53      0.55      0.44       850
weighted avg       0.77      0.49      0.55       850



In [None]:
# Logistic Regression
# max_iter is raised to 1000 to give the solver more iterations to converge.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_log = logreg.predict(X_test)
# Concatenate rather than pass two print() arguments: the default separator
# put a space after the newline and shifted the report's header row by one.
print("Logistic Regression Results:\n" + classification_report(y_test, y_pred_log))

Logistic Regression Results:
               precision    recall  f1-score   support

          no       0.93      0.80      0.86       721
         yes       0.38      0.68      0.49       129

    accuracy                           0.78       850
   macro avg       0.66      0.74      0.68       850
weighted avg       0.85      0.78      0.81       850



Of the three models, Logistic Regression performed best on the test set, with the highest accuracy (0.78) and macro-averaged F1 score (0.68).