In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### **Vibe Coding**

In [14]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Loading the dataset
df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Converting 'TotalCharges' to numeric, handling errors
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Identifying categorical and numerical columns (excluding customerID and Churn)
categorical_cols = df.select_dtypes(include=['object']).columns.drop(['customerID', 'Churn']).tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Handling missing values in numerical columns (TotalCharges)
num_imputer = SimpleImputer(strategy='median')
df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])

# Handling missing values in categorical columns
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# One-hot encoding categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cat_df = pd.DataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(categorical_cols)
)

# Dropping original categorical columns and concatenating encoded ones
df = df.drop(categorical_cols, axis=1)
df = pd.concat([df, encoded_cat_df], axis=1)

# Splitting features and target
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn'].map({'Yes': 1, 'No': 0})

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

### **kNN Classification**

In [15]:
# Training kNN classifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Making predictions
y_pred = knn.predict(X_test)

# Printing classification report
print("\nkNN Classifier Classification Report:")
print(classification_report(y_test, y_pred))


kNN Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      1282
           1       0.57      0.54      0.56       479

    accuracy                           0.77      1761
   macro avg       0.70      0.70      0.70      1761
weighted avg       0.76      0.77      0.76      1761



In [16]:
# Printing classification report
print("\nkNN Classifier Classification Accuracy:")
print(accuracy_score(y_test, y_pred)*100)


kNN Classifier Classification Accuracy:
76.5474162407723


### **Decision Tree Classification**

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Training kNN classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Making predictions
y_pred_dt = dt.predict(X_test)

# Printing classification report
print("\nDT Classifier Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Printing classification report
print("\nDT Classifier Classification Accuracy:")
print(accuracy_score(y_test, y_pred_dt)*100)


DT Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.82      0.81      1282
           1       0.50      0.49      0.50       479

    accuracy                           0.73      1761
   macro avg       0.66      0.66      0.66      1761
weighted avg       0.73      0.73      0.73      1761


DT Classifier Classification Accuracy:
72.91311754684838


### **Random Forest Classification**

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Training Random Forest classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train, y_train)

# Making predictions
y_pred_rf = rf.predict(X_test)

# Printing classification report
print("\nRandom Forest Classifier Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1282
           1       0.66      0.47      0.54       479

    accuracy                           0.79      1761
   macro avg       0.74      0.69      0.70      1761
weighted avg       0.78      0.79      0.78      1761



In [20]:
# Printing classification report
print("\nRF Classifier Classification Accuracy:")
print(accuracy_score(y_test, y_pred_dt)*100)


RF Classifier Classification Accuracy:
72.91311754684838


### **Ensemble Classification**

In [22]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [23]:
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [25]:
# Training AdaBoost classifier
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train, y_train)

# Making predictions
y_pred = ada.predict(X_test)

# Printing classification report
print("\nAdaBoost Classifier Classification Report:")
print(classification_report(y_test, y_pred))

print(accuracy_score(y_test, y_pred))


AdaBoost Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1282
           1       0.68      0.53      0.60       479

    accuracy                           0.80      1761
   macro avg       0.76      0.72      0.73      1761
weighted avg       0.80      0.80      0.80      1761

0.80465644520159


In [26]:
# Training CatBoost classifier
cat = CatBoostClassifier(random_state=42, verbose=0)
cat.fit(X_train, y_train)

# Making predictions
y_pred = cat.predict(X_test)

# Printing classification report
print("\nCatBoost Classifier Classification Report:")
print(classification_report(y_test, y_pred))

print(accuracy_score(y_test, y_pred))


CatBoost Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1282
           1       0.65      0.50      0.57       479

    accuracy                           0.79      1761
   macro avg       0.74      0.70      0.71      1761
weighted avg       0.78      0.79      0.78      1761

0.7921635434412265


In [27]:
# Training XGBoost classifier
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
xgb.fit(X_train, y_train)

# Making predictions
y_pred = xgb.predict(X_test)

# Printing classification report
print("\nXGBoost Classifier Classification Report:")
print(classification_report(y_test, y_pred))

print(accuracy_score(y_test, y_pred))


XGBoost Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1282
           1       0.64      0.52      0.58       479

    accuracy                           0.79      1761
   macro avg       0.74      0.71      0.72      1761
weighted avg       0.78      0.79      0.78      1761

0.7910278250993753
