In [110]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np


In [78]:
df=pd.read_csv('telco_churn.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,7590-VHVEG,Female,False,True,False,1,False,,DSL,...,False,False,False,False,Month-to-month,True,Electronic check,29.85,29.850000381469727,False
1,1,5575-GNVDE,Male,False,False,False,34,True,False,DSL,...,True,False,False,False,One year,False,Mailed check,56.950001,1889.5,False
2,2,3668-QPYBK,Male,False,False,False,2,True,False,DSL,...,False,False,False,False,Month-to-month,True,Mailed check,53.849998,108.1500015258789,True
3,3,7795-CFOCW,Male,False,False,False,45,False,,DSL,...,True,True,False,False,One year,False,Bank transfer (automatic),42.299999,1840.75,False
4,4,9237-HQITU,Female,False,False,False,2,True,False,Fiber optic,...,False,False,False,False,Month-to-month,True,Electronic check,70.699997,151.64999389648438,True


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5043 non-null   int64  
 1   customerID        5043 non-null   object 
 2   gender            5043 non-null   object 
 3   SeniorCitizen     5043 non-null   object 
 4   Partner           5043 non-null   object 
 5   Dependents        5043 non-null   object 
 6   tenure            5043 non-null   int64  
 7   PhoneService      5043 non-null   object 
 8   MultipleLines     4774 non-null   object 
 9   InternetService   5043 non-null   object 
 10  OnlineSecurity    4392 non-null   object 
 11  OnlineBackup      4392 non-null   object 
 12  DeviceProtection  4392 non-null   object 
 13  TechSupport       4392 non-null   object 
 14  StreamingTV       4392 non-null   object 
 15  StreamingMovies   4392 non-null   object 
 16  Contract          5043 non-null   object 


In [80]:
df.describe()

Unnamed: 0.1,Unnamed: 0,tenure,MonthlyCharges
count,5043.0,5043.0,5043.0
mean,1305.651993,32.576641,65.090214
std,801.484415,24.529807,30.068133
min,0.0,0.0,18.4
25%,630.0,9.0,35.775
50%,1260.0,29.0,70.550003
75%,1890.5,56.0,90.050003
max,2999.0,72.0,118.650002


In [81]:
df['Churn'].value_counts()

Churn
False    2219
No       1487
True      780
Yes       556
Name: count, dtype: int64

In [82]:
import numpy as np

# Step 1: Clean the values
df['Churn'] = df['Churn'].astype(str).str.strip().str.lower()

# Step 2: Replace known variations with standardized values
df['Churn'] = df['Churn'].replace({
    'true': 'yes',
    '1': 'yes',
    '2': 'yes',
    'false': 'no',
    '0': 'no',
    'nan': np.nan,      
    '': np.nan,         
    'unknown': np.nan   
})

# Step 3: Drop rows where Churn is still nan
df = df.dropna(subset=['Churn'])

# Optional: Check final unique values (should only be 'yes' and 'no')
print(df['Churn'].unique())

['no' 'yes']


In [83]:
df['Churn'].value_counts()

Churn
no     3706
yes    1336
Name: count, dtype: int64

In [84]:
df.shape

(5042, 22)

In [85]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.fillna({'TotalCharges':df['TotalCharges'].median()}, inplace=True)

In [86]:
#ENCODE THE CATOGORICAL FEATURE
label={}
for col in df.columns:
    if df[col].dtype=='object':
        le=LabelEncoder()
        df[col]=le.fit_transform(df[col])
        label[col]=le

In [87]:
# Scale the numerical feature
sandard=StandardScaler()
numerical_features=['tenure','MonthlyCharges','TotalCharges']
df[numerical_features]=sandard.fit_transform(df[numerical_features])

In [88]:
#feature and target
x=df.drop(columns=['Churn'])
y=df['Churn']

In [89]:
#split dataset
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [90]:
#train the model
rf=RandomForestClassifier(random_state=42)
rf.fit(x_train,y_train)

y_pred=rf.predict(x_test)
print('accuracy_score',accuracy_score(y_test,y_pred))

accuracy_score 0.821605550049554


In [101]:
#hyperparameter tuning
param_dist={
    'n_estimators':np.arange(50,200,10),
    'max_depth':[None,5,10,15],
    'min_samples_split':[2,5,10,20],
    'min_samples_leaf':[1,2,3]
}

In [102]:
random_search=RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    scoring='accuracy',
    n_iter=20,
    cv=5,
    n_jobs=-1,
    param_distributions=param_dist
)

In [108]:
random_search.fit(x_train,y_train)
y_pred=random_search.predict(x_test)
best_model=random_search.best_estimator_

In [105]:
# train the model with best parameters
print('best model',random_search.best_estimator_)
print('best parameter',random_search.best_params_)

best model RandomForestClassifier(min_samples_split=10, n_estimators=70, random_state=42)
best parameter {'n_estimators': 70, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': None}


In [106]:
print('best_score:',random_search.best_score_)

best_score: 0.7979207984724234


In [107]:
print('accuracy_score',accuracy_score(y_test,y_pred))

accuracy_score 0.817641228939544


In [111]:
# evaluate cross val score
cv_scores=cross_val_score(best_model,x,y,cv=5,scoring='accuracy')

In [113]:
print('cross_val_score',cv_scores)
print('mean cross_val_score',cv_scores.mean())

cross_val_score [0.80971259 0.7938553  0.79166667 0.78472222 0.80257937]
mean cross_val_score 0.7965072285934526
