1. Load the data set

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
df=pd.read_csv("../data/customer-churn.csv")
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [2]:
df.drop(columns=["customerID"], inplace=True)

In [3]:
df["PaymentMethod"].fillna("Unknown")

0                Electronic check
1                    Mailed check
2                    Mailed check
3       Bank transfer (automatic)
4                Electronic check
                  ...            
7038                 Mailed check
7039      Credit card (automatic)
7040             Electronic check
7041                 Mailed check
7042    Bank transfer (automatic)
Name: PaymentMethod, Length: 7043, dtype: object

Preprocess the data.
1. Label encoding 
* Female:0, male:1
* yes:1, no:0
* MultipleLines- No:0, No phone service:1, yes:2
* InternetService- Fiber optic:1, DSL:2, no:0
* OnlineSecurity-  No internet service: 2
* OnlineBackup- No internet service:2
* Contract: momtMonth-to-month: 0,One year:1, Two year: 2
* PaymentMethod- Electronic check:0, Mailed check: 1, Bank transfer (automatic): 2, Other:3


In [4]:
df_encoded=df.copy()

In [5]:
df_encoded.columns.tolist()

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [6]:
df_encoded["gender"]=df_encoded["gender"].map({"Male":0, "Female":1})

In [7]:
yes_no_cols=["Partner","Dependents", "PhoneService", "PaperlessBilling","Churn"]
for column in yes_no_cols:
    df_encoded[column]=df[column].map({"No":0, "Yes":1})

In [8]:
df_encoded["MultipleLines"]=df_encoded["MultipleLines"].map({"No":0,"Yes":1,"No phone service":2})

In [9]:
nointernet_cols=["OnlineSecurity", "OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]
for column in nointernet_cols:
    df_encoded[column]=df_encoded[column].map({"No":0, "Yes":1,"No internet service":2})

In [10]:
df_encoded["PaymentMethod"]=df_encoded["PaymentMethod"].map({"Electronic check":0, "Mailed check":1,"Bank transfer (automatic)":2, "Credit card (automatic)":3,"Unknown":4})

In [11]:
df_encoded["InternetService"]=df_encoded["InternetService"].map({"No":0,"Fiber optic":1, "DSL":2})

In [12]:
df_encoded["Contract"]=df_encoded["Contract"].map({"Month-to-month":0, "One year":1,"Two year":2})

Clean

In [13]:
df_encoded["TotalCharges"]=df_encoded["TotalCharges"].replace([" ",""], pd.NA)
df_encoded["TotalCharges"] = pd.to_numeric(df_encoded["TotalCharges"], errors="coerce")
df_encoded["TotalCharges"]=df_encoded["TotalCharges"].fillna(df_encoded["TotalCharges"].mean())

In [None]:
df_encoded

In [14]:
x=df_encoded.drop(columns=["Churn"])
y=df_encoded["Churn"]

In [15]:
x_train,x_test, y_train, y_test=train_test_split(x,y,random_state=442, test_size=0.2,)

In [16]:
numeric_columns=["tenure", "MonthlyCharges","TotalCharges"]
scaler=StandardScaler()
x_train[numeric_columns]=scaler.fit_transform(x_train[numeric_columns])
x_test[numeric_columns]=scaler.transform(x_test[numeric_columns])

In [17]:
model=RandomForestClassifier(class_weight="balanced",random_state=42,n_estimators=300,max_depth=15,min_samples_split=0.1,min_samples_leaf=0.1)
model.fit(x_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,15
,min_samples_split,0.1
,min_samples_leaf,0.1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
y_pred=model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[687 334]
 [ 74 314]]
              precision    recall  f1-score   support

           0       0.90      0.67      0.77      1021
           1       0.48      0.81      0.61       388

    accuracy                           0.71      1409
   macro avg       0.69      0.74      0.69      1409
weighted avg       0.79      0.71      0.73      1409

0.7104329311568488


In [19]:
print(y_pred)

[0 1 1 ... 1 1 0]
