In [2]:
# Data Analysis and Processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Model Related
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the Telco churn CSV (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="Telco-Customer-Churn.csv")

In [4]:
# Show the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data Processing

In [5]:
# Remove the 'customerID' column from the frame
df = df.drop(columns=['customerID'])

In [6]:
# Converting TotalCharges to numeric.
# errors='coerce' turns every entry that cannot be parsed as a number
# (such as the blank " " placeholder) into NaN; those rows are then removed.
# Only the TotalCharges column is modified, so the other columns of the
# affected rows are never overwritten.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna(subset=['TotalCharges'])

In [7]:
# Labels of all columns other than MonthlyCharges, TotalCharges and tenure
cat_col = df.columns.drop(['MonthlyCharges', 'TotalCharges', 'tenure'])

In [8]:
# Remove every row whose 'gender' value equals 0 (this drops rows, not columns)
df = df.drop(index=df.loc[df['gender'] == 0].index)

In [9]:
# Encode the yes/no style columns as 0/1:
# 'Yes' -> 1; 'No', 'No phone service' and 'No internet service' -> 0
binary_col = [
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "PaperlessBilling",
    "Churn",
]

df[binary_col] = df[binary_col].replace(to_replace={
    'No': 0,
    'Yes': 1,
    'No phone service': 0,
    'No internet service': 0,
})

In [10]:
# Remaining categorical variables: gender, InternetService, Contract, PaymentMethod.
# Rule of thumb used here: ordinal variables would be scaled, nominal ones
# are expanded into dummy (indicator) columns.
nominal_col = ["gender", "InternetService", "Contract", "PaymentMethod"]

# One indicator frame per nominal column, unpacked in the order of nominal_col
gender_dummy, InternetService_dummy, Contract_dummy, PaymentMethod_dummy = [
    pd.get_dummies(df[column]) for column in nominal_col
]

# Swap the original nominal columns for their indicator columns, appended
# on the right in the same order as above
df = pd.concat(
    [
        df.drop(columns=nominal_col),
        gender_dummy,
        InternetService_dummy,
        Contract_dummy,
        PaymentMethod_dummy,
    ],
    axis=1,
)

In [11]:
# Keep the column labels: the scaler hands back a plain array without them
col_names = df.columns

# Min-max scale every column of the frame and rebuild a DataFrame from the result.
# NOTE(review): the scaler is fitted on the whole frame, 'Churn' included.
scaler = MinMaxScaler()
scaler.fit(df)
scaled = scaler.transform(df)
df = pd.DataFrame(data=scaled, columns=col_names)

In [12]:
# Show the first five rows of the scaled frame.
df.head(n=5)

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,DSL,Fiber optic,No,Month-to-month,One year,Two year,Bank transfer (automatic),Credit card (automatic),Electronic check,Mailed check
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.464789,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.014085,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.619718,0.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.014085,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Creating a train test split for the model

In [13]:
# Features: every column except 'Churn'; target: the 'Churn' column
X = df.loc[:, df.columns != 'Churn']
y = df['Churn']
# 75% of the rows go to training; the seed fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=101,
)

## Random Forest

In [14]:
# Random forest with 10000 trees, a fixed seed, and n_jobs=-1
clf = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)
# Fit on the training split, then predict the held-out split
preds = clf.fit(X_train, y_train).predict(X_test)
# Accuracy of the predictions on the test split (displayed as the cell output)
accuracy_score(y_true=y_test, y_pred=preds)

0.7957906712172924

### Using all features, we get a test accuracy of about 79.6%

## Feature selection - Gini Impurity