## Instructions

**Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.**
Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [24]:
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")


from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

In [2]:
churnData = pd.read_csv("Customer-Churn.csv")
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
churnData.columns = churnData.columns.str.lower()
churnData.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [4]:
#Checking the data types

churnData.dtypes

gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [5]:
churnData['totalcharges'].value_counts()

          11
20.2      11
19.75      9
20.05      8
19.9       8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: totalcharges, Length: 6531, dtype: int64

In [6]:
#let's replace the space with the mean value of TotalCharges

mean_value = pd.to_numeric(churnData['totalcharges'], errors='coerce').mean()
churnData['totalcharges'] = churnData['totalcharges'].replace(' ', mean_value)

In [7]:
churnData['totalcharges'].value_counts() # so the space is converted to mean of the col

2283.3004408418656    11
20.2                  11
19.75                  9
20.05                  8
19.9                   8
                      ..
6849.4                 1
692.35                 1
130.15                 1
3211.9                 1
6844.5                 1
Name: totalcharges, Length: 6531, dtype: int64

In [8]:
churnData['totalcharges'] = pd.to_numeric(churnData['totalcharges'])
churnData.dtypes

gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
monthlycharges      float64
totalcharges        float64
churn                object
dtype: object

### Dummifying the target

In [9]:
churnData['churn'] = churnData['churn'].apply(lambda x: 1 if x == "Yes" else 0)

In [10]:
y= churnData[['churn']]
y.head()

Unnamed: 0,churn
0,0
1,0
2,1
3,0
4,1


### Encode the Categorical Variables

In [11]:
#Numerical Variables

numericals = churnData.select_dtypes(np.number)
numericals

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn
0,0,1,29.85,29.85,0
1,0,34,56.95,1889.50,0
2,0,2,53.85,108.15,1
3,0,45,42.30,1840.75,0
4,0,2,70.70,151.65,1
...,...,...,...,...,...
7038,0,24,84.80,1990.50,0
7039,0,72,103.20,7362.90,0
7040,0,11,29.60,346.45,0
7041,1,4,74.40,306.60,1


In [12]:
categoricals = churnData.select_dtypes(object)
categoricals

Unnamed: 0,gender,partner,dependents,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month
...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,One year
7039,Female,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,One year
7040,Female,Yes,Yes,No,Yes,No,No,No,No,No,Month-to-month
7041,Male,Yes,No,Yes,No,No,No,No,No,No,Month-to-month


In [13]:

cat_dumm = pd.get_dummies(categoricals).astype(int) 
cat_dumm

Unnamed: 0,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,phoneservice_No,phoneservice_Yes,onlinesecurity_No,onlinesecurity_No internet service,...,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,1,0,0,1,1,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
1,0,1,1,0,1,0,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
2,0,1,1,0,1,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0
3,0,1,1,0,1,0,1,0,0,0,...,1,1,0,0,1,0,0,0,1,0
4,1,0,1,0,1,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,0,1,0,1,0,1,0,0,...,1,0,0,1,0,0,1,0,1,0
7039,1,0,0,1,0,1,0,1,1,0,...,0,0,0,1,0,0,1,0,1,0
7040,1,0,0,1,0,1,1,0,0,0,...,0,1,0,0,1,0,0,1,0,0
7041,0,1,0,1,1,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0


In [14]:
X = numericals.join(cat_dumm)
X

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,...,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,0,1,29.85,29.85,0,1,0,0,1,1,...,0,1,0,0,1,0,0,1,0,0
1,0,34,56.95,1889.50,0,0,1,1,0,1,...,0,1,0,0,1,0,0,0,1,0
2,0,2,53.85,108.15,1,0,1,1,0,1,...,0,1,0,0,1,0,0,1,0,0
3,0,45,42.30,1840.75,0,0,1,1,0,1,...,1,1,0,0,1,0,0,0,1,0
4,0,2,70.70,151.65,1,1,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,0,0,1,0,1,0,...,1,0,0,1,0,0,1,0,1,0
7039,0,72,103.20,7362.90,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,1,0
7040,0,11,29.60,346.45,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
7041,1,4,74.40,306.60,1,0,1,0,1,1,...,0,1,0,0,1,0,0,1,0,0


In [15]:
#dropping the target (churn)

X.drop('churn', axis=1, inplace=True)
X

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,...,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,0,1,29.85,29.85,1,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
1,0,34,56.95,1889.50,0,1,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
2,0,2,53.85,108.15,0,1,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
3,0,45,42.30,1840.75,0,1,1,0,1,0,...,1,1,0,0,1,0,0,0,1,0
4,0,2,70.70,151.65,1,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,0,1,0,1,0,1,...,1,0,0,1,0,0,1,0,1,0
7039,0,72,103.20,7362.90,1,0,0,1,0,1,...,0,0,0,1,0,0,1,0,1,0
7040,0,11,29.60,346.45,1,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
7041,1,4,74.40,306.60,0,1,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0


In [16]:
y = churnData['churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: churn, Length: 7043, dtype: int64

### SMOTE

In [17]:
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X, y)

In [18]:
y_smote.value_counts()

0    5174
1    5174
Name: churn, dtype: int64

### Train-Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, random_state=1, test_size=0.3)

### Random Forest Model

In [20]:
rf =RandomForestClassifier(criterion='gini', max_depth=None, random_state=42)
rf.fit(X_train, y_train)


In [21]:
predictions = rf.predict(X_test)

In [22]:
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8492753623188406


In [25]:
report=classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.85      0.86      0.85      1582
           1       0.85      0.84      0.85      1523

    accuracy                           0.85      3105
   macro avg       0.85      0.85      0.85      3105
weighted avg       0.85      0.85      0.85      3105



Model performs very well, it can classify  85% of the observations are correctly. It is also able to classify both classes relatively well.