In [32]:
import pandas as pd
import numpy as np
 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

# 1. Load the dataset and explore the variables.

In [2]:
# Load the raw customer churn table; the path is relative to the working directory.
data_0 = pd.read_csv('customer_churn.csv')

In [3]:
# List every column name available in the raw table.
data_0.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [4]:
# Preview the first eleven columns (the frame is too wide to show in one display).
data_0[['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup']]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No
...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No


In [5]:
# Preview the remaining ten columns, including the target `Churn`.
data_0[['DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']]

Unnamed: 0,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...
7038,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


# 2. We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen, MonthlyCharges.

In [6]:
# Keep the target and the three predictors, renaming them to snake_case in one chain.
data_1 = (
    data_0[['Churn', 'tenure', 'SeniorCitizen', 'MonthlyCharges']]
    .rename(
        columns={
            'Churn': 'churn',
            'SeniorCitizen': 'senior_citizen',
            'MonthlyCharges': 'monthly_charges',
        }
    )
)
data_1

Unnamed: 0,churn,tenure,senior_citizen,monthly_charges
0,No,1,0,29.85
1,No,34,0,56.95
2,Yes,2,0,53.85
3,No,45,0,42.30
4,Yes,2,0,70.70
...,...,...,...,...
7038,No,24,0,84.80
7039,No,72,0,103.20
7040,No,11,0,29.60
7041,Yes,4,1,74.40


In [7]:
# Check which labels the target takes.
data_1['churn'].unique()

array(['No', 'Yes'], dtype=object)

In [8]:
# Inspect the distinct tenure values.
data_1['tenure'].unique()

array([ 1, 34,  2, 45,  8, 22, 10, 28, 62, 13, 16, 58, 49, 25, 69, 52, 71,
       21, 12, 30, 47, 72, 17, 27,  5, 46, 11, 70, 63, 43, 15, 60, 18, 66,
        9,  3, 31, 50, 64, 56,  7, 42, 35, 48, 29, 65, 38, 68, 32, 55, 37,
       36, 41,  6,  4, 33, 67, 23, 57, 61, 14, 20, 53, 40, 59, 24, 44, 19,
       54, 51, 26,  0, 39], dtype=int64)

In [9]:
# Inspect the distinct senior_citizen values.
data_1['senior_citizen'].unique()

array([0, 1], dtype=int64)

In [10]:
# Inspect the distinct monthly_charges values.
data_1['monthly_charges'].unique()

array([29.85, 56.95, 53.85, ..., 63.1 , 44.2 , 78.7 ])

In [11]:
# Confirm monthly_charges was parsed as a numeric dtype.
data_1['monthly_charges'].dtype

dtype('float64')

# 3. Split the Dataset into X ('tenure', 'SeniorCitizen', 'MonthlyCharges') and y ('Churn')

In [12]:
# Separate predictors from the target, then hold out 20% of the rows for testing.
X = data_1.drop(columns='churn')
y = data_1['churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# 4. Build the logistic regression model.

##### Treat categorical

In [13]:
# Create encoder
# drop='first' drops the first category, so the two-level target ('No'/'Yes')
# is encoded as a single indicator column.

encoder = OneHotEncoder(drop='first')

In [14]:
# Encode
# The target Series is reshaped into a single column because the encoder
# works on 2-D input; the encoder is fitted here on the training labels.

y_train_encoded = encoder.fit_transform(y_train.to_numpy().reshape(-1, 1))

In [15]:
# To array
# Convert the encoder's sparse output into a dense NumPy array.

y_train_array = y_train_encoded.toarray()

##### Treat numerical

In [16]:
# Define numerical
# Keep only the numeric columns of the training features.

X_train_num = X_train.select_dtypes(include = np.number)

In [17]:
# Fit transformer
# The scaler is fitted on the training data only; `treat` reuses it for other frames.

num_transformer = MinMaxScaler().fit(X_train_num)

In [18]:
# Normalize numerical
# Apply the fitted scaler; the result is a NumPy array without column labels.

X_train_num_transformed_array = num_transformer.transform(X_train_num)

In [19]:
# Convert transform array to dataframe
# Column names are restored from X_train_num; no index is passed, so the new
# frame gets a fresh 0..n-1 index rather than the original row labels.

X_train_treated = pd.DataFrame(X_train_num_transformed_array, columns = X_train_num.columns)
X_train_treated

Unnamed: 0,tenure,senior_citizen,monthly_charges
0,1.000000,0.0,0.665174
1,0.194444,1.0,0.279602
2,0.986111,0.0,0.064179
3,0.458333,0.0,0.553731
4,0.652778,0.0,0.800995
...,...,...,...
5629,0.208333,0.0,0.847761
5630,0.138889,0.0,0.724876
5631,0.805556,0.0,0.024876
5632,0.013889,1.0,0.512438


##### Create a function to apply the same treatment to other DataFrames

In [20]:
def treat(df, transformer=None):
    """Scale the numeric columns of ``df`` with an already-fitted transformer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to treat; non-numeric columns are dropped.
    transformer : object with a ``transform`` method, optional
        Fitted scaler to apply. Defaults to the notebook-level
        ``num_transformer`` so existing ``treat(df)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The transformed numeric columns under their original names, with a
        fresh 0..n-1 index (the same layout as ``X_train_treated``).
    """
    if transformer is None:
        # Looked up at call time so the default follows the fitted global scaler.
        transformer = num_transformer
    numeric = df.select_dtypes(include=np.number)
    scaled = transformer.transform(numeric)
    return pd.DataFrame(scaled, columns=numeric.columns)

##### Logistic Regression

In [21]:
# Fit on a 1-D target: passing the (n, 1) column array makes scikit-learn emit
# a DataConversionWarning. `multi_class` is omitted because 'auto' is already
# the default and passing it explicitly is deprecated in recent releases.
classification = LogisticRegression(random_state=0, solver='saga').fit(
    X_train_treated, y_train_array.ravel()
)

  y = column_or_1d(y, warn=True)


In [22]:
# Flatten the (n, 1) encoded target into a 1-D array for fitting.
y_train_reshaped = y_train_array.ravel()

In [23]:
# Baseline model trained on the scaled features and the flattened target.
# `multi_class` is omitted: 'auto' is already the default and passing it
# explicitly is deprecated in recent scikit-learn releases.
classification = LogisticRegression(random_state=0, solver='saga').fit(
    X_train_treated, y_train_reshaped
)

# 5. Evaluate the model.

In [24]:
# Scale the test features with the scaler that was fitted on the training data.
X_test_treated = treat(X_test)

In [25]:
# Predict the encoded churn class for every test row.
predictions = classification.predict(X_test_treated)
predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [26]:
# Reuse the encoder fitted on the training target: re-fitting on the test
# labels could produce a different category mapping than the one the model
# was trained with.
y_test_encoded = encoder.transform(y_test.to_numpy().reshape(-1, 1))
y_test_array = y_test_encoded.toarray()
y_test_array

array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [0.],
       [0.]])

In [27]:
# Mean accuracy of the baseline model on the held-out test set.
classification.score(X_test_treated, y_test_array)

0.7849538679914834

In [31]:
# Rows are the actual classes and columns the predicted ones (0 = "No", 1 = "Yes").
confusion_matrix(y_test_array, predictions)

array([[937, 104],
       [199, 169]], dtype=int64)

Of the 1041 actual "No" cases, 937 are correctly predicted as "No". However, only 169 of the 368 actual "Yes" cases are correctly predicted as "Yes".

# 6. Even a simple model will give us more than 70% accuracy. Why?

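Because the classes are imbalanced: there are far more "No" than "Yes" observations, so a model that always predicts "No" would already reach about 74% accuracy on the test set (1041 of 1409).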

In [30]:
# Class counts in the test target, to show the imbalance.
y_test.value_counts()

No     1041
Yes     368
Name: churn, dtype: int64

# 7. Apply imblearn.over_sampling.SMOTE to the dataset. 

Synthetic Minority Oversampling TEchnique (SMOTE) is an over sampling technique based on nearest neighbors that adds new points between existing points. 
Build and evaluate the logistic regression model. Is there any improvement?

In [33]:
# Oversample the minority class in the training data only; the test set is untouched.
sm = SMOTE(k_neighbors=3, random_state=100)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(
    X_train_treated,
    y_train_reshaped,
)

In [34]:
# Refit the same model on the SMOTE-balanced training data (this rebinds
# `classification`). `multi_class` is omitted: 'auto' is already the default
# and passing it explicitly is deprecated in recent scikit-learn releases.
classification = LogisticRegression(random_state=0, solver='saga').fit(
    X_train_SMOTE, y_train_SMOTE
)

In [35]:
# Predict the test set again, now with the model trained on the resampled data.
predictions = classification.predict(X_test_treated)
predictions

array([0., 0., 1., ..., 1., 0., 1.])

In [36]:
# Rows are the actual classes and columns the predicted ones (0 = "No", 1 = "Yes").
confusion_matrix(y_test_array, predictions)

array([[751, 290],
       [107, 261]], dtype=int64)

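Now 261 of the 368 actual "Yes" cases are correctly predicted (recall of about 71%, up from about 46%), while 751 of the 1041 actual "No" cases are correctly predicted (down from 937). Since we care more about correctly identifying the customers who churn ("Yes"), we can say the model has improved, at the cost of more false alarms.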