In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns

### Load the dataset and explore the variables.

In [2]:
data = pd.read_csv('customer_churn.csv')

In [3]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
data['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [5]:
data.shape

(7043, 21)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.

In [8]:
data['tenure'].value_counts()

1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: tenure, Length: 73, dtype: int64

In [9]:
data['SeniorCitizen'].value_counts()

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

In [10]:
data['MonthlyCharges'].value_counts()

20.05     61
19.85     45
19.95     44
19.90     44
20.00     43
          ..
23.65      1
114.70     1
43.65      1
87.80      1
78.70      1
Name: MonthlyCharges, Length: 1585, dtype: int64

### Extract the target variable.

In [11]:
y = data['Churn']
y.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

### Extract the independent variables and scale them.

In [12]:
cols = ['tenure', 'SeniorCitizen', 'MonthlyCharges']
X = data[cols]
X.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.3
4,2,0,70.7


In [13]:
scaler = StandardScaler()
cols_std = ['tenure', 'MonthlyCharges']
X.loc[:, cols_std] = scaler.fit_transform(X.loc[:, cols_std])
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, cols_std] = scaler.fit_transform(X.loc[:, cols_std])


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-1.277445,0,-1.160323
1,0.066327,0,-0.259629
2,-1.236724,0,-0.36266
3,0.514251,0,-0.746535
4,-1.236724,0,0.197365


### Build the logistic regression model.

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [15]:
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)

In [16]:
predictions = classification.predict(X_test)

### Evaluate the model.

In [17]:
pd.Series(predictions).value_counts()

No     1700
Yes     413
dtype: int64

In [18]:
y_test.value_counts()

No     1547
Yes     566
Name: Churn, dtype: int64

In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[1391,  156],
       [ 309,  257]])

In [20]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, pos_label = 'Yes')
recall = recall_score(y_test, predictions, pos_label = 'Yes')
f1 = f1_score(y_test, predictions, pos_label = 'Yes')

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Accuracy: 0.7799337434926644
Precision: 0.6222760290556901
Recall: 0.4540636042402827
F1 score: 0.5250255362614913


### Even a simple model will give us more than 70% accuracy. Why?

It is possible to achieve high accuracy in cases where inherent patterns or structure ca be learned easily. Additionally, in imbalanced datasets, simply being able to predict the majority outcome may give us high accuracy, but that does not mean that the model is precise. We can see that while our accuracy is 0.78, our precision is significantly lower at 0.62. 

### Synthetic Minority Oversampling TEchnique (SMOTE) is an over sampling technique based on nearest neighbors that adds new points between existing points. Apply imblearn.over_sampling.SMOTE to the dataset. Build and evaluate the logistic regression model. Is it there any improvement?

In [21]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 100)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)



In [22]:
classification_resampled = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train_resampled, y_train_resampled)

In [23]:
predictions_resampled = classification_resampled.predict(X_test)

In [24]:
accuracy = accuracy_score(y_test, predictions_resampled)
precision = precision_score(y_test, predictions_resampled, pos_label = 'Yes')
recall = recall_score(y_test, predictions_resampled, pos_label = 'Yes')
f1 = f1_score(y_test, predictions_resampled, pos_label = 'Yes')

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Accuracy: 0.709891150023663
Precision: 0.4730195177956372
Recall: 0.7279151943462897
F1 score: 0.5734168406402227


We have seen an improvement in recall, which tells us the model's ability to predict correctly the number of positive instances (which were the minority variable).

The F1 score, which combines precision and recall, has improved slightly, telling us that the model is a bit better at both identifying positive instances and avoiding false positives. 

### Tomek links are pairs of very close instances, but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process. Apply imblearn.under_sampling.TomekLinks to the dataset. Build and evaluate the logistic regression model. Is it there any improvement?

In [27]:
from imblearn.under_sampling import TomekLinks
tomek_links = TomekLinks()
X_train_resampled1, y_train_resampled1 = tomek_links.fit_resample(X_train, y_train)

In [28]:
classification_resampled1 = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train_resampled1, y_train_resampled1)

In [29]:
predictions_resampled1 = classification_resampled1.predict(X_test)

In [30]:
accuracy = accuracy_score(y_test, predictions_resampled1)
precision = precision_score(y_test, predictions_resampled1, pos_label = 'Yes')
recall = recall_score(y_test, predictions_resampled1, pos_label = 'Yes')
f1 = f1_score(y_test, predictions_resampled1, pos_label = 'Yes')

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Accuracy: 0.7647893989588264
Precision: 0.5675146771037182
Recall: 0.5123674911660777
F1 score: 0.5385329619312906


Here we see results more in line with the original model, with higher precision but lower recall. Even though the accuracy is higher on the original model and on the model where we resampled using Tomek Links, in this case where what matters most is being able to adequately predict customer churn ('Yes'), I would take the model where we applied SMOTE, as it's the one with the highest recall, which means that it can better predict the number of positive instances. 