In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from pandas.plotting import scatter_matrix
import seaborn as sns
from IPython.display import set_matplotlib_formats, HTML
from matplotlib.dates import DateFormatter
import matplotlib_inline 
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from matplotlib import colors as mcolors
from pandas.plotting import register_matplotlib_converters
import plotly.express as px
%matplotlib inline
%config InlineBackend.figure_format = 'png'
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore') 

In [2]:
# Plot formatting: notebook-wide default styles
def set_sns_format(width=14, height=8):
    """Apply the notebook's default plotting style.

    Sets the seaborn theme (pastel palette, 'notebook' context, savefig dpi
    of 300), switches the inline backend's figure format to 'retina', and
    sets matplotlib's default figure size.

    Parameters
    ----------
    width, height : number
        Stored as ``(width, height)`` in ``matplotlib.rcParams['figure.figsize']``.

    Returns
    -------
    None
    """
    theme_rc = {'savefig.dpi': 300}
    sns.set_theme(context='notebook', palette='pastel', rc=theme_rc)
    matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
    matplotlib.rcParams['figure.figsize'] = (width, height)


set_sns_format(width=14, height=8)

In [None]:
def add_value_labels(ax, typ, spacing=5):
    """Annotate every bar or every line point on ``ax`` with its y-value.

    Parameters
    ----------
    ax : object exposing ``patches``, ``lines`` and ``annotate``
        Typically a matplotlib Axes.
    typ : str
        ``'bar'``  -> one label per patch in ``ax.patches``, centred
        horizontally on the bar at the bar's height.
        ``'line'`` -> one label per (x, y) point of every line in ``ax.lines``.
        Any other value leaves ``ax`` untouched.
    spacing : number, default 5
        Distance, in points, between the annotated position and the label.

    Notes
    -----
    Labels are formatted with two decimals. Labels sit above the annotated
    position, except for bars with a negative height: those are labelled
    below the end of the bar so the text does not overlap the bar itself.
    """

    def _annotate(x_value, y_value, offset, va):
        # Single place that defines the look of every label.
        ax.annotate("{:.2f}".format(y_value), (x_value, y_value), xytext=(0, offset),
                    textcoords="offset points", ha='center', va=va, fontsize=10)

    if typ == 'bar':
        for bar in ax.patches:
            y_value = bar.get_height()
            x_value = bar.get_x() + bar.get_width() / 2
            if y_value < 0:
                # Negative bar: flip the offset and anchor the text by its top
                _annotate(x_value, y_value, -spacing, 'top')
            else:
                _annotate(x_value, y_value, spacing, 'bottom')
    elif typ == 'line':
        for line in ax.lines:
            for x_value, y_value in zip(line.get_xdata(), line.get_ydata()):
                _annotate(x_value, y_value, spacing, 'bottom')

## Load the dataset

In [3]:
import os

# The CSV location is machine-specific. Set the CUSTOMER_CHURN_CSV environment
# variable to point at your own copy of customer_churn.csv; when it is not set,
# the original author's local path is used as the fallback.
churn_csv_path = os.environ.get(
    "CUSTOMER_CHURN_CSV",
    r"C:\Users\ssai\OneDrive\Data_26-07\labs\lab-imbalanced-data\files_for_lab/customer_churn.csv",
)
df = pd.read_csv(churn_csv_path)
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


## Target Variable

In [4]:
# Features (X): three numeric columns; target (y): the churn label
feature_columns = ["tenure", "SeniorCitizen", "MonthlyCharges"]
X = df[feature_columns]
y = df["Churn"]

In [5]:
# Split into train/test sets, then standardise the features
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# test_size=0.25 is train_test_split's default (75% train / 25% test), written out explicitly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Fit the scaler on the training split only, then apply that same scaling to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression Model

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# `multi_class` is intentionally not passed: the target has only two classes
# (Yes / No), for which the estimator's default already fits a single binary
# model -- the same thing multi_class="ovr" asked for. The parameter is
# deprecated in recent scikit-learn releases and scheduled for removal, and the
# notebook-wide warnings filter would hide that deprecation warning.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## Model Results

In [7]:
# Baseline predictions on both splits
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

# First report: training split; second report: test split
print(classification_report(y_true=y_train, y_pred=pred_train))
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

          No       0.82      0.91      0.87      3876
         Yes       0.65      0.47      0.54      1406

    accuracy                           0.79      5282
   macro avg       0.74      0.69      0.71      5282
weighted avg       0.78      0.79      0.78      5282

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1298
         Yes       0.61      0.46      0.52       463

    accuracy                           0.78      1761
   macro avg       0.72      0.68      0.69      1761
weighted avg       0.77      0.78      0.77      1761



In [8]:
# Class counts of the target over the full dataset (y is taken from df before the train/test split)
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

### The target classes are imbalanced (about 73% "No" vs. 27% "Yes")

## SMOTE

In [9]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is not resampled
sm = SMOTE(random_state=42, k_neighbors=3)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

# Class proportions after resampling
y_train_SMOTE.value_counts(normalize=True)

No     0.5
Yes    0.5
Name: Churn, dtype: float64

In [10]:
# Training-set shape before and after SMOTE (the extra rows are synthetic samples)
print(X_train.shape, X_train_SMOTE.shape, sep="\n")

(5282, 3)
(7752, 3)


In [11]:
# Re-fit the same `model` object, this time on the SMOTE-resampled training data
model.fit(X_train_SMOTE, y_train_SMOTE)

pred_train_SMOTE = model.predict(X_train_SMOTE)
pred_test_SMOTE = model.predict(X_test)

# First report: resampled training data; second report: the original test split
print(classification_report(y_true=y_train_SMOTE, y_pred=pred_train_SMOTE))
print(classification_report(y_true=y_test, y_pred=pred_test_SMOTE))

              precision    recall  f1-score   support

          No       0.75      0.74      0.74      3876
         Yes       0.74      0.75      0.75      3876

    accuracy                           0.74      7752
   macro avg       0.74      0.74      0.74      7752
weighted avg       0.74      0.74      0.74      7752

              precision    recall  f1-score   support

          No       0.88      0.72      0.79      1298
         Yes       0.48      0.71      0.57       463

    accuracy                           0.72      1761
   macro avg       0.68      0.72      0.68      1761
weighted avg       0.77      0.72      0.74      1761



## Tomek Links

In [12]:
# Tomek links: under-sample the training split
from imblearn.under_sampling import TomekLinks

# sampling_strategy='majority' resamples only the majority class;
# try other sampling_strategy values to compare
tl = TomekLinks(sampling_strategy='majority')
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [13]:
# Re-fit the same `model` object, this time on the Tomek-links-resampled training data
model.fit(X_train_tl, y_train_tl)

pred_train_tl = model.predict(X_train_tl)
pred_test_tl = model.predict(X_test)

# Note the order in this cell: test-split report first, then the resampled training data
print(classification_report(y_true=y_test, y_pred=pred_test_tl))
print(classification_report(y_true=y_train_tl, y_pred=pred_train_tl))

              precision    recall  f1-score   support

          No       0.83      0.85      0.84      1298
         Yes       0.56      0.52      0.54       463

    accuracy                           0.77      1761
   macro avg       0.70      0.69      0.69      1761
weighted avg       0.76      0.77      0.76      1761

              precision    recall  f1-score   support

          No       0.83      0.90      0.86      3516
         Yes       0.68      0.53      0.60      1406

    accuracy                           0.79      4922
   macro avg       0.75      0.72      0.73      4922
weighted avg       0.79      0.79      0.79      4922

