In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from pandas.plotting import scatter_matrix
import seaborn as sns
from IPython.display import set_matplotlib_formats, HTML
from matplotlib.dates import DateFormatter
import matplotlib_inline 
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from matplotlib import colors as mcolors
from pandas.plotting import register_matplotlib_converters
import plotly.express as px
%matplotlib inline
%config InlineBackend.figure_format = 'png'
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore') 

In [21]:
# Plot formatting
# Notebook-wide default styles
def set_sns_format(width=14, height=8):
    """Apply the notebook's default seaborn/matplotlib look.

    Sets the seaborn theme (pastel palette, 'notebook' context, 300 dpi for
    saved figures), switches inline figures to the 'retina' format and makes
    ``(width, height)`` the default figure size.

    Parameters
    ----------
    width, height : numeric
        Default figure size stored in ``figure.figsize``.

    Returns
    -------
    None
    """
    theme_rc = {'savefig.dpi': 300}
    sns.set_theme(context='notebook', palette='pastel', rc=theme_rc)
    matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
    matplotlib.rcParams.update({'figure.figsize': (width, height)})


# Apply the defaults (14 x 8) once for the whole notebook.
set_sns_format()

In [20]:
def add_value_labels(ax, typ, spacing=5):
    """Annotate the bars or line points of ``ax`` with their rounded y-value.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to annotate. Only ``ax.patches``, ``ax.lines`` and
        ``ax.annotate`` are used.
    typ : str
        ``'bar'`` labels every patch in ``ax.patches`` at the horizontal
        centre of the bar; ``'line'`` labels every (x, y) point of every line
        in ``ax.lines``. Any other value leaves the axes untouched.
    spacing : int, default 5
        Distance, in points, between the data point and its label.

    Returns
    -------
    None

    Notes
    -----
    * Labels of negative bars are placed below the bar end (``va='top'``,
      negative offset) so they do not overlap the bar itself.
    * Non-finite values (NaN / inf) are skipped instead of being labelled
      "nan" / "inf".
    """
    import math  # local import keeps this helper cell self-contained

    def _label(x_value, y_value, flip_if_negative):
        # A missing value has neither a meaningful label nor a position.
        if not math.isfinite(y_value):
            return
        if flip_if_negative and y_value < 0:
            offset, va = -spacing, 'top'
        else:
            offset, va = spacing, 'bottom'
        ax.annotate("{:.0f}".format(y_value), (x_value, y_value),
                    xytext=(0, offset), textcoords="offset points",
                    ha='center', va=va, fontsize=10)

    if typ == 'bar':
        for patch in ax.patches:
            bar_centre = patch.get_x() + patch.get_width() / 2
            _label(bar_centre, patch.get_height(), flip_if_negative=True)
    elif typ == 'line':
        for line in ax.lines:
            for x_value, y_value in zip(line.get_xdata(), line.get_ydata()):
                # Line labels always sit above the point, as before.
                _label(x_value, y_value, flip_if_negative=False)

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

Instructions
In this lab, we will first take a look at the degree of imbalance in the data and then correct it using the techniques we learned in class.
Here is the list of steps to be followed (building a simple model without balancing the data):
* Import the required libraries and modules that you would need.
* Read that data into Python and call the dataframe churnData.
* Check the datatypes of all the columns in the data. You would see that the column TotalCharges is object type. Convert this column into numeric type using the pd.to_numeric function.
* Check for null values in the dataframe. Replace the null values.
* Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:
* Scale the features either by using normalizer or a standard scaler.
* Split the data into a training set and a test set.
* Fit a logistic regression model on the training data.
* Check the accuracy on the test data.

In [2]:
# Load the customer-churn data through a path relative to the lab folder
# instead of an absolute, user-specific Windows path, so the notebook also
# runs on other machines.
# NOTE(review): assumes the notebook is executed from the lab's root folder
# (the one that contains `files_for_lab/`) — adjust DATA_PATH if it lives elsewhere.
DATA_PATH = "files_for_lab/Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the loaded frame (pandas elides the middle rows in the display).
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [5]:
# Inspect column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [8]:
# Relative frequency of each raw TotalCharges value, most common first;
# used to spot non-numeric placeholder values in the column.
total_charges_share = df["TotalCharges"].value_counts(normalize=True)
total_charges_share.sort_values(ascending=False)

           0.001562
20.2       0.001562
19.75      0.001278
20.05      0.001136
19.9       0.001136
             ...   
3886.45    0.000142
1224.05    0.000142
2310.2     0.000142
723.4      0.000142
6844.5     0.000142
Name: TotalCharges, Length: 6531, dtype: float64

In [9]:
# A single-space string marks a missing TotalCharges value; turn it into NaN
# so the column can be converted to a numeric dtype.
# One `.loc` call is used instead of chained indexing (`df[col][mask] = ...`):
# the chained form assigns through an intermediate object, raises
# SettingWithCopyWarning and does not modify `df` under pandas Copy-on-Write.
blank_total_charges = df["TotalCharges"] == " "
df.loc[blank_total_charges, "TotalCharges"] = np.nan

In [10]:
# Convert TotalCharges from strings to a numeric dtype.
df["TotalCharges"] = df["TotalCharges"].pipe(pd.to_numeric)

In [11]:
# Re-check dtypes and non-null counts after the numeric conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7032 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

In [13]:
# Number of missing values per column, largest count first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

TotalCharges        11
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
Churn                0
dtype: int64

In [14]:
# Impute the missing TotalCharges entries with the column mean.
total_charges_mean = df["TotalCharges"].mean()
df["TotalCharges"] = df["TotalCharges"].fillna(total_charges_mean)

In [15]:
# List the available columns before picking the model features and target.
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [16]:
# Numeric features requested by the lab instructions.
feature_columns = ["tenure", "SeniorCitizen", "MonthlyCharges", "TotalCharges"]
X = df[feature_columns]
# Select the target by name rather than by position (`iloc[:, -1]`), so the
# right column is used even if the column layout of `df` changes.
y = df["Churn"]

In [17]:
# Encode the target as integers: 'Yes' -> 1, anything else -> 0.
y = y.eq('Yes').astype('int64')

In [22]:
# Scaler used to standardise the selected features before modelling.
sc = StandardScaler()

In [23]:
# Fit the scaler on the features and replace X with the standardised array.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-set statistics leak into the training data.
# Consider splitting first, fitting on the training part only and reusing the
# fitted scaler to transform the test part.
X = sc.fit_transform(X)


In [24]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Shapes of the four partitions, to confirm features and labels line up.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((4930, 4), (4930,), (2113, 4), (2113,))

In [25]:
# Baseline classifier with scikit-learn's default settings.
lr = LogisticRegression()

In [26]:
# Fit the baseline model on the original (imbalanced) training data.
lr.fit(X_train, y_train)

In [27]:
# Mean accuracy of the baseline model on the training and test partitions.
for split_name, features, target in (("train", X_train, y_train), ("test", X_test, y_test)):
    print(f"{split_name}:", lr.score(features, target))

train: 0.79026369168357
test: 0.7936583057264552


### Managing imbalance in the dataset

In [28]:
# Class distribution of the encoded target (1 = churn, 0 = no churn).
y.value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

### Tomek Links

In [29]:
from imblearn.under_sampling import TomekLinks

In [35]:
# Under-sample the training data by removing Tomek links.
# Only the training split is resampled; X_test / y_test are left untouched.
tl = TomekLinks(sampling_strategy='auto')
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [36]:
# Class counts in the training split before resampling.
y_train.value_counts()

0    3635
1    1295
Name: Churn, dtype: int64

In [37]:
# Class counts in the training split after removing Tomek links.
y_train_tl.value_counts()

0    3264
1    1295
Name: Churn, dtype: int64

In [38]:
# Refit the same LogisticRegression instance on the Tomek-cleaned training data.
# NOTE(review): this overwrites the baseline fit held in `lr`; use a separate
# estimator object if the baseline model must stay available for comparison.
lr.fit(X_train_tl, y_train_tl)

In [39]:
# Mean accuracy after Tomek-link under-sampling: resampled training data vs. the untouched test set.
for split_name, features, target in (("train", X_train_tl, y_train_tl), ("test", X_test, y_test)):
    print(f"{split_name}:", lr.score(features, target))

train: 0.7903048914235578
test: 0.7841930903928065


### SMOTE

In [41]:
from imblearn.over_sampling import SMOTE

# Over-sample the training data with SMOTE (3 nearest neighbours, fixed seed).
sm = SMOTE(k_neighbors = 3, random_state = 42)

# Only the training split is resampled; the test split is left untouched.
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)
# Class counts after over-sampling.
y_train_SMOTE.value_counts()

0    3635
1    3635
Name: Churn, dtype: int64

In [42]:
# Refit the same LogisticRegression instance on the SMOTE-balanced training data.
# NOTE(review): this overwrites the previous fit held in `lr`.
lr.fit(X_train_SMOTE, y_train_SMOTE)

In [43]:
# Mean accuracy after SMOTE over-sampling: resampled training data vs. the untouched test set.
for split_name, features, target in (("train", X_train_SMOTE, y_train_SMOTE), ("test", X_test, y_test)):
    print(f"{split_name}:", lr.score(features, target))

train: 0.7277854195323247
test: 0.7397065783246569


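* Using resampling methods resulted in lower accuracy scores on the test data. Accuracy alone is a weak yardstick here, however: roughly 73% of the customers do not churn, so a model can score well while still missing most churners. Recall and precision for the churn class should be compared before concluding that resampling did not help.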