In [1]:
import pandas as pd
import numpy as np

In [2]:
churnData = pd.read_csv("Customer-Churn.csv")
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [4]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
#kk = churnData['TotalCharges'].iloc(488)

In [6]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [7]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

### Replace null values in TotalCharges column with the mean

In [8]:
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.mean(churnData['TotalCharges']))
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [9]:
#churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')
features = churnData[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]
features.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,1,0,29.85,29.85
1,34,0,56.95,1889.5
2,2,0,53.85,108.15
3,45,0,42.3,1840.75
4,2,0,70.7,151.65


In [10]:
features.dtypes

tenure              int64
SeniorCitizen       int64
MonthlyCharges    float64
TotalCharges      float64
dtype: object

In [11]:
from sklearn.preprocessing import MinMaxScaler
kimera = MinMaxScaler().fit(features)
x_normalized = kimera.transform(features)
print(x_normalized.shape)
x_normalized = pd.DataFrame(x_normalized,columns=features.columns)
x_normalized.head()

(7043, 4)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,0.013889,0.0,0.115423,0.001275
1,0.472222,0.0,0.385075,0.215867
2,0.027778,0.0,0.354229,0.01031
3,0.625,0.0,0.239303,0.210241
4,0.027778,0.0,0.521891,0.01533


In [12]:
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [13]:
churnData['Churn'] = churnData['Churn'].map({'Yes':1, 'No':0})

In [14]:
churnData[['Churn']].head()

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,1


In [15]:
x_normalized.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
TotalCharges      0
dtype: int64

In [16]:
from sklearn.model_selection import train_test_split
#SET NEW FEATURES TO THE NEWLY CREATED DATA FRAME
X = x_normalized
y = churnData['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5634, 4)
(1409, 4)
(5634,)
(1409,)


In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
X_train.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
TotalCharges      0
dtype: int64

In [20]:
y_train.isna().sum()

0

In [21]:

# classification = LogisticRegression(random_state=0, solver='saga',
#                   multi_class='multinomial').fit(X_train, y_train)

In [22]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='multinomial').fit(X_train, y_train)

In [23]:
predictions = classification.predict(X_test)
classification.score(X_test, y_test)

0.8062455642299503

In [24]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[959,  77],
       [196, 177]], dtype=int64)

## Data Imbalance

In [25]:
features_df = pd.concat([features, churnData['Churn']], axis = 1)
features_df.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
0,1,0,29.85,29.85,0
1,34,0,56.95,1889.5,0
2,2,0,53.85,108.15,1
3,45,0,42.3,1840.75,0
4,2,0,70.7,151.65,1


In [26]:
charn_cat = features_df[features_df['Churn'] == 1]
charn_cat.shape

(1869, 5)

In [27]:
no_charn_cat = features_df[features_df['Churn'] == 0]
no_charn_cat.shape

(5174, 5)

### Over Sampling

In [28]:
from sklearn.utils import resample
charn_cat_oversampled = resample(charn_cat, replace=True,  n_samples = len(no_charn_cat))
charn_cat_oversampled.shape                                  

(5174, 5)

In [29]:
charn_cat_final = pd.concat([charn_cat_oversampled,no_charn_cat], axis = 0)
print(charn_cat_final.shape)
charn_cat_final.head()

(10348, 5)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
2367,1,0,85.55,85.55,1
4444,3,0,19.85,63.75,1
2010,5,0,75.9,357.75,1
6858,25,0,89.1,2368.4,1
5333,13,0,88.35,1222.8,1


In [30]:
y_charn = charn_cat_final['Churn']
X_charn = charn_cat_final.drop('Churn', axis =1)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_charn, y_charn, test_size=0.2, random_state=1000)
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='multinomial').fit(X_train, y_train)

In [32]:
#predictions = classification.fit(X_test,y_test)
classification.score(X_test, y_test)

0.7342995169082126

## UderSampling

In [33]:
no_charn_cat_undersampled = resample(no_charn_cat, replace=False,  n_samples = len(charn_cat))
no_charn_cat_undersampled.shape  

(1869, 5)

In [34]:
no_charn_cat_final = pd.concat([no_charn_cat_undersampled,charn_cat], axis = 0)
print(no_charn_cat_final.shape)
no_charn_cat_final.head()

(3738, 5)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
3056,50,0,70.5,3486.65,0
6707,72,1,25.7,1937.4,0
5613,72,0,70.45,5165.7,0
32,27,0,66.15,1874.45,0
2050,71,0,47.6,3377.8,0


In [35]:
y_no_charn = no_charn_cat_final['Churn']
X_no_charn = no_charn_cat_final.drop('Churn', axis =1)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_no_charn, y_no_charn, test_size=0.2, random_state=1000)
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='multinomial').fit(X_train, y_train)

In [37]:
predictions = classification.fit(X_test,y_test)
classification.score(X_test, y_test)

0.732620320855615

## Model Accuracy goes down with both Undersampling and Oversampling