### Import the required libraries and modules that you would need.

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

### Read that data into Python and call the dataframe `churnData`.

In [2]:
churnData = pd.read_csv('Customer-Churn.csv')
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


### Check the datatypes of all the columns in the data.

In [3]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

### The column `TotalCharges` is object type. Convert this column into numeric type using `pd.to_numeric` function.

In [4]:
churnData['TotalCharges'].value_counts()

20.2       11
           11
19.75       9
20.05       8
19.9        8
           ..
4719.75     1
46.2        1
5264.3      1
1215.6      1
1311.75     1
Name: TotalCharges, Length: 6531, dtype: int64

In [5]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors = 'coerce')

In [6]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7032 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

### Check for null values in the dataframe. Replace the null values.

In [7]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [8]:
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.ceil(np.mean(churnData['TotalCharges'])))

In [9]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Use the following features: `tenure`, `SeniorCitizen`, `MonthlyCharges` and `TotalCharges`:  

- **Scale the features either by using normalizer or a standard scaler.**

In [10]:
churnData['Churn'].value_counts(dropna = False)

No     5174
Yes    1869
Name: Churn, dtype: int64

In [11]:
def boolean_indicating(x):
    if x == 'Yes':
        return 1
    else:
        return 0
churnData['Churn']= churnData['Churn'].apply(boolean_indicating)

churnData['Churn'].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [12]:
features = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]

In [13]:
features.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,1,0,29.85,29.85
1,34,0,56.95,1889.5
2,2,0,53.85,108.15
3,45,0,42.3,1840.75
4,2,0,70.7,151.65


In [14]:
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler().fit(features)
X_train_scaled = pd.DataFrame(transformer.transform(features),columns=features.columns)

# X_test_scaled = pd.DataFrame(transformer.transform(X_test),columns=X.columns)
X_train_scaled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,-1.277445,-0.439916,-1.160323,-0.994972
1,0.066327,-0.439916,-0.259629,-0.173876
2,-1.236724,-0.439916,-0.36266,-0.9604
3,0.514251,-0.439916,-0.746535,-0.195401
4,-1.236724,-0.439916,0.197365,-0.941193


In [15]:
X = X_train_scaled
y = churnData['Churn']

- **Split the data into a training set and a test set.**

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

- **Fit a logistic regression model on the training data.**

In [17]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(random_state=42, solver='lbfgs')
LR.fit(X_train, y_train)

LogisticRegression(random_state=42)

- **Check the accuracy on the test data.**

In [18]:
LR.score(X_test, y_test)

0.8076650106458482

In [21]:
pred = LR.predict(X_test)

In [22]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[959,  77],
       [194, 179]], dtype=int64)

In [23]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.69921875
recall:  0.47989276139410186
f1:  0.5691573926868044


## Managing imbalance in the dataset   
### Check for the imbalance.

In [24]:
churnData['Churn'].value_counts(dropna = False)

0    5174
1    1869
Name: Churn, dtype: int64

### Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.

In [25]:
data = pd.concat([X, y], axis = 1)
data

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
0,-1.277445,-0.439916,-1.160323,-0.994972,0
1,0.066327,-0.439916,-0.259629,-0.173876,0
2,-1.236724,-0.439916,-0.362660,-0.960400,1
3,0.514251,-0.439916,-0.746535,-0.195401,0
4,-1.236724,-0.439916,0.197365,-0.941193,1
...,...,...,...,...,...
7038,-0.340876,-0.439916,0.665992,-0.129281,0
7039,1.613701,-0.439916,1.277533,2.242807,0
7040,-0.870241,-0.439916,-1.168632,-0.855183,0
7041,-1.155283,2.273159,0.320338,-0.872778,1


In [27]:
from sklearn.utils import resample

category_0 = data[data['Churn'] == 0]
category_1 = data[data['Churn'] == 1]

### Upsampling (oversampling)

In [28]:
category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

In [29]:
print(category_0.shape)
print(category_1_oversampled.shape)

(5174, 5)
(5174, 5)


In [30]:
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

In [31]:
data_upsampled['Churn'].value_counts()

1    5174
0    5174
Name: Churn, dtype: int64

In [32]:
data_upsampled

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
0,-1.277445,-0.439916,-1.160323,-0.994972,0
1,0.066327,-0.439916,-0.259629,-0.173876,0
3,0.514251,-0.439916,-0.746535,-0.195401,0
6,-0.422317,-0.439916,0.808907,-0.147428,0
7,-0.910961,-0.439916,-1.163647,-0.874853,0
...,...,...,...,...,...
6858,-0.300156,-0.439916,0.808907,0.037574,1
4734,0.269929,-0.439916,0.506459,0.397224,1
2008,-1.155283,-0.439916,0.742435,-0.857390,1
223,-1.155283,-0.439916,-0.488957,-0.928963,1


In [33]:
y_upsampled = data_upsampled['Churn']
X_upsampled = data_upsampled.drop('Churn',axis = 1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

In [36]:
LR = LogisticRegression(random_state=42, solver='lbfgs')
LR.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [37]:
LR.score(X_test, y_test)

0.7371980676328502

In [38]:
pred2 = LR.predict(X_test)

In [39]:
confusion_matrix(y_test,pred2)

array([[757, 287],
       [257, 769]], dtype=int64)

In [40]:
print("precision: ",precision_score(y_test,pred2))
print("recall: ",recall_score(y_test,pred2))
print("f1: ",f1_score(y_test,pred2))

precision:  0.728219696969697
recall:  0.7495126705653021
f1:  0.7387127761767531


### Downsampling (undersampling)

In [41]:
category_0_undersampled = resample(category_0, 
                                   replace=False, 
                                   n_samples = len(category_1))

In [42]:
print(category_0_undersampled.shape)
print(category_1.shape)

(1869, 5)
(1869, 5)


In [43]:
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

In [44]:
data_downsampled['Churn'].value_counts()

1    1869
0    1869
Name: Churn, dtype: int64

In [45]:
y_downsampled = data_downsampled['Churn']
X_downsampled = data_downsampled.drop('Churn',axis = 1)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_downsampled, y_downsampled, test_size=0.2, random_state=42)

In [47]:
LR = LogisticRegression(random_state=42, solver='lbfgs')
LR.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [48]:
LR.score(X_test, y_test)

0.7312834224598931

In [49]:
pred3 = LR.predict(X_test)

In [50]:
confusion_matrix(y_test,pred3)

array([[272, 107],
       [ 94, 275]], dtype=int64)

In [51]:
print("precision: ",precision_score(y_test,pred3))
print("recall: ",recall_score(y_test,pred3))
print("f1: ",f1_score(y_test,pred3))

precision:  0.7198952879581152
recall:  0.7452574525745257
f1:  0.7323568575233023
