## Imports - Libraries & Data


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.svm import SVC

In [3]:
# Read the Telco customer-churn CSV from its raw GitHub URL into a DataFrame
DATA_URL = 'https://raw.githubusercontent.com/lijjumathew/MSDS-Machine-Learning-1-Project/master/dataset/Telco-Customer-Churn.csv'
df = pd.read_csv(DATA_URL)

In [5]:
# Print the column / dtype / non-null summary first: df.info() writes its report
# to stdout, so it is shown no matter where it sits in the cell.
df.info()

# Keep describe() as the final expression of the cell. A bare expression is only
# rendered when it is the last statement, so calling describe() before df.info()
# computed the summary statistics and then silently threw them away.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Clean Up Data and Create Dummy Variables

In [6]:
# SeniorCitizen is a 1/0 factor; express it as Yes/No so that it is encoded
# together with the other Yes/No columns further down.
df['SeniorCitizen'] = np.where(df['SeniorCitizen'] == 1, 'Yes', 'No')

# customerID is an identifier, not a feature. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
df = df.drop(columns=['customerID'], errors='ignore')

# Convert TotalCharges from object to numeric.
# errors='coerce' means: if a value cannot be parsed, set it to NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Replace blank (empty or whitespace-only) strings anywhere in the frame with NaN.
df = df.replace(r'^\s*$', np.nan, regex=True)

# TotalCharges has some missing values; impute them with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not update
# df when pandas Copy-on-Write is active.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())
assert df['TotalCharges'].notna().all(), 'TotalCharges still contains missing values'

# Consolidate the MultipleLines attribute: 'No phone service' counts as 'No'.
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')

# Change all values of 'No internet service' to 'No'.
df = df.replace('No internet service', 'No')

# Replace all Yes/No values with 1/0.
# infer_objects() converts the columns that now hold only integers to a numeric
# dtype explicitly. pandas >= 2.2 deprecates the silent downcast replace() used
# to perform; without it those columns would stay `object` and get_dummies()
# below would one-hot encode them (Churn included) instead of leaving them 1/0.
# Columns that also hold other strings (e.g. InternetService) stay `object`
# and are one-hot encoded below, exactly as before.
df = df.replace(to_replace=['Yes', 'No'], value=[1, 0]).infer_objects()

# Create dummy variables for every remaining categorical column.
df = pd.get_dummies(df)

# The modelling cells select df['Churn'] directly, so fail here with a clear
# message if the target was one-hot encoded rather than converted to 1/0.
assert 'Churn' in df.columns, "'Churn' is missing after encoding; expected a single 1/0 column"

## Logistic Regression

In [8]:
# Split the frame into the target vector (Churn) and the feature matrix
y = df['Churn'].values
x = df.drop('Churn', axis=1)

# Keep the feature names; the scaler returns a bare array without them
features = x.columns.values

# Rescale every feature to the [0, 1] range and rebuild the DataFrame with the
# original column names
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# follows, so test rows influence the scaling - consider fitting on the training
# split only.
scale = MinMaxScaler(feature_range=(0, 1)).fit(x)
x = pd.DataFrame(scale.transform(x), columns=features)

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=99,
)

# Fit a logistic regression with default settings on the training rows, then
# predict labels for the held-out rows. model.fit() returns the estimator itself,
# so `fit` is the same object as `model`.
model = LogisticRegression()
fit = model.fit(x_train, y_train)
predict = model.predict(x_test)

# Report overall accuracy followed by the per-class precision / recall / f1 table
logreg_accuracy = metrics.accuracy_score(y_test, predict)
print("Our accuracy score of the model using logistic regression is: ", logreg_accuracy)
print(classification_report(y_test, predict))

Our accuracy score of the model using logistic regression is:  0.7998580553584103
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1023
           1       0.67      0.54      0.60       386

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409



In [10]:
# Pair each coefficient in the first row of model.coef_ with its feature name,
# then list them from the most positive weight to the most negative
coefficients = model.coef_[0]
weights = pd.Series(data=coefficients, index=x.columns.values)
weights_descending = weights.sort_values(ascending=False)
print(weights_descending)

TotalCharges                               0.987159
InternetService_Fiber optic                0.822797
Contract_Month-to-month                    0.698867
PaperlessBilling                           0.371334
PaymentMethod_Electronic check             0.292634
MultipleLines                              0.240501
SeniorCitizen                              0.233807
StreamingTV                                0.181166
StreamingMovies                            0.180732
gender_Female                              0.016114
InternetService_DSL                        0.014568
Contract_One year                          0.007922
MonthlyCharges                             0.000813
gender_Male                               -0.015305
PaymentMethod_Mailed check                -0.020220
DeviceProtection                          -0.023484
Partner                                   -0.056451
PaymentMethod_Credit card (automatic)     -0.118700
Dependents                                -0.131897
OnlineBackup