# Challenge 2 - Logistic Regression

# Before you start:

    Read the README.md file
    Comment as much as you can and use the resources (README.md file)
    Happy learning!

In [1]:
## Import the libraries for loading the data set 
## Load the dataset 
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols #Summary statistics

In [2]:
## You can use the warnings library to ignore warnings that might show when you run the code
import warnings
warnings.filterwarnings('ignore')

In [3]:
## Read the churn dataset from the CSV file located next to the notebook
data = pd.read_csv('Customer-Churn.csv')

In [4]:
## List the column labels of the loaded dataframe
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

### Data Pre-processing (Handling Numerical variables)

In [8]:
## Copy the three numerical columns into their own dataframe, named "numerical",
## so that later changes do not touch "data"
numerical = data.loc[:, ['MonthlyCharges', 'TotalCharges', 'tenure']].copy()

In [16]:
## Preview the first three rows of the numerical dataframe
numerical.head(3)

Unnamed: 0,MonthlyCharges,TotalCharges,tenure
0,29.85,29.85,1
1,56.95,1889.5,34
2,53.85,108.15,2


In [15]:
## Make every column of "numerical" numeric: entries that cannot be parsed become NaN.
## NOTE(review): assumes 'TotalCharges' may contain non-numeric text (e.g. blanks) - confirm against the CSV.
numerical = numerical.apply(pd.to_numeric, errors='coerce')

## Treat a missing total charge as 0.0. Assigning the result back avoids chained in-place edits.
numerical['TotalCharges'] = numerical['TotalCharges'].fillna(0.0)

## Sanity check: every column should report zero missing values
numerical.isnull().sum()

MonthlyCharges    0
TotalCharges      0
tenure            0
dtype: int64

### MinMax Scaler

Hint: Since the numerical variables are stored in "numerical", we can pass it directly,
as in MinMaxScaler().fit(numerical)

In [17]:
## Import the scaler class
from sklearn.preprocessing import MinMaxScaler

## Create the scaler and fit it on the numerical dataframe (the data itself is transformed in the next cell)
scaler = MinMaxScaler()
scaler.fit(numerical)

In [19]:
## Apply the fitted scaler to the numerical columns.
## Note: this rebinds "numerical" to whatever scaler.transform returns; it is turned back into
## a dataframe in the next cell so that it can be used later with the dataframe of categorical variables
numerical = scaler.transform(numerical)

In [20]:
## Wrap the scaled values in a dataframe that carries the original column names
numerical = pd.DataFrame(
    data=numerical,
    columns=['MonthlyCharges', 'TotalCharges', 'tenure'],
)

In [22]:
## Show the dtype of each column in the scaled dataframe
numerical.dtypes

MonthlyCharges    float64
TotalCharges      float64
tenure            float64
dtype: object

### Data Pre-processing (Handling Categorical variables)

In [23]:
## List the columns of "data" again to pick the categorical ones
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [24]:
## As with the numerical variables, copy the chosen categorical columns
## into a separate dataframe named "cats"
cats = data.loc[:, ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Contract']].copy()

In [25]:
## Preview the first three rows of "cats" to confirm it is a dataframe with the selected columns
cats.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,Contract
0,Female,0,Yes,No,Month-to-month
1,Male,0,No,No,One year
2,Male,0,No,No,Month-to-month


### Using One Hot Encoding 

In [26]:
# One-hot encode every column of "cats" and keep the encoded dataframe in "categorical".
# The column list is passed explicitly; presumably this is what gets the integer-coded
# SeniorCitizen column encoded alongside the text columns - verify against get_dummies' defaults.
categorical = pd.get_dummies(cats, columns=list(cats.columns))

In [27]:
## Check what the one-hot-encoded data looks like by previewing its first three rows with head()
categorical.head(3)

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,1,0,1,0,0,1,1,0,1,0,0
1,0,1,1,0,1,0,1,0,0,1,0
2,0,1,1,0,1,0,1,0,1,0,0


In [28]:
## Feature matrix: the scaled numerical columns side by side with the one-hot-encoded columns
X = pd.concat(objs=[numerical, categorical], axis='columns')

In [18]:
## Target: the 'Churn' column of the loaded dataframe
Y = data.loc[:, 'Churn']

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
## Fit a logistic regression classifier on the feature matrix X and the Churn labels Y.
## random_state is fixed so the fit is reproducible.
## `multi_class` is intentionally not passed: the argument is deprecated in recent scikit-learn
## releases and does not change the result for a two-class target such as Churn ('No'/'Yes').
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(X, Y)

In [33]:
## Make predictions on the dataset, store the results in "predictions".
## Note: X is the same data the model was fitted on, so these are in-sample predictions.
predictions = classification.predict(X)

In [34]:
## Report the model's score on the same data it was trained on.
## NOTE(review): scikit-learn documents a classifier's `score` as mean accuracy (not MSE, RMSE or R2) - confirm.
classification.score(X, Y)

0.7905721993468693

In [35]:
## Encode the 'No'/'Yes' labels as 0/1 with an explicit mapping so a squared error can be computed.
## With 0/1 labels each squared difference is 1 for a mismatch and 0 otherwise, so this MSE
## equals the fraction of misclassified rows.
label_codes = {'No': 0, 'Yes': 1}
mean_squared_error(Y.map(label_codes), pd.Series(predictions).map(label_codes))

0.20942780065313077