# Challenge 2 - Logistic Regression

# Before your start:

    Read the README.md file
    Comment as much as you can and use the resources (README.md file)
    Happy learning!

In [1]:
import pandas as pd
import numpy as np
churn = pd.read_csv('Customer-churn.csv')
## Import the libraries for loading the data  

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Data Pre-processing (Handling Numerical variables)

In [3]:
numerics = churn[['MonthlyCharges', 'TotalCharges', 'tenure']]
numerics.isna().sum()

MonthlyCharges    0
TotalCharges      0
tenure            0
dtype: int64

In [4]:
type(numerics['TotalCharges'][0])

str

In [5]:
#Total Charges column currently is a string, so we need to convert it into numerical category. 
numerics['TotalCharges'] =  pd.to_numeric(numerics['TotalCharges'],errors='coerce')
numerics.isna().sum()

MonthlyCharges     0
TotalCharges      11
tenure             0
dtype: int64

### MinMax Scaler

Hint: Since we are using "numerics" to store the nummerical variables we can pass "numerics" directly
as MinMaxScaler().fit(numerics)

In [6]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

numericals = MinMaxScaler().fit_transform(numerics[['MonthlyCharges', 'TotalCharges', 'tenure']])

In [7]:
## Convert "numerical" into a dataframe so that it can be used later with the dataframe of categorical variables

In [8]:
numerical = pd.DataFrame(numericals)
numerical.head()

Unnamed: 0,0,1,2
0,0.115423,0.001275,0.013889
1,0.385075,0.215867,0.472222
2,0.354229,0.01031,0.027778
3,0.239303,0.210241,0.625
4,0.521891,0.01533,0.027778


### Data Pre-processing (Handling Categorical variables)

In [9]:
cats = churn[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Contract']]
cats.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,Contract
0,Female,0,Yes,No,Month-to-month
1,Male,0,No,No,One year
2,Male,0,No,No,Month-to-month
3,Male,0,No,No,One year
4,Female,0,No,No,Month-to-month


In [10]:
type(cats['SeniorCitizen'][0])

numpy.int64

In [11]:
cats.SeniorCitizen = cats.SeniorCitizen.astype('str')

### Using One Hot Encoding 

In [12]:
categorical = pd.get_dummies(cats)
categorical.head()

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,1,0,1,0,0,1,1,0,1,0,0
1,0,1,1,0,1,0,1,0,0,1,0
2,0,1,1,0,1,0,1,0,1,0,0
3,0,1,1,0,1,0,1,0,0,1,0
4,1,0,1,0,1,0,1,0,1,0,0


In [14]:
X = pd.concat([numerical,categorical,churn['Churn']],axis=1)

In [15]:
type(X['Churn'][0])

str

In [16]:
X['Churn'] = np.where(X['Churn']=='Yes',1,0) #converting integers into binaries

In [17]:
X.dropna(inplace=True) #dropping the rows containing NaNs
X = X.reset_index()

In [18]:
Y = X['Churn'] #defining Y

In [19]:
X.drop('Churn',axis=1).head() 

Unnamed: 0,index,0,1,2,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,0,0.115423,0.001275,0.013889,1,0,1,0,0,1,1,0,1,0,0
1,1,0.385075,0.215867,0.472222,0,1,1,0,1,0,1,0,0,1,0
2,2,0.354229,0.01031,0.027778,0,1,1,0,1,0,1,0,1,0,0
3,3,0.239303,0.210241,0.625,0,1,1,0,1,0,1,0,0,1,0
4,4,0.521891,0.01533,0.027778,1,0,1,0,1,0,1,0,1,0,0


In [20]:
## Fit the linear regression model on the data
from sklearn import linear_model
lm = linear_model.LinearRegression()
model = lm.fit(X,Y)

In [21]:
predictions  = lm.predict(X)

In [22]:
## Print the measures of accuracy of the model - MSE, RMSE, and R2 score

In [23]:
from sklearn.metrics import mean_squared_error, r2_score
import math
print("MSE:", mean_squared_error(Y, predictions))
print("RMSE", math.sqrt(mean_squared_error(Y, predictions)))
print("R2:", r2_score(Y, predictions))

MSE: 1.3584814085936638e-31
RMSE 3.6857582782836747e-16
R2: 1.0
