# Challenge 2 - Logistic Regression

# Before you start:

    Read the README.md file
    Comment as much as you can and use the resources (README.md file)
    Happy learning!

In [1]:
import pandas as pd
import numpy as np

## Load the dataset from the CSV file into a dataframe and display its first rows
df = pd.read_csv('Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
## You can use the warnings library to ignore warnings that might show when you run the code
import warnings
warnings.filterwarnings('ignore')

### Data Pre-processing (Handling Numerical variables)

In [3]:
numeric_columns = ['MonthlyCharges', 'TotalCharges', 'tenure']

## Store the specified numerical columns data as a separate dataframe. Give it the name "numerics"
## .copy() makes "numerics" an independent dataframe, so the assignment below
## does not write into a slice of df (which triggers SettingWithCopyWarning)
numerics = df[numeric_columns].copy()

# Convert TotalCharges to floats.
# pd.to_numeric parses values with or without a decimal point ("29.85", "100")
# and also works when the column has already been read as a numeric dtype,
# so no string patching of the values is needed. Unparsable values still raise.
numerics['TotalCharges'] = pd.to_numeric(numerics['TotalCharges']).astype('float')
numerics.head()

Unnamed: 0,MonthlyCharges,TotalCharges,tenure
0,29.85,29.85,1
1,56.95,1889.5,34
2,53.85,108.15,2
3,42.3,1840.75,45
4,70.7,151.65,2


### MinMax Scaler

Hint: Since we are using "numerics" to store the numerical variables, we can pass "numerics" directly
as MinMaxScaler().fit(numerics)

In [4]:
from sklearn.preprocessing import MinMaxScaler

## Perform the scaling and store the results inside "numerical":
## first fit the scaler on the numeric columns, then transform those same columns with it
transformer = MinMaxScaler()
transformer.fit(numerics[numeric_columns])
numerical = transformer.transform(numerics[numeric_columns])

In [5]:
## Convert "numerical" into a dataframe so that it can be used later with the dataframe of categorical variables.
## The row labels of "numerics" are reused: pd.concat(axis=1) matches rows by index label, so the scaled
## values must carry the same labels as the categorical dataframe that is also built from df.
numerical = pd.DataFrame(numerical, columns=numeric_columns, index=numerics.index)
numerical.head()

Unnamed: 0,MonthlyCharges,TotalCharges,tenure
0,0.115423,0.003437,0.013889
1,0.385075,0.217564,0.472222
2,0.354229,0.012453,0.027778
3,0.239303,0.211951,0.625
4,0.521891,0.017462,0.027778


### Data Pre-processing (Handling Categorical variables)

In [6]:
## Same approach as for the numerical variables: select the categorical columns of df
## and keep them in their own dataframe named "cats"
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Contract']
cats = df.loc[:, categorical_columns]

In [7]:
## Check if "cats" is actually a dataframe using cats.head(3), which displays its first three rows
cats.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,Contract
0,Female,0,Yes,No,Month-to-month
1,Male,0,No,No,One year
2,Male,0,No,No,Month-to-month


### Using One Hot Encoding 

In [8]:
# Perform One hot encoding and store the results (one hot encoded dataframe) into "categorical".
# Every column listed in categorical_columns is replaced by one 0/1 indicator column per category value.
# dtype is given explicitly so the indicator columns are uint8 (as in the X.dtypes output further down)
# regardless of the default dtype of the installed pandas version.
categorical = pd.get_dummies(cats, columns=categorical_columns, dtype='uint8')

In [9]:
## Check what the new one hot encoded (OHE) data looks like using the head() function
categorical.head()

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,1,0,1,0,0,1,1,0,1,0,0
1,0,1,1,0,1,0,1,0,0,1,0
2,0,1,1,0,1,0,1,0,1,0,0
3,0,1,1,0,1,0,1,0,0,1,0
4,1,0,1,0,1,0,1,0,1,0,0


In [10]:
## Place the scaled numeric features and the one hot encoded features side by side
## to form the feature matrix, then list the dtype of every feature column
X = pd.concat([numerical, categorical], axis='columns')
X.dtypes

MonthlyCharges             float64
TotalCharges               float64
tenure                     float64
gender_Female                uint8
gender_Male                  uint8
SeniorCitizen_0              uint8
SeniorCitizen_1              uint8
Partner_No                   uint8
Partner_Yes                  uint8
Dependents_No                uint8
Dependents_Yes               uint8
Contract_Month-to-month      uint8
Contract_One year            uint8
Contract_Two year            uint8
dtype: object

In [11]:
## Binary target array: 1 where Churn is "Yes", 0 otherwise
Y = np.where(df['Churn'].eq("Yes"), 1, 0)

In [12]:
from sklearn import linear_model

## Fit the linear regression model on the data (features X, 0/1 target Y)
## NOTE(review): the notebook title says "Logistic Regression" but this cell fits LinearRegression
## on a binary target - confirm this is the intended first step of the exercise
lm = linear_model.LinearRegression()
model = lm.fit(X, y=Y)

In [13]:
## Make predictions on the dataset, store the results in "predictions"
## Note: X is the same data the model was fitted on, so these are in-sample predictions
predictions = lm.predict(X)

In [14]:
from sklearn.metrics import mean_squared_error, r2_score

## Print the measures of accuracy of the model - MSE, RMSE, and R2 score
mse = mean_squared_error(Y, predictions)
rmse = mse ** 0.5  # RMSE is the square root of the MSE
r2 = r2_score(Y, predictions)

# Print all three with labels so none of them depends on being the last expression of the cell
print("MSE: ", mse)
print("RMSE:", rmse)
print("R2:  ", r2)

0.1460225450835557


0.25096939228384685