# 11 Iterative Modeling: Logistic Regression

## 11.1 Set Up & Data Initialization 

In [18]:
import pandas as pd 
import numpy as np
import statsmodels as sm
import sklearn.preprocessing as preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy import stats  
from sklearn.metrics import plot_confusion_matrix

import pickle

In [19]:
# Load the prepared DataFrame from a pickle file in the working directory.
# NOTE(review): unpickling executes arbitrary code — only load files you created yourself.
DF_PICKLE_PATH = "./df.pkl"
df = pd.read_pickle(DF_PICKLE_PATH)

## 11.2 Logistic Regression - 1 
In this model I will merge the "functional needs repair" class into "functional", so the target becomes binary: functional vs. non functional.

In [20]:
# Relabel "functional needs repair" as "functional" in the target column.
# regex=True makes the dict key a pattern, so it is substituted wherever it occurs
# inside a value rather than only on an exact whole-value match.
status_relabel = {'functional needs repair': 'functional'}
df['status_group'] = df['status_group'].replace(status_relabel, regex=True)

In [21]:
# Sanity check: count the distinct labels left in the target (2 means binary).
status_labels = df['status_group']
status_labels.nunique()

2

In [22]:
# Group the feature column names by type: numeric vs. categorical.
# NOTE(review): 'id', 'region_code' and 'district_code' are listed as numeric but
# look like identifiers/codes rather than quantities — confirm this is intended.
numerical = [
    'id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private',
    'region_code', 'district_code', 'population', 'construction_year',
]

# Each column name appears exactly once; 'extraction_type_group' was previously
# listed twice, which would yield a duplicated column when used as a selector.
categorical = [
    'date_recorded', 'funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region',
    'lga', 'ward', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit',
    'extraction_type', 'extraction_type_group', 'extraction_type_class',
    'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group',
    'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type',
    'waterpoint_type_group',
]

In [23]:
# Split the frame into the target series (y) and the predictor columns (features).
target_col = 'status_group'
features = df.drop(columns=target_col)
y = df[target_col]

In [24]:
# One-hot encode the predictors with pandas' default get_dummies settings,
# then preview the first rows of the encoded frame.
data = pd.get_dummies(data=features)
data.head(n=5)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999,...,0,0,0,0,0,1,0,0,0,0
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010,...,0,0,0,0,0,1,0,0,0,0
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009,...,0,0,0,0,0,1,0,0,0,0
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986,...,0,0,0,0,0,1,0,0,0,0
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [25]:
# Replace any remaining missing values with 0.
# DataFrame.fillna returns a NEW frame, so the result must be assigned back;
# a bare `data.fillna(0)` only displays the filled copy and leaves `data` unchanged.
data = data.fillna(0)
data

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999,...,0,0,0,0,0,1,0,0,0,0
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010,...,0,0,0,0,0,1,0,0,0,0
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009,...,0,0,0,0,0,1,0,0,0,0
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986,...,0,0,0,0,0,1,0,0,0,0
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,1210,37.169807,-3.253847,0,3,5,125,1999,...,0,0,0,0,0,1,0,0,0,0
59396,27263,4700.0,1212,35.249991,-9.070629,0,11,4,56,1996,...,0,0,0,0,0,1,0,0,0,0
59397,37057,0.0,0,34.017087,-8.750434,0,12,7,0,0,...,0,1,0,0,0,0,0,1,0,0
59398,31282,0.0,0,35.861315,-6.378573,0,1,4,0,0,...,0,1,0,0,0,0,0,1,0,0


In [26]:
# Hold out a test set using train_test_split's default split size; the fixed
# random_state keeps the split reproducible.
split_parts = train_test_split(data, y, random_state=43)
X_train, X_test, y_train, y_test = split_parts

In [27]:
# Fit the first logistic regression on the training split.
# C is the inverse regularisation strength, so C=1e12 makes the penalty negligible;
# no intercept term is fitted.
logreg_params = {'fit_intercept': False, 'C': 1e12, 'solver': 'liblinear'}
logreg = LogisticRegression(**logreg_params)
model_log = logreg.fit(X_train, y_train)
model_log

LogisticRegression(C=1000000000000.0, fit_intercept=False, solver='liblinear')

In [28]:
# Predicted labels for the training and test splits.
y_hat_train = logreg.predict(X_train)
y_hat_test = logreg.predict(X_test)

In [31]:
# Accuracy on the held-out test split, reported to two decimals.
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.78


In [33]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it to `confusion_matrix` would
# rebind the imported function to an ndarray, so any later call to
# confusion_matrix(...) would fail unless the function were imported again.
# Rows are the true labels and columns the predicted labels.
conf_matrix_test = confusion_matrix(y_test, y_hat_test)
print(conf_matrix_test)

[[8504  708]
 [2610 3028]]


In [35]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split.
report_test = classification_report(y_test, y_hat_test)
print(report_test)

                precision    recall  f1-score   support

    functional       0.77      0.92      0.84      9212
non functional       0.81      0.54      0.65      5638

      accuracy                           0.78     14850
     macro avg       0.79      0.73      0.74     14850
  weighted avg       0.78      0.78      0.76     14850

