# Logistic Regression by Example

In [69]:
#import relevant packages
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [73]:
#Set up the data
#y: binary class label (0 or 1) for each observation
y = np.array([0,0,0,0,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,
             0,0,0,0,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,
             0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,
              0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,
              0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,
              1,1,0,1,0,0,0,1,1,1,1,0,1,0,1,1,0,0,0,0,0,1,1,1,1,1,1,1,1])
#x: the single feature is the observation index 0, 1, ..., len(y)-1.
#The length is derived from y so x and y can never get out of sync.
#x should be 2D. reshape() will convert the 1D array to 2D
#-1: as many rows as needed. 1: one column
x = np.arange(len(y)).reshape(-1,1)

In [74]:
#create a model and train it in one step (.fit returns the fitted estimator)
#solver='lbfgs': the optimizer used here (the 'liblinear' solver is tried further below)
#C is left at its default of 1.0
#random_state=0 is passed so the run is repeatable
model = LogisticRegression(solver='lbfgs', random_state=0).fit(x,y)

In [75]:
#training (already done above by the chained .fit(x,y)) finds the coefficients
#b0, b1, b2, etc. of the linear function that is fed into the logistic (sigmoid) function
#model.fit(x,y)

In [76]:
#show all the distinct class labels the fitted logistic regression knows about
model.classes_

array([0, 1])

In [77]:
#b0 is the intercept (bias term) of the fitted model
model.intercept_

array([0.24262261])

In [78]:
#b1, b2, etc. coefficients, one per feature (only b1 here, since x has a single column)
model.coef_

array([[0.002275]])

In [79]:
#number of iterations the solver actually ran during fit
model.n_iter_

array([17])

In [80]:
#Evaluate the model. Returns a matrix of probabilities
#that the predicted output is either 0 or 1
model.predict_proba(x)
#in this matrix, each row is one observation
#first column is the probability that the output is 0
#second column is the probability that the output is 1

array([[0.43964015, 0.56035985],
       [0.43907977, 0.56092023],
       [0.43851954, 0.56148046],
       [0.43795947, 0.56204053],
       [0.43739956, 0.56260044],
       [0.4368398 , 0.5631602 ],
       [0.43628021, 0.56371979],
       [0.43572078, 0.56427922],
       [0.43516151, 0.56483849],
       [0.43460241, 0.56539759],
       [0.43404347, 0.56595653],
       [0.4334847 , 0.5665153 ],
       [0.4329261 , 0.5670739 ],
       [0.43236768, 0.56763232],
       [0.43180942, 0.56819058],
       [0.43125133, 0.56874867],
       [0.43069343, 0.56930657],
       [0.43013569, 0.56986431],
       [0.42957814, 0.57042186],
       [0.42902076, 0.57097924],
       [0.42846356, 0.57153644],
       [0.42790654, 0.57209346],
       [0.42734971, 0.57265029],
       [0.42679306, 0.57320694],
       [0.4262366 , 0.5737634 ],
       [0.42568032, 0.57431968],
       [0.42512423, 0.57487577],
       [0.42456833, 0.57543167],
       [0.42401262, 0.57598738],
       [0.42345711, 0.57654289],
       [0.

In [81]:
#Find the actual predictions: for each observation, the class label
#with the highest probability in the probability matrix
model.predict(x)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1])

In [82]:
model.score(x,y)
#accuracy: the ratio of the number of correct predictions to the total number of
#observations (computed here on the same data the model was trained on)

0.6037735849056604

In [83]:
#confusion matrix: rows are the true classes, columns are the predicted classes
confusion_matrix(y, model.predict(x))

array([[ 0, 63],
       [ 0, 96]], dtype=int64)

# Let's improve the Regression

In [84]:
#Let's adjust the regularization strength C. Smaller C means stronger regularization;
#larger C means weaker regularization, so the model follows the training data more
#closely (watch for overfitting).
#NOTE: this rebinds `model`, replacing the lbfgs model fitted earlier.
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
model.fit(x,y)

In [85]:
#Now collect the model parameters: intercept, coefficients, etc.
model.intercept_

array([0.23972421])

In [86]:
#coefficient of the single feature for the liblinear, C=10 model
model.coef_

array([[0.00230258]])

In [87]:
#class probabilities per observation (column 0: class 0, column 1: class 1)
model.predict_proba(x)

array([[0.44035432, 0.55964568],
       [0.43978694, 0.56021306],
       [0.43921972, 0.56078028],
       [0.43865266, 0.56134734],
       [0.43808576, 0.56191424],
       [0.43751902, 0.56248098],
       [0.43695245, 0.56304755],
       [0.43638604, 0.56361396],
       [0.43581979, 0.56418021],
       [0.43525371, 0.56474629],
       [0.4346878 , 0.5653122 ],
       [0.43412207, 0.56587793],
       [0.4335565 , 0.5664435 ],
       [0.4329911 , 0.5670089 ],
       [0.43242588, 0.56757412],
       [0.43186084, 0.56813916],
       [0.43129597, 0.56870403],
       [0.43073129, 0.56926871],
       [0.43016678, 0.56983322],
       [0.42960245, 0.57039755],
       [0.42903831, 0.57096169],
       [0.42847435, 0.57152565],
       [0.42791058, 0.57208942],
       [0.42734699, 0.57265301],
       [0.42678359, 0.57321641],
       [0.42622039, 0.57377961],
       [0.42565737, 0.57434263],
       [0.42509455, 0.57490545],
       [0.42453192, 0.57546808],
       [0.42396948, 0.57603052],
       [0.

In [88]:
#predicted class label for each observation
model.predict(x)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1])

In [89]:
#accuracy on the training data
model.score(x,y)

0.6037735849056604

In [90]:
#confusion matrix: rows are the true classes, columns are the predicted classes
confusion_matrix(y, model.predict(x))

array([[ 0, 63],
       [ 0, 96]], dtype=int64)

In [91]:
#AUC computation
from sklearn.model_selection import train_test_split
from sklearn import metrics

#hold out 20% of the observations; random_state=0 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=0)
#NOTE: `model` was fitted on all of x, so these test rows were seen during
#training; this AUC is not an out-of-sample estimate.
#column 1 of predict_proba is the probability of class 1
y_pred = model.predict_proba(X_test)[:, 1]
#call through the `metrics` module imported in this cell: the bare name
#roc_auc_score is not imported in or before this cell, so using it directly
#raises NameError on a fresh kernel. multi_class is omitted because it only
#applies to multiclass targets and y is binary.
auc = metrics.roc_auc_score(y_test, y_pred)
print (auc)

0.4492753623188406


In [92]:
#ROC computation
from sklearn.metrics import roc_curve, roc_auc_score
#sanity check that labels and features of the test split line up
#(the bare len(...) expressions previously here discarded their results)
assert len(y_test) == len(X_test)
#the following will be run once for each algorithm
#roc_curve returns false positive rates, true positive rates and the
#score thresholds at which they were computed
fpr, tpr, threshold = roc_curve(y_test, model.predict_proba(X_test)[:,1])
print ('roc info for logistic regression', fpr, tpr, threshold)

roc info for logistic regression [0.         0.11111111 0.11111111 0.22222222 0.22222222 0.33333333
 0.33333333 0.66666667 0.66666667 0.77777778 0.77777778 0.88888889
 0.88888889 1.         1.        ] [0.         0.         0.08695652 0.08695652 0.34782609 0.34782609
 0.47826087 0.47826087 0.65217391 0.65217391 0.73913043 0.73913043
 0.7826087  0.7826087  1.        ] [1.641184   0.641184   0.63799935 0.63373425 0.62081491 0.6126511
 0.60717287 0.59447218 0.59113731 0.5900238  0.58444263 0.58220399
 0.58052278 0.57827827 0.56361396]
