# Lab 3: Extending Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Dataset Preparation

In [2]:
# Load the BRFSS 2015 diabetes health-indicator CSV, then preview the first
# rows followed by the column index (one print call, same text as two).
df = pd.read_csv('Diabetes_Dataset/diabetes_012_health_indicators_BRFSS2015.csv')
print(df.head(), df.columns, sep='\n')

   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0   
1     

In [3]:
# Build three binary target features from the 'Diabetes_012' column:
# NoDiabetes, PreDiabetes and Diabetes, where 0 is False and 1 is True.
target_columns = ['NoDiabetes', 'PreDiabetes', 'Diabetes']

# Column 0 keeps the raw 0/1/2 code; columns 1-3 hold the matching one-hot
# indicators. Comparing the code column against [0, 1, 2] fills all three
# indicators at once instead of looping over every row in Python.
target_array = np.zeros((len(df), 4))
target_array[:, 0] = df['Diabetes_012'].to_numpy()
target_array[:, 1:] = target_array[:, [0]] == np.arange(len(target_columns))

# Add the new target columns to the front of the original dataframe.
# Re-running this cell refreshes the existing columns rather than inserting
# duplicates (the previous allow_duplicates=True silently created copies).
for position, name in enumerate(target_columns):
    indicator = target_array[:, position + 1]
    if name in df.columns:
        df[name] = indicator
    else:
        df.insert(position, name, indicator)

# Same frame without the raw multi-class label
df_target = df.drop('Diabetes_012', axis=1)
print(df_target)

        NoDiabetes  PreDiabetes  Diabetes  HighBP  HighChol  CholCheck   BMI  \
0              1.0          0.0       0.0     1.0       1.0        1.0  40.0   
1              1.0          0.0       0.0     0.0       0.0        0.0  25.0   
2              1.0          0.0       0.0     1.0       1.0        1.0  28.0   
3              1.0          0.0       0.0     1.0       0.0        1.0  27.0   
4              1.0          0.0       0.0     1.0       1.0        1.0  24.0   
...            ...          ...       ...     ...       ...        ...   ...   
253675         1.0          0.0       0.0     1.0       1.0        1.0  45.0   
253676         0.0          0.0       1.0     1.0       1.0        1.0  18.0   
253677         1.0          0.0       0.0     0.0       0.0        1.0  28.0   
253678         1.0          0.0       0.0     1.0       0.0        1.0  23.0   
253679         0.0          0.0       1.0     1.0       1.0        1.0  25.0   

        Smoker  Stroke  HeartDiseaseorA

In [4]:
# Feature columns: every column of df_target except the three target indicators
targets = ['NoDiabetes', 'PreDiabetes', 'Diabetes']
columns = list(df_target.columns)
for col in targets:
    columns.remove(col)

# Splitting dataset (80% train / 20% test, shuffled with a fixed seed)
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=.20, shuffle=True, random_state=42)

# Feature matrices as plain numpy arrays
X_train, X_test = (part[columns].to_numpy() for part in (train, test))

# One label vector per target name, keyed by that name
y_train = {name: train[name].to_numpy() for name in targets}
y_test = {name: test[name].to_numpy() for name in targets}

## Modeling

### Binary Logistic Regression

In [9]:
import numpy as np
class BinaryLogisticRegressionBase:
    """Non-trainable base for binary logistic regression.

    Stores the hyper-parameters and provides prediction helpers. It has no
    ``fit`` method, so ``self.w_`` must be set by a subclass (or by hand)
    before ``predict_proba`` / ``predict`` can be called.

    Parameters
    ----------
    eta : float
        Learning rate, stored as ``self.eta``.
    iterations : int, default 20
        Number of optimisation iterations, stored as ``self.iters``.
    optimization : str, default 'sd'
        Name of the optimisation technique, stored as ``self.opt``.
    regularization : str, default 'none'
        Name of the regularisation scheme, stored as ``self.reg``.
    C : float, default 0.001
        Regularisation strength, stored as ``self.C``.
    """

    # private:
    def __init__(self, eta, iterations=20, optimization='sd', regularization='none', C=0.001):
        self.eta = eta
        self.iters = iterations
        self.opt = optimization
        self.reg = regularization
        self.C = C
        # internally we will store the weights as self.w_ to keep with sklearn conventions

    def __str__(self):
        return 'Base Binary Logistic Regression Object, Not Trainable'

    # convenience, private and static:
    @staticmethod
    def _sigmoid(theta):
        """Return the logistic sigmoid 1 / (1 + exp(-theta)).

        Computed as exp(-log(1 + exp(-theta))) through ``np.logaddexp`` so a
        large negative ``theta`` does not overflow inside ``np.exp``.
        """
        return np.exp(-np.logaddexp(0, -theta))

    @staticmethod
    def _add_bias(X):
        """Return ``X`` with a leading column of ones (the bias term)."""
        return np.hstack((np.ones((X.shape[0], 1)), X))  # add bias term

    # public:
    def predict_proba(self, X, _add_bias=True, add_bias=None):
        """Return the probability that y=1 for each row of ``X``.

        Parameters
        ----------
        X : ndarray
            Feature matrix; must already contain the bias column when the
            bias flag is False.
        _add_bias : bool, default True
            Original flag name, kept so positional and existing keyword calls
            continue to work.
        add_bias : bool or None, default None
            Alias for ``_add_bias`` without the underscore; when given (not
            None) it takes precedence.
        """
        use_bias = _add_bias if add_bias is None else add_bias
        # add bias term if requested
        Xb = self._add_bias(X) if use_bias else X
        return self._sigmoid(Xb @ self.w_)  # return the probability y=1

    def predict(self, X):
        """Return boolean predictions: True where the probability exceeds 0.5."""
        return (self.predict_proba(X) > 0.5)  # return the actual prediction

blr = BinaryLogisticRegressionBase(0.1)
print(blr)

Base Binary Logistic Regression Object, Not Trainable


In [10]:
from scipy.special import expit
from numpy.linalg import pinv

# inherit from base class
class BinaryLogisticRegression(BinaryLogisticRegressionBase):
    """Trainable binary logistic regression.

    ``fit`` repeatedly adds ``eta * step`` to the weights, where ``step`` comes
    from the technique named by ``self.opt``:

    * ``'sd'``     - steepest descent (mean gradient over all samples)
    * ``'sgd'``    - stochastic gradient descent (one random sample)
    * ``'newton'`` - Newton's method (inverse Hessian times the gradient)

    The penalty named by ``self.reg`` (``'none'``, ``'L1'``, ``'L2'`` or
    ``'L1_L2'``) is scaled by ``self.C`` and applied to every weight except
    the bias ``w_[0]``.
    """

    # private:
    def __str__(self):
        if hasattr(self, 'w_'):
            # w_ only exists if we have trained the object
            return 'Binary Logistic Regression Object with coefficients:\n' + str(self.w_)
        return 'Untrained Binary Logistic Regression Object'

    # creating ability to choose optimization technique
    def _get_gradient(self, X, y):
        """Dispatch to the update rule selected by ``self.opt``.

        Raises
        ------
        ValueError
            If ``self.opt`` is not 'sd', 'sgd' or 'newton'.
        """
        optimizers = {
            'sd': self.steepest_descent,
            'sgd': self.stochastic_gradient_descent,
            'newton': self.newton,
        }
        if self.opt not in optimizers:
            raise ValueError(
                "Unknown optimization %r; expected one of %s"
                % (self.opt, sorted(optimizers))
            )
        return optimizers[self.opt](X, y)

    def steepest_descent(self, X, y):
        """Return the mean log-likelihood gradient plus the penalty gradient.

        ``X`` must already contain the bias column.
        """
        # The bias flag is passed positionally (False): X already has the bias
        # column, and the base signature does not define an `add_bias` keyword.
        ydiff = y - self.predict_proba(X, False).ravel()  # get y difference
        gradient = np.mean(X * ydiff[:, np.newaxis], axis=0)  # make ydiff a column vector and multiply through

        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += self._get_reg_gradient() * self.C  # bias is not penalised

        return gradient

    def stochastic_gradient_descent(self, X, y):
        """Return the gradient from one randomly chosen sample plus the penalty gradient.

        ``X`` must already contain the bias column.
        """
        idx = int(np.random.rand() * len(y))  # grab random instance
        # predict_proba on a single row returns a length-1 array
        ydiff = y[idx] - self.predict_proba(X[idx], False)  # get y difference
        gradient = X[idx] * ydiff[:, np.newaxis]  # scale the sample's features by its error

        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += self._get_reg_gradient() * self.C  # bias is not penalised

        return gradient

    def newton(self, X, y):
        """Return the Newton step: pinv(X^T S X + penalty curvature) @ gradient.

        ``X`` must already contain the bias column.
        """
        g = self.predict_proba(X, False).ravel()  # get sigmoid value for all samples

        # X^T diag(g(1-g)) X, computed by scaling the rows of X. This avoids
        # materialising the n_samples x n_samples diagonal matrix.
        hessian = X.T @ (X * (g * (1 - g))[:, np.newaxis])

        # Curvature of the L2 penalty: 2*C on the diagonal for every weight
        # except the bias. The L1 term has zero second derivative.
        if self.reg in ('L2', 'L1_L2'):
            penalised = np.arange(1, hessian.shape[0])
            hessian[penalised, penalised] += 2 * self.C

        ydiff = y - g  # get y difference
        gradient = np.sum(X * ydiff[:, np.newaxis], axis=0)  # make ydiff a column vector and multiply through
        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += self._get_reg_gradient() * self.C  # bias is not penalised

        return pinv(hessian) @ gradient

    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta)  # 1/(1+np.exp(-theta))

    # regularization methods
    def _get_reg_gradient(self):
        """Return the penalty gradient for the non-bias weights, before scaling by C.

        The sign is the one for an update of the form ``w += eta * gradient``,
        so every penalty pulls the weights toward zero.

        Raises
        ------
        ValueError
            If ``self.reg`` is not 'none', 'L1', 'L2' or 'L1_L2'.
        """
        w = self.w_[1:]
        if self.reg == 'none':
            return np.zeros_like(w)  # no penalty means no contribution
        if self.reg == 'L1':
            return -np.sign(w)
        if self.reg == 'L2':
            return -2 * w
        if self.reg == 'L1_L2':
            return -2 * w - np.sign(w)
        raise ValueError(
            "Unknown regularization %r; expected 'none', 'L1', 'L2' or 'L1_L2'"
            % (self.reg,)
        )

    def fit(self, X, y):
        """Learn ``self.w_`` from features ``X`` (without bias column) and labels ``y``."""
        Xb = self._add_bias(X)  # add bias term
        # The gradient code indexes y as a 1-D array, so flatten whatever was passed in
        y = np.asarray(y).ravel()

        self.w_ = np.zeros((Xb.shape[1], 1))  # init weight vector to zeros

        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb, y)
            self.w_ += gradient * self.eta  # multiply by learning rate

### Logistic Regression