In [237]:
class LogReg():
    """
    Implementation of binary logistic regression
    """
    
    def __init__(self,dataframe,y_column):
        """
        Initialize LogReg class with a pandas dataframe and the name of the dependent variable column, y.
        All other variable will be assumed to be independent variables, x.
        """
        import pandas as pd
        import numpy as np
        self.y = dataframe[y_column].to_numpy()
        self.x = dataframe.drop(y_column,axis=1).to_numpy()
        self.weights = np.random.uniform(low=0.001, high=0.1, size=(1,self.x.shape[1]))[0]
        self.bias = np.random.uniform(low=0.001, high=0.1, size=(1))[0]
        
    def forward(self,row):
        """
        Produce a predicted y-value given a row (vector of x values).
        The prediction is the dot product of weights and vector x added to the bias.
        """
        x_vars = self.x[row]
        z = np.dot(x_vars,self.weights) + self.bias
        y_pred = 1/(1 + np.exp(-z))
        return y_pred
    
    def loss(self,row):
        """
        Standard logistic regression loss function. 
        
        When y is 1, the loss function evaluates to the natural log of y_pred.
            When y_pred is near 1, ln(1) will assign a loss near 0.
            When y_pred is near 0, ln(0) will assign a penalty >0.
            
        When y is 0, the loss function evaluates to natural log of (1-y_pred)
            When y_pred is near 0, ln(1-0) will assign a penalty near 0.
            When y_pred is near 1, ln(1-0) will assign a penalty >0.    
            
        """
        y_pred = self.forward(row)
        y_true = self.y[row]
        loss = -1*(y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred))
        return loss
        
    def cost(self):
        """
        Cost function finds average loss function applied to all rows.
        """
        return sum([self.loss(row) for row in range(self.x.shape[0])])/self.x.shape[0]
    
    def gradient_loss_y_pred(self,row):
        """
        Function that returns the gradient of the loss function with respect to y_pred
        """
        y_true = self.y[row]
        y_pred = self.forward(row)
        return -1*y_true/y_pred + (1-y_true)/(1-y_pred)
    
    def gradient_y_pred_z(self,row):
        """
        Function that returns the gradient of y_pred with respect to z (dot product(weights.X) + bias)
        """
        y_pred = self.forward(row)
        return y_pred*(1-y_pred)
    
    def gradient_z_w(self,row):
        """
        Function that returns the gradient of z with respect to weights. Note, this is not the dot product;
        each weight will be multiplied by its corresponding x-value, but not summed.
        """
        return self.weights * self.x[row]
    
    def gradient_weights(self,row):
        """
        Function that returs the gradient of loss with respect to weights by stringing together the above 3 gradients
        by observing the chain rule.
        """
        return self.gradient_loss_y_pred(row)*self.gradient_y_pred_z(row)*self.gradient_z_w(row)
    
    def gradient_bias(self,row):
        """
        Function that returns the gradient of loss with respect to bias by observing the chain rule, similar to above.
        Note, the gradient of z with respect to bias evaluates to 1; thus there are only 2 functions to determine the
        this gradient
        """
        return self.gradient_loss_y_pred(row)*self.gradient_y_pred_z(row)
    
    def train_model(self, iterations, lr):
        """
        Function that trains the LogReg model via gradient descent. Given a number of iterations and a learning rate,
        this function will begin an epoch (iteration), initialize weight and bias gradients to zero, iterate through
        rows in training data. Once all rows have been operated on, the gradients will be applied to the weights
        and biases according to the learning rate specified.
        """
        for epoch in range(iterations):
            d_w = np.zeros_like(self.weights)
            d_b = 0.0

            for i in range(self.x.shape[0]):
                d_w += self.gradient_weights(i)
                d_b += self.gradient_bias(i)
            d_w = d_w/self.x.shape[0]
            d_b = d_b/self.x.shape[0]
            
            self.weights -= lr*d_w
            self.bias -= lr*d_b
            print(self.cost())
            
    def predict(self,row):
        z = np.dot(row,self.weights) + self.bias
        y_pred = 1/(1 + np.exp(-z))
        if y_pred >= 0.5:
            return 1
        else:
            return 0

In [141]:
nba = pd.read_csv("nba_logreg.csv")

In [143]:
nba.drop('Name',inplace=True,axis=1)

In [150]:
train = nba.iloc[:800]
test = nba.iloc[800:]

In [238]:
model = LogReg(dataframe=train,y_column='TARGET_5Yrs')

In [239]:
model.train_model(iterations=1000,lr=0.01)

2.1495173114638693
1.8267506105858748
1.580926903238841
1.394591854812332
1.2535346068198174
1.1463594000973145
1.064234443128734
1.0005445639607213
0.950454130391027
0.9104732777590155
0.8780911952522117
0.8514945829379476
0.8293630521032277
0.8107246779600089
0.7948553040808665
0.781208528983325
0.7693668516467952
0.7590073323032747
0.7498772273519524
0.7417765177708161
0.7345452412790036
0.7280542043338426
0.7221980967660631
0.7168903325503326
0.7120591438622789
0.7076445945994236
0.7035962753001418
0.6998715079739322
0.6964339360968046
0.6932524081571829
0.6903000868471492
0.6875537331195893
0.684993126814057
0.6826005947320942
0.6803606238479822
0.6782595424279603
0.676285255663304
0.6744270253313988
0.6726752852225464
0.6710214857816871
0.6694579627403666
0.6679778255480245
0.6665748622227623
0.6652434578814893
0.6639785247169913
0.6627754415942537
0.6616300017629843
0.66053836744469
0.6594970302643647
0.6585027766688565
0.6575526576146042
0.6566439619226421
0.6557741927936676
0.

0.6230635066513642
0.623048408536049
0.6230333250240925
0.6230182560105978
0.6230032013922806
0.6229881610674368
0.6229731349359242
0.6229581228991288
0.6229431248599478
0.6229281407227587
0.622913170393396
0.6228982137791331
0.6228832707886491
0.6228683413320152
0.6228534253206661
0.6228385226673769
0.6228236332862469
0.6228087570926751
0.6227938940033338
0.6227790439361551
0.6227642068103091
0.6227493825461801
0.6227345710653469
0.6227197722905671
0.6227049861457611
0.6226902125559748
0.6226754514473856
0.6226607027472665
0.6226459663839767
0.6226312422869392
0.6226165303866268
0.6226018306145431
0.6225871429032044
0.6225724671861247
0.622557803397803
0.6225431514736985
0.6225285113502221
0.6225138829647188
0.6224992662554505
0.6224846611615835
0.6224700676231729
0.6224554855811459
0.6224409149772936
0.6224263557542471
0.622411807855471
0.6223972712252491
0.6223827458086695
0.62236823155161
0.6223537284007269
0.6223392363034411
0.6223247552079294
0.6223102850631071
0.622295825818615


0.6172897792304988
0.617277930584878
0.6172660847686501
0.6172542417511119
0.6172424015014458
0.6172305639887194
0.6172187291818831
0.6172068970497687
0.6171950675610964
0.6171832406844723
0.6171714163883855
0.6171595946412102
0.6171477754112116
0.6171359586665389
0.6171241443752254
0.6171123325051991
0.6171005230242718
0.6170887159001441
0.6170769111004087
0.6170651085925455
0.6170533083439264
0.6170415103218143
0.6170297144933617
0.6170179208256183
0.6170061292855196
0.6169943398399038
0.6169825524554955
0.6169707670989174
0.6169589837366903
0.6169472023352267
0.6169354228608401
0.6169236452797422
0.6169118695580391
0.6169000956617428
0.616888323556762
0.6168765532089087
0.6168647845838965
0.6168530176473408
0.6168412523647616
0.61682948870159
0.6168177266231543
0.6168059660946948
0.6167942070813595
0.6167824495482042
0.6167706934601999
0.6167589387822212
0.6167471854790594
0.616735433515421
0.6167236828559264
0.6167119334651081
0.6167001853074222
0.616688438347238
0.6166766925488452

In [241]:
score = 0
count = 0
for idx,row in test.iterrows():
    x_vars = row[:-1]
    y_true = row[-1]
    y_pred = model.predict(x_vars)
    if y_pred == y_true:
        score+=1
    count +=1
print(f"accuracy: {score/count}")

accuracy: 0.6592592592592592
