In [115]:
import numpy as np
class BinaryLogisticRegressionBase:
    """Shared, non-trainable core of a binary logistic regression model.

    Holds the hyper-parameters and implements prediction. It defines no
    ``fit`` method; the prediction methods read the weight vector from
    ``self.w_``, which something else must set before they are called.
    """

    # private:
    def __init__(self, eta, iterations=20):
        """Store the hyper-parameters.

        Parameters
        ----------
        eta : learning rate, stored as ``self.eta``.
        iterations : number of training iterations, stored as ``self.iters``.
        """
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions

    def __str__(self):
        return 'Base Binary Logistic Regression Object, Not Trainable'

    # convenience, private and static:
    @staticmethod
    def _sigmoid(theta):
        """Return the logistic function 1 / (1 + exp(-theta)) element-wise.

        The naive formula evaluates ``exp(-theta)``, which overflows (and emits
        a RuntimeWarning) for large negative ``theta``. Here the exponential is
        only ever taken of a non-positive number, so it cannot overflow:

            theta >= 0:  1 / (1 + exp(-theta))
            theta <  0:  exp(theta) / (1 + exp(theta))

        Scalar input yields a NumPy scalar; array input yields an array of the
        same shape.
        """
        theta = np.asarray(theta)
        z = np.exp(-np.abs(theta))  # always in (0, 1], never overflows
        out = np.where(theta >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
        # np.where returns a 0-d array for scalar input; unwrap it to a scalar
        return out if out.ndim else out[()]

    @staticmethod
    def _add_bias(X):
        """Return ``X`` with a leading column of ones prepended (the bias term)."""
        return np.hstack((np.ones((X.shape[0], 1)), X))  # add bias term

    # public:
    def predict_proba(self, X, add_bias=True):
        """Return the estimated probability that y=1 for each row of ``X``.

        Parameters
        ----------
        X : feature array. When ``add_bias`` is True it must be 2-D, because the
            bias column is built from ``X.shape[0]``.
        add_bias : prepend the column of ones first. Pass False when ``X``
            already contains it.

        Raises
        ------
        AttributeError
            If ``self.w_`` has not been set yet.
        """
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_)  # return the probability y=1

    def predict(self, X):
        """Return a boolean array: True where the predicted probability exceeds 0.5."""
        return (self.predict_proba(X) > 0.5)  # return the actual prediction
    
    
        
# Smoke test: build the base object with eta=0.1 (default iterations) and
# print it, which goes through __str__.
blr = BinaryLogisticRegressionBase(0.1)
print(blr)

Base Binary Logistic Regression Object, Not Trainable


In [116]:
class BinaryLogisticRegression(BinaryLogisticRegressionBase):
    """Binary logistic regression trained with a fixed number of gradient steps.

    Adds ``fit`` on top of the base class: starting from all-zero weights,
    ``self.iters`` steps are taken, each adding ``self.eta`` times the averaged
    gradient to ``self.w_``.
    """

    #private:
    def __str__(self):
        if hasattr(self, 'w_'):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # if we have trained the object
        return 'Untrained Binary Logistic Regression Object'

    def _get_gradient(self, X, y):
        """Return the averaged gradient (1/N) * sum_i (y_i - g(x_i)) * x_i.

        Computed as one matrix product instead of a Python loop over samples.

        Parameters
        ----------
        X : array of shape (N, num_features) that already contains the bias
            column; no bias is added here.
        y : N targets; flattened, so shapes (N,) and (N, 1) both work.

        Returns
        -------
        Array with the same shape as ``self.w_``.
        """
        targets = np.ravel(y)
        # residual y_i - g(x_i) for every sample at once
        residual = targets - self.predict_proba(X, add_bias=False).ravel()
        # X^T @ residual is exactly sum_i residual_i * x_i
        gradient = (X.T @ residual).reshape(self.w_.shape)

        return gradient / float(len(targets))

    # public:
    def fit(self, X, y):
        """Learn ``self.w_`` from features ``X`` (without bias column) and targets ``y``."""
        Xb = self._add_bias(X)  # add bias term
        num_samples, num_features = Xb.shape

        self.w_ = np.zeros((num_features, 1))  # init weight vector to zeros

        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb, y)
            self.w_ += gradient * self.eta  # multiply by learning rate

            
# Smoke test: fit() has not been called, so no w_ attribute exists yet and
# __str__ takes the "Untrained" branch.
blr = BinaryLogisticRegression(0.1)
print(blr)

Untrained Binary Logistic Regression Object


In [117]:
import plotly
import pandas as pd
# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("./data/uber_nyc_enriched.csv")

In [118]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd
count,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0
mean,490.215903,5.984924,8.818125,47.669042,30.823065,1017.817938,0.00383,0.026129,0.090464,2.529169
std,995.649536,3.699007,2.442897,19.814969,21.283444,7.768796,0.018933,0.093125,0.219402,4.520325
min,0.0,0.0,0.0,2.0,-16.0,991.4,0.0,0.0,0.0,0.0
25%,1.0,3.0,9.1,32.0,14.0,1012.5,0.0,0.0,0.0,0.0
50%,54.0,6.0,10.0,46.0,30.0,1018.2,0.0,0.0,0.0,0.0
75%,449.0,8.0,10.0,64.5,50.0,1022.9,0.0,0.0,0.05,2.958333
max,7883.0,21.0,10.0,89.0,73.0,1043.4,0.28,1.24,2.1,19.0


<p> Checking for NaN values in the dataset. </p>

In [119]:
# Single boolean: True if at least one cell anywhere in the frame is null.
data.isnull().values.any()

True

<p> Only the <code>borough</code> column contains NaN values (see the per-column check below), so we remove the rows where it is missing. </p>

In [120]:
# Per-column check: True for every column that contains at least one null.
data.isnull().any()

pickup_dt    False
borough       True
pickups      False
spd          False
vsb          False
temp         False
dewp         False
slp          False
pcp01        False
pcp06        False
pcp24        False
sd           False
hday         False
dtype: bool

In [121]:
# Drop every row that has a null in any column. The index is not reset, so
# the remaining row labels keep gaps where rows were removed.
data = data.dropna()

In [122]:
# Repeat the per-column null check to confirm dropna() left no missing values.
data.isnull().any()

pickup_dt    False
borough      False
pickups      False
spd          False
vsb          False
temp         False
dewp         False
slp          False
pcp01        False
pcp06        False
pcp24        False
sd           False
hday         False
dtype: bool

In [123]:
# Pairwise Pearson correlation of the numeric columns. The frame still holds
# string columns (pickup_dt, borough, hday) at this point; selecting the
# numeric ones explicitly keeps this working on pandas >= 2.0, where corr()
# no longer drops non-numeric columns silently.
data.select_dtypes(include=['number']).corr()

Unnamed: 0,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd
pickups,1.0,0.009741,-0.008429,0.063692,0.040082,-0.015708,0.005007,-0.002821,-0.022935,-0.009676
spd,0.009741,1.0,0.086177,-0.296126,-0.321606,-0.092761,-0.000357,0.016668,-0.010412,0.097041
vsb,-0.008429,0.086177,1.0,0.025214,-0.231294,0.167039,-0.488407,-0.118346,0.000895,-0.047834
temp,0.063692,-0.296126,0.025214,1.0,0.896544,-0.224537,-0.013343,-0.037295,-0.014408,-0.545558
dewp,0.040082,-0.321606,-0.231294,0.896544,1.0,-0.311156,0.115399,0.013293,0.001519,-0.489372
slp,-0.015708,-0.092761,0.167039,-0.224537,-0.311156,1.0,-0.089752,-0.10494,-0.134689,0.121508
pcp01,0.005007,-0.000357,-0.488407,-0.013343,0.115399,-0.089752,1.0,0.128064,0.000997,0.00031
pcp06,-0.002821,0.016668,-0.118346,-0.037295,0.013293,-0.10494,0.128064,1.0,0.251166,0.039943
pcp24,-0.022935,-0.010412,0.000895,-0.014408,0.001519,-0.134689,0.000997,0.251166,1.0,0.069664
sd,-0.009676,0.097041,-0.047834,-0.545558,-0.489372,0.121508,0.00031,0.039943,0.069664,1.0


In [124]:
# Preview the first rows only instead of rendering the whole frame.
data.head(10)

Unnamed: 0,pickup_dt,borough,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd,hday
0,2015-01-01 01:00:00,Bronx,152,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
1,2015-01-01 01:00:00,Brooklyn,1519,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
2,2015-01-01 01:00:00,EWR,0,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
3,2015-01-01 01:00:00,Manhattan,5258,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
4,2015-01-01 01:00:00,Queens,405,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
5,2015-01-01 01:00:00,Staten Island,6,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
7,2015-01-01 02:00:00,Bronx,120,3.0,10.0,30.0,6.0,1023.0,0.0,0.0,0.0,0.0,Y
8,2015-01-01 02:00:00,Brooklyn,1229,3.0,10.0,30.0,6.0,1023.0,0.0,0.0,0.0,0.0,Y
9,2015-01-01 02:00:00,EWR,0,3.0,10.0,30.0,6.0,1023.0,0.0,0.0,0.0,0.0,Y
10,2015-01-01 02:00:00,Manhattan,4345,3.0,10.0,30.0,6.0,1023.0,0.0,0.0,0.0,0.0,Y


In [125]:
# Remove the dew point column. (presumably because it is highly correlated
# with temp, about 0.90 in the correlation table; confirm intent)
# drop(..., errors='ignore') makes the cell safe to re-run, whereas
# `del data['dewp']` raises KeyError the second time.
data = data.drop('dewp', axis=1, errors='ignore')

In [126]:
# Preview the first rows to confirm the dewp column is gone.
data.head(10)

Unnamed: 0,pickup_dt,borough,pickups,spd,vsb,temp,slp,pcp01,pcp06,pcp24,sd,hday
0,2015-01-01 01:00:00,Bronx,152,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,Y
1,2015-01-01 01:00:00,Brooklyn,1519,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,Y
2,2015-01-01 01:00:00,EWR,0,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,Y
3,2015-01-01 01:00:00,Manhattan,5258,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,Y
4,2015-01-01 01:00:00,Queens,405,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,Y
5,2015-01-01 01:00:00,Staten Island,6,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,Y
7,2015-01-01 02:00:00,Bronx,120,3.0,10.0,30.0,1023.0,0.0,0.0,0.0,0.0,Y
8,2015-01-01 02:00:00,Brooklyn,1229,3.0,10.0,30.0,1023.0,0.0,0.0,0.0,0.0,Y
9,2015-01-01 02:00:00,EWR,0,3.0,10.0,30.0,1023.0,0.0,0.0,0.0,0.0,Y
10,2015-01-01 02:00:00,Manhattan,4345,3.0,10.0,30.0,1023.0,0.0,0.0,0.0,0.0,Y


In [127]:
# Encode the holiday flag as an integer: 'N' -> 0, anything else -> 1.
# The guard makes the cell idempotent: without it, a second run would map the
# already-encoded 0 to 1 (because 0 != 'N') and silently corrupt the column.
if not pd.api.types.is_numeric_dtype(data['hday']):
    data = data.assign(hday=(data['hday'] != 'N').astype(int))

In [128]:
# One-hot encode the borough: one 0/1 indicator column per borough name.
# astype(int) keeps the indicators numeric on every pandas version
# (get_dummies returns booleans by default since pandas 2.0).
oneHotCols = pd.get_dummies(data['borough']).astype(int)
# Drop any indicator columns left over from an earlier run of this cell so
# join() does not fail on overlapping column names.
data = data.drop(oneHotCols.columns, axis=1, errors='ignore').join(oneHotCols)

In [129]:
# Preview the first rows to check the new one-hot borough columns.
data.head(10)

Unnamed: 0,pickup_dt,borough,pickups,spd,vsb,temp,slp,pcp01,pcp06,pcp24,sd,hday,Bronx,Brooklyn,EWR,Manhattan,Queens,Staten Island
0,2015-01-01 01:00:00,Bronx,152,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,1,1,0,0,0,0,0
1,2015-01-01 01:00:00,Brooklyn,1519,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,1,0,1,0,0,0,0
2,2015-01-01 01:00:00,EWR,0,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0
3,2015-01-01 01:00:00,Manhattan,5258,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0
4,2015-01-01 01:00:00,Queens,405,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,1,0,0,0,0,1,0
5,2015-01-01 01:00:00,Staten Island,6,5.0,10.0,30.0,1023.5,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1
7,2015-01-01 02:00:00,Bronx,120,3.0,10.0,30.0,1023.0,0.0,0.0,0.0,0.0,1,1,0,0,0,0,0
8,2015-01-01 02:00:00,Brooklyn,1229,3.0,10.0,30.0,1023.0,0.0,0.0,0.0,0.0,1,0,1,0,0,0,0
9,2015-01-01 02:00:00,EWR,0,3.0,10.0,30.0,1023.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0
10,2015-01-01 02:00:00,Manhattan,4345,3.0,10.0,30.0,1023.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0


In [130]:
# The borough string column is redundant once the one-hot indicator columns
# exist. drop(..., errors='ignore') makes the cell safe to re-run, whereas
# `del data['borough']` raises KeyError the second time.
data = data.drop('borough', axis=1, errors='ignore')