In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from patsy import dmatrices   #used to create dummyvariables

In [68]:
data = pd.read_csv("./data/HR_comma_sep.csv")

In [69]:
data.left.head()

0    1
1    1
2    1
3    1
4    1
Name: left, dtype: int64

In [70]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [71]:
type(data.left[0])

numpy.int64

In [72]:
data.left = data.left.astype(int) #why already int

In [73]:
data.rename(columns={"average_montly_hours":"average_monthly_hours"}, inplace=True)
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


# extract columns and use pasty.dmatrices to create dummy variables

In [74]:
y, X = dmatrices("left~satisfaction_level+last_evaluation+number_project+average_monthly_hours+time_spend_company+Work_accident+promotion_last_5years+C(sales)+C(salary)", data, return_type="dataframe")

In [75]:
type(X)

pandas.core.frame.DataFrame

In [76]:
type(y)

pandas.core.frame.DataFrame

In [77]:
X = np.asmatrix(X) # why doing this
y = np.ravel(y)

# make all columns of X range from 0 to 1

In [78]:
for i in range(1, X.shape[1]): # traverse over columns from column 1 to the end
    xmin = X[:,i].min()
    xmax = X[:,i].max()
    X[:,i] = (X[:,i] - xmin)/(xmax-xmin)

In [79]:
X

matrix([[1.   , 0.   , 0.   , ..., 0.125, 0.   , 0.   ],
        [1.   , 0.   , 0.   , ..., 0.5  , 0.   , 0.   ],
        [1.   , 0.   , 0.   , ..., 0.25 , 0.   , 0.   ],
        ...,
        [1.   , 0.   , 0.   , ..., 0.125, 0.   , 0.   ],
        [1.   , 0.   , 0.   , ..., 0.25 , 0.   , 0.   ],
        [1.   , 0.   , 0.   , ..., 0.125, 0.   , 0.   ]])

# start the core of the algorithm

In [80]:
np.random.seed(1)
alpha = 1
beta = np.random.randn(X.shape[1]) # create a rand one d array with # of items same as the number of columns of X
print("beta is " + str(beta))

beta is [ 1.62434536 -0.61175641 -0.52817175 -1.07296862  0.86540763 -2.3015387
  1.74481176 -0.7612069   0.3190391  -0.24937038  1.46210794 -2.06014071
 -0.3224172  -0.38405435  1.13376944 -1.09989127 -0.17242821 -0.87785842
  0.04221375]


In [81]:
for T in range(500):
    prob = np.array(1./(1+np.exp(-np.matmul(X, beta)))).ravel()   # calculate the probability of left with random beta
    
    prob_y = list(zip(prob, y)) # match the beta probabilty with labels
    
    loss = -sum([np.log(p) if y == 1 else np.log(1-p) for p, y in prob_y]) / len(y)
    error_rate = 0
    
    # calculate error rate
    for i in range(len(y)):
        if(( prob[i] > 0.5  and y[i] == 0) or (prob[i] <= 0.5 and y[i] == 1 ) ):
            error_rate += 1
    error_rate /= len(y)
    
    if T % 5 == 0:
        print( "T = " + str(T) + " loss = " + str(loss) + " error = " + str(error_rate))
    
    deriv = np.zeros(X.shape[1]) # create a one d array with all zeros
    for i in range(len(y)):
        #print(i)
        #print(prob.shape) # only one column
        #print(y.shape)
        deriv += np.asarray(X[i, :]).ravel() * (prob[i] - y[i])
    deriv /= len(y)
    
    beta -= alpha * deriv

T = 0 loss = 1.120382327806672 error = 0.5037002466831122
T = 5 loss = 0.6492666637968594 error = 0.2910194012934196
T = 10 loss = 0.6095807663133693 error = 0.26668444562970867
T = 15 loss = 0.5816449211566241 error = 0.25888392559503964
T = 20 loss = 0.5607552377630786 error = 0.2526835122341489
T = 25 loss = 0.5450244708251246 error = 0.2481498766584439
T = 30 loss = 0.5328795073088821 error = 0.24854990332688845
T = 35 loss = 0.5231836380769039 error = 0.2474164944329622
T = 40 loss = 0.5151826947262421 error = 0.24234948996599773
T = 45 loss = 0.5083944951258156 error = 0.23968264550970064
T = 50 loss = 0.5025111703564583 error = 0.23728248549903327
T = 55 loss = 0.49733135447626586 error = 0.23581572104806986
T = 60 loss = 0.4927180171532152 error = 0.23448229881992133
T = 65 loss = 0.48857356128232937 error = 0.23381558770584707
T = 70 loss = 0.48482536350449806 error = 0.23214880992066136
T = 75 loss = 0.48141731701862545 error = 0.22934862324154945
T = 80 loss = 0.478304757976

T = 135 loss = 0.456309175882234 error = 0.22021468097873192
T = 140 loss = 0.4550275985604056 error = 0.22101473431562105
loss down, error up
why