In [5]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import math
import pandas as pd

# Define Sigmoid, Loss, and Prediction Functions

In [59]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    The textbook form calls ``exp(-z)``, which overflows (and emits a
    RuntimeWarning) once ``z`` is a large negative number.  Here the value is
    computed as ``exp(-log(1 + exp(-z)))``; ``np.logaddexp`` evaluates the
    inner logarithm stably, so the result saturates cleanly at 0 and 1.

    Parameters
    ----------
    z : scalar or array-like
        Raw linear scores.

    Returns
    -------
    Same shape as ``z``: values in the closed interval [0, 1].
    """
    # log(1 + exp(-z)) == logaddexp(0, -z); never overflows for finite z.
    return np.exp(-np.logaddexp(0.0, np.negative(z)))

def loss(output, Y, eps=1e-15):
    """Mean binary cross-entropy between predicted probabilities and labels.

    Parameters
    ----------
    output : array-like
        Predicted probabilities of the positive class.
    Y : array-like
        Ground-truth labels (0 or 1), same shape as ``output``.
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` before taking the
        logarithm so that a saturated prediction of exactly 0 or 1 cannot
        produce ``log(0) = -inf`` (and then ``0 * -inf = nan``).

    Returns
    -------
    float
        The average of ``-Y*log(p) - (1-Y)*log(1-p)`` over all samples.
    """
    output = np.clip(output, eps, 1 - eps)
    return (-Y*np.log(output)-(1-Y)*np.log(1-output)).mean()

def predict_train(data, theta, threshold):
    """Classify rows of an already-prepared design matrix.

    ``data`` is multiplied directly by ``theta`` (no column selection and no
    bias column is added here), the scores are passed through ``sigmoid``,
    and each probability strictly greater than ``threshold`` is labelled 1.0,
    everything else 0.0.
    """
    probabilities = sigmoid(np.dot(data, theta))
    return (probabilities > threshold).astype(float)

def predict(data, theta, threshold):
    """Predict 0/1 classes for the rows of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column except ``'num'`` and ``'label'`` is used as a feature.
        A ``'label'`` column may be present but is not required, so the
        function also works on unlabeled data.
    theta : numpy.ndarray
        Weight vector with one entry per feature column followed by the bias
        weight (the bias column is appended last).
    threshold : float
        Probabilities strictly above this value are classified as 1.0.

    Returns
    -------
    numpy.ndarray
        Float array of 0.0 / 1.0 predictions, one per row of ``data``.
    """
    feature_columns = [column for column in data.columns if column not in ('num', 'label')]
    features = np.array(data[feature_columns])
    # Append a constant-1 column so the last entry of theta acts as the intercept.
    bias = np.ones((features.shape[0], 1))
    design_matrix = np.concatenate((features, bias), axis=1)
    # Scoring and thresholding are shared with the training-time helper.
    return predict_train(design_matrix, theta, threshold)

# Train Logistic Regression

In [70]:
def logisticRegression(data, alpha, num_iters, log_every=1000):#alpha is learning rate
    """Fit a logistic-regression model with batch gradient descent.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows.  The ``'label'`` column holds the targets; every
        column other than ``'num'`` and ``'label'`` is used as a feature.
    alpha : float
        Learning rate applied to each gradient step.
    num_iters : int
        Number of gradient-descent iterations to run.
    log_every : int, optional
        Print the loss and training accuracy whenever the iteration index is
        a multiple of this value.  Pass 0 (or None) to silence progress
        output.  Defaults to 1000.

    Returns
    -------
    dict
        ``{'theta': theta}`` where ``theta`` has one weight per feature
        column followed by the bias weight.

    Raises
    ------
    ValueError
        If ``data`` contains no rows (the averaged gradient is undefined).
    """
    feature_columns = [column for column in data.columns if column not in ['num', 'label']]
    train_data = np.array(data[feature_columns])
    labels = np.array(data['label'])
    if labels.size == 0:
        raise ValueError("logisticRegression needs at least one training row")

    # Append a constant-1 column so the last weight plays the role of the bias.
    bias = np.ones((data.shape[0],1))
    train_data = np.concatenate((train_data, bias), axis = 1)
    #theta's length is 1 + original column number
    theta = np.ones(train_data.shape[1])

    for iteration in range(num_iters):
        # The linear score is computed here and sigmoid() only applies the
        # non-linearity, so the same helper serves training and prediction.
        output = sigmoid(np.dot(train_data, theta))
        # Negative averaged gradient of the cross-entropy loss: adding it to
        # theta moves the weights downhill.
        descent_direction = -np.dot(train_data.T,(output - labels))/labels.size
        theta += alpha * descent_direction
        if log_every and iteration % log_every == 0:
            # Report metrics using the weights *after* this iteration's update.
            output = sigmoid(np.dot(train_data, theta))
            print ("{} iterations, loss is {}".format(iteration, loss(output, labels)))
            print ("Train accuracy is {}".format((predict_train(train_data, theta, 0.5) == labels).mean()))

    result = {'theta':theta}
    print('parameters:', theta)
    return result

# Train & Test

In [71]:
# Load the full dataset, then fit on the first 15 rows only; the remaining
# rows are kept aside for evaluation in the next cell.
data = pd.read_csv('watermelon.csv')
model = logisticRegression(
    data.iloc[:15],
    alpha=0.01,
    num_iters=10000,
)

0 iterations, loss is 0.9095877460739221
Train accuracy is 0.5333333333333333
1000 iterations, loss is 0.645078909348371
Train accuracy is 0.6666666666666666
2000 iterations, loss is 0.6336890071820886
Train accuracy is 0.7333333333333333
3000 iterations, loss is 0.6240511279507515
Train accuracy is 0.7333333333333333
4000 iterations, loss is 0.6154058983197058
Train accuracy is 0.7333333333333333
5000 iterations, loss is 0.6076303398220139
Train accuracy is 0.7333333333333333
6000 iterations, loss is 0.6006181910489847
Train accuracy is 0.6666666666666666
7000 iterations, loss is 0.5942771156356922
Train accuracy is 0.6666666666666666
8000 iterations, loss is 0.5885269838552352
Train accuracy is 0.6666666666666666
9000 iterations, loss is 0.5832982925255191
Train accuracy is 0.6666666666666666
parameters: [ 1.65874305  2.91677698 -1.30671982]


In [69]:
# Everything from row 15 onward was excluded from training and serves as the
# held-out test set.
test_data = data.iloc[15:]
test_labels = test_data['label']
print ("The test accuracy is {}".format(
    (predict(test_data, model['theta'], 0.5) == test_labels).mean()
))

The test accuracy is 0.5
