# 5. Logistic regression

In [3]:
from random import seed, randrange
from math import exp

from functions import *

### Define functions
* Logistic regression using stochastic gradient descent

In [4]:
# Make a logistic prediction, provided row and coefficients
def predict_logistic(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row)-1):
        yhat += coefficients[i + 1] * row[i]
    return 1.0 / (1.0 + exp(-yhat))

# Estimate logistic regression coefficients using stochastic gradient descent (SGD)
# Returns list of coefficients, with intercept at first index
# Batch size b = 1 means stochastic gradient descent, b > 1 means (mini) batch gradient descent
def coefficients_logistic_sgd(train, l_rate, n_epoch, b = 1):
    #coef = [0.0 for row in train]
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        # Sum squared errors to track epoch performance
        sum_error = 0
        # Counter to track mini batch
        i = 0
        # Counter to track overall row, to capture mid-batch end of dataset
        j = 0
        # SUM(h - y) * Xi over coefficients i, to multiply full batch by learning rate
        adjustments = [0 for col in range(len(train[0]))]
        for row in train:
            i += 1
            j += 1
            yhat = predict_logistic(row, coef)
            error = row[-1] - yhat
            # Add to overall epoch error
            sum_error += error**2
            # Add intercept error, since has no corresponding input Xi
            adjustments[0] += error
            for k in range(len(row)-1):
                # Begin with first coefficient, skipping intercept at k = 0
                adjustments[k + 1] += error * row[k]

            # Check whether this row is last in batch, or in overall dataset
            if i == b or j == len(train):
                # Iterate over all coefficients, including intercept
                for k in range(len(adjustments)):
                    coef[k] = coef[k] + l_rate * (1/i) * adjustments[k] * yhat * (1.0 - yhat)

                adjustments = [0 for col in range(len(train[0]))]
                i = 0
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch+1, l_rate, sum_error))
    return coef

# Logistic Regression Algorithm with Stochastic Gradient Descent
# Parameters: learning rate, number of epochs, (mini-) batch size
def logistic_regression_sgd(train, test, l_rate, n_epoch, b_size = 1):
    # Empty list to hold predictions matching test actual y's
    predictions = list()
    # Estimate coefficients using provided parameters and SGD
    coef = coefficients_logistic_sgd(train, l_rate, n_epoch, b_size)
    for row in test:
        # Predict yhat using estimated coefficients
        yhat = predict_logistic(row, coef)
        # Round to 0 or 1
        yhat = round(yhat)
        # Add integer yhat to list of predictions
        predictions.append(yhat)
    return predictions



### Testing logistic regression

In [5]:
# Contrived dataset for prediction testing
dataset =   [[2.7810836,2.550537003,0],
            [1.465489372,2.362125076,0],
            [3.396561688,4.400293529,0],
            [1.38807019,1.850220317,0],
            [3.06407232,3.005305973,0],
            [7.627531214,2.759262235,1],
            [5.332441248,2.088626775,1],
            [6.922596716,1.77106367,1],
            [8.675418651,-0.242068655,1],
            [7.673756466,3.508563011,1]]
# Hardcoded coefficients for testing
coef = [-0.406605464, 0.852573316, -1.104746259]
# Iterating over rows and predicting using "provided" coefficients
for row in dataset:
    yhat = predict_logistic(row, coef)
    print("Expected=%.3f, Predicted=%.3f [%d]" % (round(row[-1]), yhat, round(yhat)))

# Testing estimating coefficients using SGD on contrived dataset
print()
l_rate = 0.3
n_epoch = 100
coef = coefficients_logistic_sgd(dataset, l_rate, n_epoch)
print(coef)

# Logistic Regression using Stochastic Gradient Descent for Pima Indians
seed(1)
print("\nLinear Regression With Stochastic Gradient Descent for Wine Quality:")

# Load data and convert all columns to float
dataset = load_csv('data/pima-indians-diabetes.csv')
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# Normalize data (to between 0 and 1)
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

# Evaluate algorithm
n_folds = 5
l_rate = 0.1
n_epoch = 100
batch_size = 10
scores = evaluate_algorithm(dataset, logistic_regression_sgd, n_folds, accuracy_metric, l_rate, n_epoch, batch_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))


Expected=0.000, Predicted=0.299 [0]
Expected=0.000, Predicted=0.146 [0]
Expected=0.000, Predicted=0.085 [0]
Expected=0.000, Predicted=0.220 [0]
Expected=0.000, Predicted=0.247 [0]
Expected=1.000, Predicted=0.955 [1]
Expected=1.000, Predicted=0.862 [1]
Expected=1.000, Predicted=0.972 [1]
Expected=1.000, Predicted=0.999 [1]
Expected=1.000, Predicted=0.905 [1]

>epoch=1, lrate=0.300, error=2.217
>epoch=2, lrate=0.300, error=1.613
>epoch=3, lrate=0.300, error=1.113
>epoch=4, lrate=0.300, error=0.827
>epoch=5, lrate=0.300, error=0.623
>epoch=6, lrate=0.300, error=0.494
>epoch=7, lrate=0.300, error=0.412
>epoch=8, lrate=0.300, error=0.354
>epoch=9, lrate=0.300, error=0.310
>epoch=10, lrate=0.300, error=0.276
>epoch=11, lrate=0.300, error=0.248
>epoch=12, lrate=0.300, error=0.224
>epoch=13, lrate=0.300, error=0.205
>epoch=14, lrate=0.300, error=0.189
>epoch=15, lrate=0.300, error=0.174
>epoch=16, lrate=0.300, error=0.162
>epoch=17, lrate=0.300, error=0.151
>epoch=18, lrate=0.300, error=0.142


>epoch=18, lrate=0.100, error=132.056
>epoch=19, lrate=0.100, error=131.565
>epoch=20, lrate=0.100, error=131.084
>epoch=21, lrate=0.100, error=130.614
>epoch=22, lrate=0.100, error=130.153
>epoch=23, lrate=0.100, error=129.703
>epoch=24, lrate=0.100, error=129.262
>epoch=25, lrate=0.100, error=128.831
>epoch=26, lrate=0.100, error=128.409
>epoch=27, lrate=0.100, error=127.995
>epoch=28, lrate=0.100, error=127.590
>epoch=29, lrate=0.100, error=127.194
>epoch=30, lrate=0.100, error=126.806
>epoch=31, lrate=0.100, error=126.426
>epoch=32, lrate=0.100, error=126.053
>epoch=33, lrate=0.100, error=125.689
>epoch=34, lrate=0.100, error=125.331
>epoch=35, lrate=0.100, error=124.981
>epoch=36, lrate=0.100, error=124.637
>epoch=37, lrate=0.100, error=124.301
>epoch=38, lrate=0.100, error=123.971
>epoch=39, lrate=0.100, error=123.648
>epoch=40, lrate=0.100, error=123.330
>epoch=41, lrate=0.100, error=123.019
>epoch=42, lrate=0.100, error=122.714
>epoch=43, lrate=0.100, error=122.415
>epoch=44, l

>epoch=17, lrate=0.100, error=132.378
>epoch=18, lrate=0.100, error=131.777
>epoch=19, lrate=0.100, error=131.189
>epoch=20, lrate=0.100, error=130.616
>epoch=21, lrate=0.100, error=130.055
>epoch=22, lrate=0.100, error=129.507
>epoch=23, lrate=0.100, error=128.971
>epoch=24, lrate=0.100, error=128.448
>epoch=25, lrate=0.100, error=127.936
>epoch=26, lrate=0.100, error=127.436
>epoch=27, lrate=0.100, error=126.946
>epoch=28, lrate=0.100, error=126.468
>epoch=29, lrate=0.100, error=126.000
>epoch=30, lrate=0.100, error=125.543
>epoch=31, lrate=0.100, error=125.095
>epoch=32, lrate=0.100, error=124.657
>epoch=33, lrate=0.100, error=124.229
>epoch=34, lrate=0.100, error=123.809
>epoch=35, lrate=0.100, error=123.399
>epoch=36, lrate=0.100, error=122.997
>epoch=37, lrate=0.100, error=122.604
>epoch=38, lrate=0.100, error=122.219
>epoch=39, lrate=0.100, error=121.842
>epoch=40, lrate=0.100, error=121.473
>epoch=41, lrate=0.100, error=121.111
>epoch=42, lrate=0.100, error=120.757
>epoch=43, l

*Error continues to drop, so could train for a lot more epochs or increase learning rate (perhaps with some decay scheme), but it gets the point across...*