In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:
# Load the dataset
test_feature = pd.read_csv("question-2-test-features.csv", header=None)
test_label = pd.read_csv("question-2-test-labels.csv", header=None)
train_feature = pd.read_csv("question-2-train-features.csv", header=None)
train_label = pd.read_csv("question-2-train-labels.csv", header=None);

In [3]:
def append_one(x):
    b = np.ones((np.shape(x)[0],np.shape(x)[1]+1))
    b[:,1:] = x
    return b

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def div(n, d):
    return n / d if d else 0

In [4]:
# Obtain training data and label
x_train = append_one(train_feature.values)
y_train = train_label.values
y_train[y_train < 190] = 0 
y_train[y_train >= 190] = 1
y_train = np.transpose(y_train)[0]

In [5]:
# Obtain test data and label
x_test = append_one(test_feature.values)
y_test = test_label.values
y_test[y_test < 190] = 0 
y_test[y_test >= 190] = 1
y_test = np.transpose(y_test)[0]

In [6]:
############################################### LOGISTIC REGRESSION ############################################################

In [7]:
def logistic_regression(x_train,y_train,x_test,y_test,epoch,batch_size,learning_rate):
    weights = np.zeros(x_train.shape[1])
    sample_count = x_train.shape[0]
    # Gradient Ascent
    for e in range(epoch):
        gradient = 0
        for i in range(sample_count):
            difference = y_train[i] - sigmoid(np.dot(x_train[i],weights))
            gradient += np.dot(x_train[i].T, difference)
            if (i != 0 and i%batch_size == 0) or i == sample_count-1:
                weights += learning_rate*gradient
                gradient = 0             
    return weights

In [8]:
def calculate_performance(x,y,weights,is_test):
    y_pred = np.dot(x,weights)
    y_pred[y_pred >= 0] = 1
    y_pred[y_pred < 0] = 0
    accuracy = np.sum(y_pred==y) / len(y) * 100
    if not is_test:
        print("Training accuracy = " + str(accuracy))
    else:
        print("Test accuracy = " + str(accuracy))
        # Confusion Matrix values
        tp = int(np.sum(y_pred * y))
        tn = int(np.sum(y_pred+y==0))
        fp = int(np.sum(y_pred-y==1))
        fn = int(np.sum(y-y_pred==1))
        
        print("True Positive: " + str(tp))
        print("True Negative: " + str(tn))
        print("False Positive: " + str(fp))
        print("False Negative: " + str(fn))
        print("Total = " + str(tp+fp+fn+tn))
        print("Precision: " + str(div(tp,(tp+fp))))
        print("Recall: " + str(div(tp,(tp+fn))))
        print("NPV: " + str(div(tn,(tn+fn))))
        print("FPR: " + str(div(fp,(fp+tn))))
        print("FDR: " + str(div(fp,(tp+fp))))
        print("F1: " + str( div( (2*div(tp,(tp+fp))), (div(tp,(tp+fp))+div(tp,(tp+fn)))   ) ) )
        print("F2: " + str( div( (5*div(tp,(tp+fp))), (4*div(tp,(tp+fp))+div(tp,(tp+fn)))   ) ) )
        print()
    return y_pred

In [9]:
################################################# PERFORMANCE ##############################################################

In [15]:
learning_rates = [1e-3, 1e-2, 1e-1]

In [16]:
# Full Batch

for i in range(len(learning_rates)):
    print("Learning rate = " + str(learning_rates[i]))
    w = logistic_regression(x_train,y_train,x_test,y_test,epoch=1000,batch_size=14000,learning_rate=learning_rates[i])
    calculate_performance(x_train,y_train,w,False)
    calculate_performance(x_test,y_test,w,True)

Learning rate = 0.001
Training accuracy = 63.07142857142857
Test accuracy = 47.75014801657786
True Positive: 0
True Negative: 1613
False Positive: 0
False Negative: 1765
Total = 3378
Precision: 0
Recall: 0.0
NPV: 0.4775014801657786
FPR: 0.0
FDR: 0
F1: 0
F2: 0

Learning rate = 0.01
Training accuracy = 63.07142857142857
Test accuracy = 47.75014801657786
True Positive: 0
True Negative: 1613
False Positive: 0
False Negative: 1765
Total = 3378
Precision: 0
Recall: 0.0
NPV: 0.4775014801657786
FPR: 0.0
FDR: 0
F1: 0
F2: 0

Learning rate = 0.1
Training accuracy = 63.07142857142857
Test accuracy = 47.75014801657786
True Positive: 0
True Negative: 1613
False Positive: 0
False Negative: 1765
Total = 3378
Precision: 0
Recall: 0.0
NPV: 0.4775014801657786
FPR: 0.0
FDR: 0
F1: 0
F2: 0



In [20]:
optimal_learning_rate = 0.001

In [21]:
# Mini Batch
w = logistic_regression(x_train,y_train,x_test,y_test,epoch=1000,batch_size=32,learning_rate=optimal_learning_rate)
calculate_performance(x_train,y_train,w,False)
calculate_performance(x_test,y_test,w,True)

Training accuracy = 68.79285714285714
Test accuracy = 76.79100059206631
True Positive: 1398
True Negative: 1196
False Positive: 417
False Negative: 367
Total = 3378
Precision: 0.7702479338842976
Recall: 0.7920679886685552
NPV: 0.7651951375559821
FPR: 0.25852448853068816
FDR: 0.22975206611570248
F1: 0.9860335195530727
F2: 0.9943661971830985



array([1., 1., 1., ..., 1., 1., 1.])