In [1]:
import numpy as np
from numpy.linalg import eig
from numpy.linalg import inv,pinv
import pandas as pd
import csv
from collections import defaultdict
from functools import partial
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from IPython.display import display
import math
%run common_functions.ipynb
%run naive_bayes_classifier.ipynb

In [57]:
class LogisticRegression:
    """One-vs-all logistic regression trained with batch gradient descent.

    After ``fit`` has run:
      * ``self.class_labels`` holds the sorted unique labels seen in training;
      * ``self.theta`` is a ``(n_classes, n_features + 1)`` array with one
        weight row per class; column 0 is the intercept added by ``add_ones``.
    """

    def __init__(self, cost_thresold = 0.001):
        """Store the cost below which gradient descent stops early.

        The parameter keeps its original (misspelt) name because callers may
        pass it by keyword.
        """
        self.cost_thresold = cost_thresold

    def add_ones(self, X):
        """Return a copy of 2-D ``X`` with a leading column of ones (intercept)."""
        return np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)

    def sigmoid(self, x, theta):
        """Return the logistic function of ``np.dot(x, theta.T)``.

        Only ``exp(-|z|)`` is evaluated, so the exponential can never overflow
        (the naive ``1 / (1 + exp(-z))`` warns for very negative ``z``).
        """
        z = np.dot(x, theta.T)
        decay = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function.
        return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    def findCost(self, theta, X, y):
        """Return the mean binary cross-entropy of ``theta`` on ``(X, y)``.

        ``y`` holds 0/1 (or boolean) targets. Probabilities are clipped away
        from exactly 0 and 1 so a saturated sigmoid yields a finite cost
        instead of ``nan`` (``0 * log(0)``).
        """
        y = np.asarray(y, dtype=float)
        eps = np.finfo(float).eps
        probabilities = np.clip(self.sigmoid(X, theta), eps, 1 - eps)
        total_cost = y * np.log(probabilities) + (1 - y) * np.log(1 - probabilities)
        return -np.mean(total_cost)

    def findGradient(self, theta, X, y):
        """Return the gradient of the mean cross-entropy with respect to ``theta``."""
        y = np.asarray(y, dtype=float)
        m = len(y)
        return (1 / m) * np.dot(X.T, (self.sigmoid(X, theta) - y))

    def gradientDescent(self, class_index, X, y, learning_rate=0.1, maximum_iteration=5000):
        """Run gradient descent for one class's weight row.

        ``self.theta[class_index]`` is a view, so the row is updated in place
        as well as returned. ``X`` is used as given (no intercept column is
        added here).

        Returns:
            (costs, class_theta): the cost measured before each update, and
            the final weight row.
        """
        class_theta = self.theta[class_index]
        costs = []

        for _ in range(maximum_iteration):
            gradient = self.findGradient(class_theta, X, y)
            cost = self.findCost(class_theta, X, y)
            class_theta -= learning_rate * gradient
            costs.append(cost)

            # Stop early once the cost is below the configured threshold.
            if abs(cost) < self.cost_thresold:
                break

        return costs, class_theta

    def fit(self, X, y, learning_rate=0.1, maximum_iteration=5000):
        """Train one binary classifier per distinct label in ``y``.

        Args:
            X: 2-D feature array without the intercept column.
            y: labels, one per row of ``X``; flattened after comparison with
                each class label.
            learning_rate: step size passed to ``gradientDescent``.
            maximum_iteration: iteration cap passed to ``gradientDescent``.

        Returns:
            dict mapping class index (position in ``self.class_labels``) to
            the list of costs recorded while training that class.
        """
        X = self.add_ones(X)

        n = X.shape[1]
        self.class_labels = np.unique(y)
        k = len(self.class_labels)
        self.theta = np.zeros((k, n))
        class_costs = {}

        # printProgressBar is not defined in this class; presumably it is
        # provided by common_functions.ipynb -- verify.
        # Initial call to print 0% progress
        printProgressBar(0, k, prefix = 'Logistic Regression Progress:', suffix = 'Complete', length = 50)

        for class_label_index in range(k):
            one_vs_all_class = (y == self.class_labels[class_label_index]).flatten()
            costs, self.theta[class_label_index] = self.gradientDescent(class_label_index, X, one_vs_all_class, learning_rate, maximum_iteration)
            class_costs[class_label_index] = costs
            # Report progress only after this class has actually been trained.
            printProgressBar(class_label_index + 1, k, prefix = 'Logistic Regression Progress:', suffix = 'Complete', length = 50)

        return class_costs

    def make_predictions(self, X, y):
        """Predict a label for every row of ``X`` and score against ``y``.

        Classes are ranked by raw linear score rather than by sigmoid output.
        The sigmoid is monotonic, so the winner is the same, except that
        probabilities that saturate to exactly 1.0 no longer tie (a tie made
        ``argmax`` fall back to the lowest class index).

        Returns:
            (error_percentage, predictions): percentage of rows whose
            prediction differs from ``y.flatten()``, and the predicted labels.
        """
        X = self.add_ones(X)

        scores = np.dot(X, self.theta.T)
        predictions = self.class_labels[np.argmax(scores, axis = 1)]
        error_percentage = 100 - np.mean(predictions == y.flatten()) * 100

        return error_percentage, predictions

In [58]:
# Read the digits data set. QUOTE_NONNUMERIC makes the csv reader convert every
# unquoted field to float, so the resulting array is numeric.
with open('digits.csv', 'r') as csvfile:
    digitDataset = np.asarray([row for row in csv.reader(csvfile, quoting=csv.QUOTE_NONNUMERIC)])

# The last column is the class label; everything before it is a feature.
# Both slices are kept 2-D.
x, y = digitDataset[:, :-1], digitDataset[:, -1:]
print("x: ", x.shape)
print("y: ", y.shape)
print(np.unique(y))


x:  (1797, 64)
y:  (1797, 1)
[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]


In [59]:
# Initialize logistic regression with its default cost threshold of 0.001.
# The gradient-descent iteration cap (default 5000) is an argument of fit(),
# not of the constructor.
lg = LogisticRegression()
# # initialize Naive Bayes
# nb = naive_bayes_classifier()

# # k-fold cross validation
# k = 10
# # get the train and test indices for k-fold cross validation
# k_folded_train_indices, k_folded_test_indices = get_k_fold_indices(k, y.shape[0], shuffle=True)

# folds_errors = []

# for fold, train_indices in enumerate(k_folded_train_indices):
#     print(f'Fold#{fold + 1} of {k}:')
#     test_indices = k_folded_test_indices[fold]
    
#     train_x = x[train_indices]
#     test_x = x[test_indices]
#     train_y = y[train_indices]
#     test_y = y[test_indices]
    
#     class_costs = lg.fit(train_x, train_y)
#     nb.fit(train_x, train_y)

#     train_error, train_predictions = lg.make_predictions(train_x, train_y)
#     test_error, test_predictions = lg.make_predictions(test_x, test_y)
#     nb_train_accuracy, nb_train_predictions = nb.predict(train_x, train_y)
#     nb_test_accuracy, nb_test_predictions = nb.predict(test_x, test_y)
# #     print(train_accuracy)
#     folds_errors.append([train_error, test_error, 100 - nb_train_accuracy, 100 - nb_test_accuracy])

# errors_df = pd.DataFrame(folds_errors, columns = ["LG Train errors(%)", "LG Test errors(%)", "NB Train errors(%)", "NB Test errors(%)"])
# errors_df

In [60]:
# folds_errors = np.asarray(folds_errors)
# labels = ["lg_train", "lg_test", "nb_train", "nb_test"]

# for i in range(folds_errors.shape[1]):
#     plt.plot(folds_errors[:, i], label=labels[i])
    
# plt.xlabel("#fold")
# plt.ylabel("Error(%)")
# plt.title(f"Logistic regression and Naive Bayes Classifier on {k}-fold cross validation")
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.)
# plt.show()

In [61]:
# 80-20 split
# train_indices, test_indices = get_train_test_indices_by_train_percentage(80, y.shape[0], shuffle=True)
# train_x = x[train_indices]
# test_x = x[test_indices]
# train_y = y[train_indices]
# test_y = y[test_indices]

# class_costs = lg.fit(train_x, train_y)

# split_train_error, split_train_predictions = lg.make_predictions(train_x, train_y)
# split_test_error, split_test_predictions = lg.make_predictions(test_x, test_y)

# split_errors = [[split_train_error, split_test_error]]

# split_errors_df = pd.DataFrame(split_errors, columns = ["LG Train errors(%)", "LG Test errors(%)"])
# print(split_errors_df)

Logistic Regression Progress: |--------------------------------------------------| 0.0% CompleteLogistic Regression Progress: |█████---------------------------------------------| 10.0% CompleteLogistic Regression Progress: |██████████----------------------------------------| 20.0% Complete



Logistic Regression Progress: |██████████████████████████████████████████████████| 100.0% Complete
   LG Train errors(%)  LG Test errors(%)
0             1.32128           3.342618
