In [115]:
import numpy as np
import pandas as pd

In [165]:
class LogisticRegression:
    """Binary logistic regression fitted with momentum gradient descent.

    Constructing an instance runs the whole pipeline: ``data`` is randomly
    split into training / cross-validation / test partitions, the model is
    fitted on the training partition, the parameters that scored the best
    cross-validation accuracy are stored in ``thetas`` and their accuracy on
    the test partition is stored in ``testing_accuracy``.

    ``data`` must be a DataFrame whose last column holds the 0/1 label and
    whose other columns are the features.  The first feature column is never
    regularized, so it is the natural place for a constant bias column.
    """

    def __init__(self, data, percentage=0.5, seed=None):
        """Split ``data``, fit the model and evaluate it on the test partition.

        Parameters
        ----------
        data : pandas.DataFrame
            Feature columns followed by the label column.
        percentage : float
            Expected fraction of rows assigned to the training partition; the
            remaining rows are split roughly 50/50 into test and
            cross-validation partitions.
        seed : int or None
            When given, the splits are drawn from ``np.random.default_rng(seed)``
            and are reproducible.  When ``None`` the global ``np.random`` state
            is used.

        Raises
        ------
        ValueError
            If the random split leaves any of the three partitions empty.
        """
        self.data = data
        self.percentage = percentage
        rng = None if seed is None else np.random.default_rng(seed)

        self.train_X, self.train_y, self.sub_X, self.sub_y = self.split_data(self.data, self.percentage, rng)

        # Re-attach the labels so the held-out rows can be split a second time.
        held_out = pd.concat([self.sub_X, self.sub_y], axis=1)
        self.test_X, self.test_y, self.cv_X, self.cv_y = self.split_data(held_out, 0.5, rng)

        # The split is random, so a small data set can leave a partition empty;
        # fail with a clear message instead of an obscure error further down.
        partitions = (("training", self.train_X), ("cross-validation", self.cv_X), ("test", self.test_X))
        for label, part in partitions:
            if len(part) == 0:
                raise ValueError("random split produced an empty %s partition" % label)

        self.thetas = self.gradient_descent(self.train_X.values, self.train_y.values, self.cv_X.values, self.cv_y.values)

        self.testing_accuracy = self.get_test_acc(self.test_X.values, self.test_y.values, self.thetas)

    def split_data(self, data, percentage=0.5, rng=None):
        """Randomly partition the rows of ``data`` and separate X from y.

        Each row independently lands in the first partition with probability
        ``percentage``, so the partition sizes are only approximately
        proportional to ``percentage``.

        Parameters
        ----------
        data : pandas.DataFrame
            Feature columns followed by the label column.
        percentage : float
            Probability that a row is placed in the first partition.
        rng : object with a ``random(n)`` method, or None
            Source of uniform draws (e.g. ``np.random.default_rng(seed)``);
            ``None`` uses the global ``np.random`` state.

        Returns
        -------
        tuple
            ``(first_X, first_y, second_X, second_y)`` where the ``*_X`` items
            are all columns but the last and the ``*_y`` items are the last
            column.
        """
        draws = np.random.rand(len(data)) if rng is None else rng.random(len(data))
        in_first = draws < percentage
        first = data[in_first]
        second = data[~in_first]
        return first.iloc[:, :-1], first.iloc[:, -1], second.iloc[:, :-1], second.iloc[:, -1]

    def predict_proba(self, X, theta):
        """Return ``sigmoid(X @ theta)`` for every row of ``X``."""
        return self.sigmoid(np.dot(X, theta))

    def predict(self, X, theta):
        """Return a list of 0/1 predictions, using 0.5 as the decision threshold."""
        proba = np.ravel(self.predict_proba(X, theta))
        return (proba >= 0.5).astype(int).tolist()

    def accuracy(self, predict_arr, y):
        """Return the fraction of predictions that equal the labels.

        Parameters
        ----------
        predict_arr : sequence
            Predicted labels, one per sample.
        y : array-like
            True labels, either 1-D or 2-D; for 2-D input only the first
            column is compared.

        Raises
        ------
        ValueError
            If the number of predictions differs from the number of labels.
        ZeroDivisionError
            If ``y`` is empty.
        """
        labels = np.asarray(y)
        if labels.ndim > 1:
            labels = labels[:, 0]
        predicted = np.ravel(predict_arr)
        if predicted.shape != labels.shape:
            raise ValueError(
                "got %d predictions for %d labels" % (predicted.size, labels.size)
            )
        correct = int(np.count_nonzero(predicted == labels))
        return correct / len(labels)  # accuracy = # tp+tn / total

    def sigmoid(self, x):
        """Numerically stable logistic function ``1 / (1 + exp(-x))``.

        ``exp`` is only ever evaluated on non-positive values, so large
        magnitude inputs cannot overflow.
        """
        z = np.asarray(x, dtype=float)
        e = np.exp(-np.abs(z))  # always within [0, 1]
        out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
        return out[()]  # turns a 0-d result back into a scalar, leaves arrays alone

    def gradient(self, X, y, theta, lambdaa):  # lambdaa is regularization term
        """Gradient of the regularized logistic loss with respect to ``theta``.

        Computes ``X.T @ (sigmoid(X @ theta) - y) / n_samples`` and adds
        ``lambdaa * theta`` to every component except the first.

        Parameters
        ----------
        X : array-like, 2-D
            One row per sample.
        y : array-like
            Labels; reshaped to match the shape of the predictions.
        theta : numpy.ndarray
            Current parameters.
        lambdaa : float
            Regularization strength.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]  # average over samples (rows), not features
        yh = self.sigmoid(np.dot(X, theta))
        # Align y with yh so a 1-D y cannot silently broadcast against a column.
        residual = yh - np.reshape(y, np.shape(yh))
        grad = np.dot(X.T, residual) / n_samples
        grad[1:] += lambdaa * theta[1:]  # the first parameter is not regularized
        return grad

    def gradient_descent(self, X, y, cv_X, cv_y, learning_rate=0.01, max_iter=50000, beta=0.99, reg_term=0.5, verbose=True):
        """Fit ``theta`` by gradient descent with momentum.

        After every update the parameters are scored on the cross-validation
        set, and the parameters with the highest cross-validation accuracy seen
        so far are the ones returned.  Iteration stops once the gradient norm
        is no longer above ``1e-2`` or after ``max_iter`` updates, whichever
        comes first.

        Parameters
        ----------
        X, y : array-like
            Training features (2-D) and labels.
        cv_X, cv_y : array-like
            Cross-validation features (2-D) and labels.
        learning_rate : float
            Step size applied to the momentum term.
        max_iter : int
            Hard ceiling on the number of updates, since the gradient
            criterion may never be met.
        beta : float
            Momentum coefficient: weight of the previous update direction.
        reg_term : float
            Regularization strength passed to ``gradient``.
        verbose : bool
            When true, print the best and the final cross-validation accuracy.

        Returns
        -------
        numpy.ndarray
            Column vector with one parameter per feature.

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array")
        cv_X = np.asarray(cv_X, dtype=float)
        y = np.reshape(y, (-1, 1))  # creates two-dimensional array
        cv_y = np.reshape(cv_y, (-1, 1))

        theta = np.zeros((X.shape[1], 1))
        d_theta = np.zeros_like(theta)
        best_theta = theta
        cv_acc = 0
        max_cv_acc = 0  # maximum cross validation accuracy - records thetas at highest cv_acc
        eps = 1e-2

        for _ in range(max_iter):
            g = self.gradient(X, y, theta, reg_term)
            d_theta = (1 - beta) * g + beta * d_theta  # momentum
            theta = theta - learning_rate * d_theta

            cv_acc = self.accuracy(self.predict(cv_X, theta), cv_y)
            if cv_acc > max_cv_acc:  # checks if maximum accuracy thus far
                max_cv_acc = cv_acc
                best_theta = theta

            # Written as "not >" so that a NaN gradient also ends the loop.
            if not np.linalg.norm(g) > eps:
                break

        if verbose:
            print(max_cv_acc)
            print(cv_acc)
        return best_theta

    def get_test_acc(self, test_X, test_y, thetas):
        """Return the accuracy of ``thetas`` on the given test features and labels."""
        test_y = np.reshape(test_y, (-1, 1))

        return self.accuracy(self.predict(test_X, thetas), test_y)

In [None]:
# Seed the global NumPy RNG so the random train/cv/test split, and therefore
# the accuracy printed below, is the same on every run of this cell.
SEED = 42
np.random.seed(SEED)

# The file has no header row, so pandas labels the columns 0..k-1.
new_input = pd.read_csv('ionosphere.data', header=None)

# Encode the class label held in the last column: 'g' -> 1, 'b' -> 0.
label_col = new_input.columns[-1]
encoded_labels = new_input[label_col].map({'g': 1, 'b': 0})
if encoded_labels.isna().any():
    # Series.map turns values missing from the dict into NaN; fail loudly
    # rather than training on NaN labels.
    raise ValueError("label column contains values other than 'g' and 'b'")
new_input[label_col] = encoded_labels.astype(int)

# Constant first column so the first model parameter acts as the intercept.
new_input.insert(0, column='Bias', value=1)

log_reg = LogisticRegression(new_input)
print(log_reg.testing_accuracy)