In [None]:
import pandas as pd
import numpy as np
import matplotlib.style as style

# Running all the pre-processing libraries as modules
%run Preprocessing_BCW_Data.ipynb
%run Preprocessing_Adult_Data.ipynb
%run Preprocessing_Ionosphere_Data.ipynb
%run preprocessing_mpg_dataset.ipynb

import matplotlib.pyplot as plt

In [None]:
def kFoldCrossVal(k, X, y, rate, iter):
    """Estimate logistic-regression accuracy with k-fold cross-validation.

    The rows are split, in their existing order (no shuffling), into ``k``
    contiguous folds of ``len(y) // k`` rows each. Every fold is held out
    once as the test set while a model is fitted from zero-initialised
    parameters on all remaining rows. Any ``len(y) % k`` leftover rows at the
    end of the data are only ever used for training.

    Args:
        k: Number of folds; must satisfy ``2 <= k <= len(y)``.
        X: 2-D numpy array of features, one row per sample.
        y: numpy array of labels aligned with the rows of ``X``.
        rate: Learning rate passed to ``LogisticRegression.fit``.
        iter: Number of gradient-descent iterations passed to
            ``LogisticRegression.fit``. (The name shadows the builtin inside
            this function; it is kept so existing callers keep working.)

    Returns:
        The mean of the per-fold accuracies.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than ``len(y)``,
            since a fold or its training set would then be empty.
    """
    n_samples = len(y)
    if k < 2 or k > n_samples:
        raise ValueError(
            "k must be between 2 and the number of samples ({}), got {}".format(n_samples, k)
        )

    model = LogisticRegression()
    fold_size = n_samples // k
    total_accuracy = 0

    for fold in range(k):
        start = fold * fold_size
        stop = start + fold_size

        # Held-out fold.
        test_x = X[start:stop]
        test_y = y[start:stop]

        # Everything outside the held-out fold. Plain slicing is used on
        # purpose: an open-ended slice such as ``stop:`` inside ``np.r_`` is
        # not resolved against the array length, so it cannot express
        # "all rows after the fold".
        train_x = np.concatenate((X[:start], X[stop:]))
        train_y = np.concatenate((y[:start], y[stop:]))

        # Fresh zero parameters for every fold, shaped to match the labels
        # (a column vector when ``y`` is a column vector).
        initial_params = np.zeros((X.shape[1],) + y.shape[1:])

        _, fitted_params = model.fit(train_x, train_y, initial_params, rate, iter)
        predictions = model.predict(test_x, fitted_params)
        total_accuracy += model.evaluate_acc(test_y, predictions)

    return total_accuracy / k

In [None]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The class keeps no state: parameters are passed in and returned
    explicitly by every method instead of being stored on the instance.
    """

    # Small constant added inside the logarithms of the cost so that a
    # probability of exactly 0 or 1 never produces log(0).
    _LOG_EPSILON = 1e-5

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

        Maps real-valued scores to probabilities between 0 and 1.
        """
        # For large negative x, exp(-x) overflows to inf; 1 / (1 + inf) is
        # still the correct limit 0.0, so only the spurious overflow
        # RuntimeWarning is silenced here.
        with np.errstate(over="ignore"):
            return 1 / (1 + np.exp(-x))

    def find_cost(self, X, y, weights):
        """Return the mean cross-entropy cost of ``weights`` on ``(X, y)``.

        The sigmoid is applied to the weighted sum ``X @ weights`` and the
        result is compared with the 0/1 labels ``y``.
        """
        squashed = self.sigmoid(X @ weights)
        positive_term = (-y).T @ np.log(squashed + self._LOG_EPSILON)
        negative_term = (1 - y).T @ np.log(1 - squashed + self._LOG_EPSILON)
        return (1 / len(y)) * (positive_term - negative_term)

    def fit(self, X, y, params, learning_rate, iterations):
        """Optimise ``params`` with full-batch gradient descent.

        Args:
            X: Feature matrix, one row per sample.
            y: 0/1 labels aligned with the rows of ``X``.
            params: Initial parameters; not modified in place.
            learning_rate: Step size of each update.
            iterations: Number of update steps to run.

        Returns:
            A tuple ``(cost_history, params)``: an ``(iterations, 1)`` array
            holding the cost after each update, and the final parameters.
        """
        cost_history = np.zeros((iterations, 1))
        for i in range(iterations):
            gradient = X.T @ (self.sigmoid(X @ params) - y)
            params = params - (learning_rate / len(y)) * gradient
            cost_history[i] = self.find_cost(X, y, params)

        return (cost_history, params)

    def predict(self, X, params):
        """Return 0/1 class predictions for the rows of ``X``.

        The predicted probability is rounded with ``np.round``, which rounds
        halves to the nearest even value, so a probability of exactly 0.5
        is classified as 0.
        """
        return np.round(self.sigmoid(X @ params))

    def evaluate_acc(self, test_y, predicted_y):
        """Return the fraction of entries of ``predicted_y`` equal to ``test_y``.

        The fraction is taken over ``len(test_y)`` samples.
        """
        test_y = np.asarray(test_y)
        predicted_y = np.asarray(predicted_y)
        n_samples = len(test_y)
        # An (m, 1) column compared with an (m,) vector would broadcast to an
        # (m, m) matrix and silently inflate the count; align shapes first.
        if test_y.shape != predicted_y.shape and test_y.size == predicted_y.size:
            predicted_y = predicted_y.reshape(test_y.shape)
        return np.sum(predicted_y == test_y) / n_samples

# Candidate gradient-descent iteration counts (10, 20, ..., 100) for the
# grid-search hyperparameter tuning later on
iterations = list(range(10, 101, 10))

# The following iteration list was used for the mpg data set since it requires more iterations for higher accuracy
# for i in range(10,100):
#     iterations.append(10*i)

# Candidate learning rates for the grid-search hyperparameter tuning later on:
# 1-9 times each power of ten from 1e-4 up to 1e-1 (36 values, ascending).
# Each value is parsed from its decimal text so that it is bit-identical to
# the written-out literal (e.g. 0.0003), which float multiplication such as
# 3 * 0.0001 does not guarantee.
learning_rate = [float("{}e-{}".format(digit, exponent))
                 for exponent in (4, 3, 2, 1)
                 for digit in range(1, 10)]

# Instantiate the logistic regression model as a module-level object
lr = LogisticRegression()

'''
The following commented out code are sequences of X and y arrays which correspond to
the features and then the classifying outputs of the data sets.
The first one will initialise the numpy arrays for X and y for the adult income data set.
The second one will initialise the numpy arrays for X and y for the ionosphere data set.
The third one will initialise the numpy arrays for X and y for the MPG data set.
The fourth one will initialise the numpy arrays for X and y for the breast cancer data set.
'''
# features = df.drop(['salary'] , axis=1)
# X = features.values
# output = df['salary']
# y = output.values

# features = df_ion.drop(['classification'] , axis=1)
# X = features.values
# output = df_ion['classification']
# y = output.values

# features = DataFrame.drop(['mpg'] , axis=1)
# X = features.values
# output = DataFrame['mpg']
# y = output.values

# Breast cancer data set: every column except 'Class' is a feature and
# 'Class' is the output to classify.
features = df_bcw.drop(columns=['Class'])
output = df_bcw['Class']
X = features.values
y = output.values


# The following plots the dataset
# sns.set_style('white')
# fig = sns.scatterplot(X[:,0],X[:,1],hue=y.reshape(-1));

# Reshape the outputs into an (m, 1) column, prepend a column of ones to the
# features (the intercept term) and start from all-zero parameters, one per
# column of X.
m = len(y)
y = y.reshape(m, 1)
X = np.concatenate((np.ones((m, 1)), X), axis=1)
n = X.shape[1]
params = np.zeros((n, 1))

# Hyperparameter tuning using grid search
optimal_score = 0
optimal_rate = 0
optimal_iter = 0
# Evaluate every (learning rate, iteration count) combination in the grid with
# 5-fold cross-validation and keep the first combination that reaches the
# highest mean accuracy.
for rate in learning_rate:
    # The loop variable is deliberately not called `iter`: at module level
    # that would rebind the builtin iter() for the rest of the session.
    for n_iter in iterations:
        score = kFoldCrossVal(5, X, y, rate, n_iter)
        if score > optimal_score:
            optimal_score = score
            optimal_rate = rate
            optimal_iter = n_iter

print("Optimal score is {}, with rate: {} and {} iterations".format(optimal_score, optimal_rate, optimal_iter))

'''
The following tests accuracy of the data against the size of the data set. 
It splits the dataset into tenths, and then every iteration joins a tenth together, and
finally tests it for the whole dataset for the tenth iteration.
'''
# accuracy_vals = []
# size_of_dataset = []
# for i in range(1,11):
#     feats = X[:int(i*(len(y)/10))]
#     outputs = y[:int(i*(len(y)/10))]
#     accuracy = kFoldCrossVal(5, feats, outputs, optimal_rate, optimal_iter )
#     print(accuracy, len(outputs))
#     accuracy_vals.append(accuracy)
#     size_of_dataset.append(len(outputs))
    
# plt.plot(size_of_dataset,accuracy_vals,'-ro')
# plt.xlabel('Size of the Data Set')
# plt.ylabel('Accuracy Values')
# # plt.show()




In [None]:

'''
The following experiments evaluate learning rate against accuracy. Using the
optimal number of iterations found in the previous hyperparameter tuning, they
compute the accuracy for each learning rate in a given range and plot it.
'''
# One list of learning rates per decade: learning_rates0 holds 1e-5 .. 9e-5,
# learning_rates1 holds 1e-4 .. 9e-4, and so on up to learning_rates4 with
# 0.1 .. 0.9. Values are parsed from decimal text so they are bit-identical
# to the written-out literals.
(learning_rates0, learning_rates1, learning_rates2,
 learning_rates3, learning_rates4) = (
    [float("{}e-{}".format(digit, exponent)) for digit in range(1, 10)]
    for exponent in (5, 4, 3, 2, 1)
)


####### plotting learning rate values in the (0.00001's) #######

# accuracy_values0 = []
# for rate in learning_rates0:
#     accuracy = kFoldCrossVal(5, X, y, rate, optimal_iter)
#     accuracy_values0.append(accuracy)

# plt.plot(learning_rates0, accuracy_values0)
# plt.title('Accuracy of Learning rates in 0.00001s ')
# plt.xlabel('Learning Rates')
# plt.ylabel('Accuracy')
# plt.show()

###### plotting learning rate values in the (0.0001's) #######
# accuracy_values1 = []
# for rate in learning_rates1:
#     accuracy = kFoldCrossVal(5, X, y, rate, optimal_iter)
#     accuracy_values1.append(accuracy)

# plt.plot(learning_rates1, accuracy_values1)
# plt.title('Accuracy of Learning rates in 0.0001s ')
# plt.xlabel('Learning Rates')
# plt.ylabel('Accuracy')
# plt.show()

# # ######## plotting learning rate values in the (0.001's) #######
# accuracy_values2 = []
# for rate in learning_rates2:
#     accuracy = kFoldCrossVal(5, X, y, rate, optimal_iter)
#     accuracy_values2.append(accuracy)
    
# plt.plot(learning_rates2, accuracy_values2)
# plt.title('Accuracy of Learning rates in 0.001s ')
# plt.xlabel('Learning Rates')
# plt.ylabel('Accuracy')
# plt.show()


# ######## plotting learning rate values in the (0.01's)#######
# # accuracy_values3 = []    
# # for rate in learning_rates3:
# #     accuracy = kFoldCrossVal(5, X, y, rate, optimal_iter)
# #     accuracy_values3.append(accuracy)

# # plt.plot(learning_rates3, accuracy_values3)
# # plt.title('Accuracy of Learning rates in 0.01s ')
# # plt.xlabel('Learning Rates')
# # plt.ylabel('Accuracy')
# # plt.show()
    

#### plotting accuracy on learning rate values in the (0.1's) #######
# accuracy_values4 = []    
# for rate in learning_rates4:
#     #used 300 as it is the optimal 
#     accuracy = kFoldCrossVal(5, X, y, rate, optimal_iter)
#     accuracy_values4.append(accuracy)

# plt.plot(learning_rates4, accuracy_values4)
# plt.title('Accuracy of Learning rates in 0.1s ')
# plt.xlabel('Learning Rates')
# plt.ylabel('Accuracy')
# plt.show()
