In [1]:
import pandas as pd
import numpy as np
import math
import itertools

In [2]:
#class that contains all the tools to run both binary and multinomial logistic regressions
class LogisticRegression():
    
    #initializes whether classes are binary or not, which preprocessing we would like to do
    #what our learning rate will be, and how many iterations
    def __init__ (self, binary = True, lr_preprocess=None, learning_rate = 0.01, iterations = 100):
        self.learning_rate = learning_rate    
        self.iterations = iterations
        self.binary = binary
        self.lr_preprocess = lr_preprocess
    
    #function that preprocesses features by standardizing or normalizing them and returns the preprocessed features
    #if is None, just return the raw features
    def preprocess_features(self, x):
        if self.lr_preprocess == 'standardize':
            return (x-np.mean(x,axis=0))/np.std(x,axis= 0)
        elif self.lr_preprocess == 'normalize':
            return (((x-np.min(x, axis=0))/(np.max(x, axis=0)-np.min(x, axis=0)))*-1)+1
        else:
            return x
    
    #this calculates and returns the current outputs of our linear function given the current weight and bias
    #used for binary classification
    def calc_linear_function(self, x):
        return np.dot(x, self.weight) + self.bias
    
    #this calculates and returns the current outputs of our linear functions for each class given the current weight and bias
    #used for multinomial/multiclass
    def calc_multiclass_linear_functions(self, x):
        return np.dot(x, self.weights.T) + self.biases.T
    
    #this function transforms our labels into a one hot encoded version of the labels, it returns this as the new y labels
    #this is used for multinomial/multiclass
    def one_hot(self, y, classes):
        one_hot_array = np.zeros(shape= (len(y), len(classes)))
        for index, y_val in enumerate(y):
            one_hot_array[index][np.argwhere(classes==y_val)]=1
        return one_hot_array
    
    #we use the the softmax function to estimate the probabilities of a sample being part of a certain class for multinomial/multiclass problems 
    def softmax(self, linear_function):
        return np.exp(linear_function)/np.sum(np.exp(linear_function),axis=1).reshape(-1,1)
    
    #we use the sigmoid function to estimate the probabilities of a sample being part of a certain class for binary classification
    def sigmoid(self, linear_function):
        return 1/(1+ np.exp(-linear_function))
    
    #this is the fit function for logistic regression. it accepts the training data and calculates the optimal weights and biases given the labels
    #first the data is preprocessed when set to do so. then we initialize the weights... for multiclass we one hot encode the labels and initialize weights
    #for each of the classes. next we iterate the set number of times, calculate the linear function(s), the predicted y values, the weight and bias
    #gradients, and update the weights by subtracting the gradient*learning rate.
    #we find the optimal weight using gradient descent on the sigmoid function for binary and softmax function for multiclass
    def fit(self, x, y):
        sample_count, feature_count = x.shape
        x = self.preprocess_features(x)
        if self.binary:
            self.weight = np.random.uniform(low=-0.01, high=0.01, size=feature_count)#np.zeros(feature_count)
            self.bias = np.random.uniform (low=-0.01, high=0.01)#0
        else:
            classes = np.unique(y)
            y = self.one_hot(y, classes)
            self.weights = np.zeros(shape = (len(classes), feature_count))
            self.biases = np.zeros(len(classes))
        for run in range(self.iterations):
            if self.binary:
                linear_function = self.calc_linear_function(x)
                y_predict = self.sigmoid(linear_function)
                weight_gradient = np.dot(x.T, y_predict - y)#*(1/sample_count)
                bias_gradient = np.sum(y_predict - y)#/sample_count
                self.weight -= weight_gradient*self.learning_rate
                self.bias -= bias_gradient*self.learning_rate
            else:
                linear_functions = self.calc_multiclass_linear_functions(x)
                y_predict = self.softmax(linear_functions)
                weight_gradient = np.dot((y_predict - y).T, x)#*(1/sample_count)
                bias_gradient = np.sum((y_predict - y).T)#/sample_count
                self.weights -= weight_gradient*self.learning_rate
                self.biases -= bias_gradient*self.learning_rate

    #this is the prediction method for logistic regression. we follow the same guidelines for calculating the predicted y value as the fit method
    #using sigmoid for binary and softmax for multiclass. for binary, we predict 1 if the predicted value is greater than 0.5, 0 else.
    #for multiclass we take the argmax of the predicted vals and set that as our predicted class
    def predict(self, x):
        x = self.preprocess_features(x)
        if self.binary:
            linear_function = self.calc_linear_function(x)
            y_predict = self.sigmoid(linear_function)
            return [1 if y > 0.5 else 0 for y in y_predict]
        else:
            linear_functions = self.calc_multiclass_linear_functions(x)
            y_predict = self.softmax(linear_functions)
            return np.argmax(y_predict, axis =1)

In [3]:
#class that contains all the tools to run both binary and multinomial adalines
class Adaline():
    
    #initializes whether classes are binary or not, which preprocessing we would like to do
    #what our learning rate will be, and how many iterations    
    def __init__(self, binary=True, adaline_preprocess=None, learning_rate = 0.01, iterations = 100):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.weight = None
        self.bias = None
        self.weight_temp = None
        self.bias_temp = None
        self.weights = []
        self.biases = []
        self.classes = None
        self.binary = binary
        self.adaline_preprocess = adaline_preprocess
        
    #function that preprocesses features by standardizing or normalizing them and returns the preprocessed features
    #if adaline_preprocess is None, just return the raw features
    def preprocess_features(self, x):
        if self.adaline_preprocess == 'standardize':
            return (x-np.mean(x,axis=0))/np.std(x,axis= 0)
        elif self.adaline_preprocess == 'normalize':
            return (((x-np.min(x, axis=0))/(np.max(x, axis=0)-np.min(x, axis=0)))*-1)+1
        else:
            return x

    #this calculates and returns the current outputs of our linear function given the current weight and bias
    #used for binary classification
    def calc_linear_function(self, x):
        return np.dot(x, self.weight) + self.bias

    #the activation function of adaline is linear so we don't alter it in any way
    def activation_function(self, z):
        return z
    
    #this function is used to calculate the cost of each prediction, we want this to decrease over iterations
    def cost_function(self, y, y_predict):
        cost = np.sum((y_predict-y)**2)/2.0
        return cost
    
    #this is the fit function for adaline. it accepts the training data and calculates the optimal weights and biases given the labels
    #first the data is preprocessed when set to do so. then we initialize the weights to some random number close to 0. next we iterate 
    #the set number of times, for binary, we calculate the linear function(s), put the linear function through an activation function 
    #which just returns the same thing for adaline, calculate the predicted y values, calculate the weight and bias
    #gradients, and update the weights and bias by subtracting the gradient*learning rate. for multiclass we use a one-v-all
    #approach, relabeling the data so one class is 1 and the rest are 0, and updating the weights and biases for each of the classes for each iteration
    #to find the optimal weights and bias we use gradient descent to minimize the sum of squared errors cost function
    def fit(self, x, y):
        sample_count, feature_count = x.shape
        weight_master = list(np.random.uniform(low=-0.01, high=0.01, size=feature_count)).copy()
        bias_master =np.random.uniform (low=-0.01, high=0.01)
        self.weight = weight_master 
        self.bias = bias_master
        x = self.preprocess_features(x)
        
        #print(x)
        if self.binary:
            for run in range(self.iterations):
                linear_function = self.calc_linear_function(x)
                y_predict = self.activation_function(linear_function)
                weight_gradient = np.dot(x.T, y_predict-y)
                bias_gradient = (y_predict-y).sum()
                self.weight -= weight_gradient*self.learning_rate
                self.bias -= bias_gradient*self.learning_rate
        else:
            self.classes = np.unique(y)
            for class_val in self.classes:
                self.weight = weight_master
                self.bias = bias_master
                y_temp = [1 if y_val==class_val else 0 for y_val in y]
                for run in range(self.iterations):
                    linear_function = self.calc_linear_function(x)
                    y_predict = self.activation_function(linear_function)
                    sampl = np.random.randint(0, len(y))
                    weight_gradient = x.T.dot(y_predict-y_temp)
                    bias_gradient = np.sum(y_predict-y_temp)
                    self.weight -= weight_gradient*self.learning_rate
                    self.bias -= bias_gradient*self.learning_rate
                self.weights.append(self.weight)
                self.biases.append(self.bias)
                
    #this is the prediction method for adaline. we follow the same guidelines for calculating the predicted y value as the fit method
    #for binary, we predict 1 if the predicted value is greater than 0, 0 else.
    #for multiclass we take the argmax of the y_predicts which act as conand set that as our predicted class.
    def predict(self, x):
        onevall_predictions = []
        predictions = []
        x = self.preprocess_features(x)
        if self.binary:
            linear_function = self.calc_linear_function(x)
            y_predict = self.activation_function(linear_function)
            return [1 if y_val >=0 else 0 for y_val in y_predict]
        else:
            for index, class_val in enumerate(self.classes):
                self.weight = self.weights[index]
                self.bias = self.biases[index]
                linear_function = self.calc_linear_function(x)
                y_predict = self.activation_function(linear_function)
                onevall_predictions.append(y_predict)
            for sample in range(len(onevall_predictions[0])):
                class_confidence_scores = [onevall_predictions[index][sample] for index in range(len(self.classes))]
                predictions.append(self.classes[np.argmax(class_confidence_scores)])
            return predictions

In [4]:
#imputes the data by calculatig the mean given a class and replacing all values in that class
#that have a feature of "?" with the conditional mean. We round the mean to 0 or 1 when the data
#is categorical
def impute (data, categorical):
    class_dfs = []
    classes = np.unique(data['target'])
    #goes through each label to calculate the conditional mean
    for label in classes:
        #change the "?" to nan so we can compute the mean skipping the nans
        data_with_label = data[data['target']==label].replace('?', np.nan).astype('float')
        means = data_with_label.mean(skipna=True)
        #goes through each feature to replace empty vals with conditional means
        #rounds the mean if the data is categorical
        for feature in data.columns:
            if categorical:
                replacement_val = round(means[feature])
            else:
                replacement_val = means[feature]
            data_with_label[feature] = data_with_label[feature].apply(lambda x: replacement_val if math.isnan(x) else x)
        class_dfs.append(data_with_label)
    return pd.concat(class_dfs).reset_index().drop(columns='index')
#split the data into 5 different sets and return each set
def validation_sets(data):
    sets = []
    sample_size = len(data)
    class_splits = []
    for class_val in np.unique(data['target']):
        df_class = data[data['target'] == class_val]
        dfs_class = np.array_split(df_class, 5)
        class_splits.append(dfs_class)
    for index in range(0, 5):
        sample_set_list = []
        for class_index in range(len(class_splits)):
            sample_set_list.append(class_splits[class_index][index])
        sample_set = pd.concat(sample_set_list)
        sets.append(sample_set.reset_index().drop(columns= 'index'))
    return sets

#this is the shell for all our models. it takes each set and tests it against the rest of the sets and returns the accuracy
#to evaluate our model. it also allows for  different preprocessing, learning rates and number of iterations
def k_fold_cross_validation(sets, binary, lr_preprocess, adaline_preprocess, lr_learning_rate, lr_iterations, adaline_learning_rate, adaline_iterations):
    lr_hyperparamaters = list(itertools.product(*[lr_learning_rate, lr_iterations]))
    lr_scores = [ [] for i in range(len(lr_hyperparamaters)) ]
    adaline_hyperparamaters = list(itertools.product(*[adaline_learning_rate, adaline_iterations]))
    adaline_scores = [ [] for i in range(len(adaline_hyperparamaters)) ]
    for index in range(0, len(sets)):
        test_set = sets[index]
        #concats the rest of the sets for training
        training_set = pd.concat([t_set for (set_index, t_set) in enumerate(sets) if set_index!=index])
        y_train = np.array(training_set['target'])
        #drop target so its not considered a feature
        x_train = training_set.reset_index().drop(columns= ['index','target']).values

        y_test = np.array(test_set['target'])
        x_test = test_set.reset_index().drop(columns= ['index','target']).values
        for index, hyperparamaters in enumerate(lr_hyperparamaters):
            lr = LogisticRegression(binary, lr_preprocess=lr_preprocess, learning_rate = hyperparamaters[0], iterations = hyperparamaters[1])
            lr.fit(x_train, y_train)
            y_predict = lr.predict(x_test)
            results = [y_predict[result_index]==actual for result_index, actual in enumerate(y_test)]
            lr_scores[index].append(results.count(True)/len(results))
        for index, hyperparamaters in enumerate(adaline_hyperparamaters):
            adaline = Adaline(binary, adaline_preprocess=adaline_preprocess, learning_rate = hyperparamaters[0], iterations = hyperparamaters[1])
            adaline.fit(x_train, y_train)
            y_predict = adaline.predict(x_test)
            results = [y_predict[result_index]==actual for result_index, actual in enumerate(y_test)]
            adaline_scores[index].append(results.count(True)/len(results))
    print('logistic regression accuracy:')
    for index, hyperparamaters in enumerate(lr_hyperparamaters):
        print(f'learning_rate = {hyperparamaters[0]} iterations = {hyperparamaters[1]}: {np.mean(lr_scores[index])}')
    print('adaline accuracy:')
    for index, hyperparamaters in enumerate(adaline_hyperparamaters):
        print(f'learning_rate = {hyperparamaters[0]} iterations = {hyperparamaters[1]}: {np.mean(adaline_scores[index])}')

## Breast Cancer

In [5]:
#importing breast cancer data and removing the id column
#"Missing Attribute Values: Denoted by "?"" so we need to impute those
breast_cancer = pd.read_csv('breast-cancer-wisconsin.data', sep=",", header=None)
breast_cancer.columns = ["id", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion",
               "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "target"]
breast_cancer = breast_cancer.drop(columns='id')

In [6]:
#our data labels 2 for benign and 4 for malignant, so let's relabel this to 0 and 1 respectively
breast_cancer['target'] = breast_cancer['target'].replace(2, 0).replace(4, 1)

#the missing attributes are numerical, so we impute the data by 
#finding the mean for that meature given the class
breast_cancer = impute(breast_cancer, False)

In [7]:
#'normalize', 'standardize', None
sets = validation_sets(breast_cancer)
k_fold_cross_validation(sets, binary = True, lr_preprocess = 'standardize', adaline_preprocess= 'standardize', lr_learning_rate = [0.001], lr_iterations = [50], adaline_learning_rate = [0.1], adaline_iterations = [25])


logistic regression accuracy:
learning_rate = 0.001 iterations = 50: 0.9685808313835255
adaline accuracy:
learning_rate = 0.1 iterations = 25: 0.9685502904667148


## House Votes

In [8]:
#importing house vote data
#Missing Attribute Values: Denoted by "?"
house_vote = pd.read_csv('house-votes-84.data', sep=",", header=None)
house_vote.columns = ["target", "handicapped-infants", "water-project-cost-sharing", "adoption-of-the-budget-resolution", "physician-fee-freeze",
               "el-salvador-aid", "religious-groups-in-schools", "anti-satellite-test-ban", "aid-to-nicaraguan-contras", "mx-missile", "immigration",
               "synfuels-corporation-cutback", "education-spending", "superfund-right-to-sue", "crime", "duty-free-exports", "export-administration-act-south-africa"]

In [9]:
#our labels are democrat and republican, so let's relabel them to 0 and 1 respectively
house_vote['target'] = house_vote['target'].replace('democrat', 0).replace('republican', 1)

#labels are represented as "y" or "n", so we change this to "1" and "0" respectively
house_vote = house_vote.replace("y", 1).replace("n", 0)
#we also have some "?" for missing data. Given that this data is categorical, let's take the mean given the class and round it to 0 or 1 for imputation
house_vote = impute(house_vote, True)

In [10]:
#'normalize', 'standardize', None
sets = validation_sets(house_vote)
k_fold_cross_validation(sets, binary = True, lr_preprocess = None, adaline_preprocess= None, lr_learning_rate = [0.01], lr_iterations = [250], adaline_learning_rate = [0.001], adaline_iterations = [10])


logistic regression accuracy:
learning_rate = 0.01 iterations = 250: 0.9700323199922238
adaline accuracy:
learning_rate = 0.001 iterations = 10: 0.9265078612913417


## Iris

In [11]:
#importing iris data
iris = pd.read_csv('iris.data', sep=",", header=None)
iris.columns = ["sepal length in cm", "sepal width in cm", "petal length in cm", "petal width in cm", "target"]

In [12]:
iris['target'] = iris['target'].replace('Iris-setosa', 0).replace('Iris-versicolor', 1).replace('Iris-virginica', 2)

In [13]:
#'normalize', 'standardize', None
sets = validation_sets(iris)
k_fold_cross_validation(sets, binary = False, lr_preprocess = None, adaline_preprocess= 'normalize', lr_learning_rate = [0.001], lr_iterations = [250], adaline_learning_rate = [0.001], adaline_iterations = [20])


logistic regression accuracy:
learning_rate = 0.001 iterations = 250: 0.9733333333333334
adaline accuracy:
learning_rate = 0.001 iterations = 20: 0.8800000000000001


## Soybean (small)

In [14]:
soybean = pd.read_csv('soybean-small.data', sep=",", header=None)

In [15]:
soybean.columns = ['date','plant-stand','precip','temp','hail','crop-hist','area-damaged','severity','seed-tmt','germination','plant-growth','leaves','leafspots-halo','leafspots-marg','leafspot-size','leaf-shread','leaf-malf','leaf-mild','stem','lodging','stem-cankers','canker-lesion','fruiting-bodies','external decay',
'mycelium','int-discolor','sclerotia','fruit-pods','fruit spots','seed','mold-growth','seed-discolor','seed-size','shriveling','roots', 'target']


In [16]:
soybean['target'] = soybean['target'].replace('D1', 0).replace('D2', 1).replace('D3', 2).replace('D4', 3)

In [17]:
soybean = impute(soybean, True)

In [18]:
soybean_new = pd.get_dummies(soybean.drop(columns = 'target'), columns =  ['date','plant-stand','precip','temp','hail','crop-hist','area-damaged','severity','seed-tmt','germination','plant-growth','leaves','leafspots-halo','leafspots-marg','leafspot-size','leaf-shread','leaf-malf','leaf-mild','stem','lodging','stem-cankers','canker-lesion','fruiting-bodies','external decay',
'mycelium','int-discolor','sclerotia','fruit-pods','fruit spots','seed','mold-growth','seed-discolor','seed-size','shriveling','roots'])

In [19]:
soybean_new['target'] = soybean['target']

In [20]:
#'normalize', 'standardize', None
sets = validation_sets(soybean_new)
k_fold_cross_validation(sets, binary = False, lr_preprocess = None, adaline_preprocess= None, lr_learning_rate = [0.1], lr_iterations = [50], adaline_learning_rate = [ 0.001], adaline_iterations = [250])


logistic regression accuracy:
learning_rate = 0.1 iterations = 50: 1.0
adaline accuracy:
learning_rate = 0.001 iterations = 250: 1.0


## Glass

In [21]:
glass = pd.read_csv('glass.data', sep=",", header=None)

In [22]:
glass.columns = ['id', 'ri', 'na', 'mg', 'al', 'si', 'k', 'ca', 'ba', 'fe', 'target']

In [23]:
glass = glass.drop(columns = 'id')
glass['target'] = glass['target'].replace(7, 0)

In [24]:
#'normalize', 'standardize', None
sets = validation_sets(glass)
k_fold_cross_validation(sets, binary = False, lr_preprocess = 'standardize', adaline_preprocess= 'standardize', lr_learning_rate = [0.1], lr_iterations = [20], adaline_learning_rate = [0.001], adaline_iterations = [500])


logistic regression accuracy:
learning_rate = 0.1 iterations = 20: 0.5534006174703849
adaline accuracy:
learning_rate = 0.001 iterations = 500: 0.49866069331185614
