Logistic Regression Machine Learning Classification Algorithm Implementation.

(By: Harshith Shankar Tarikere Ravikumar (19230323) and Shyam Kumar Sodankoor (19230735))


In [23]:
import pandas as ps
ps.options.mode.chained_assignment = None
import numpy as np

In [24]:
class preprocessing:
    """Helpers for loading, normalizing and splitting a tabular data set."""

    def __init__(self):
        pass

    def shuffle_split_data(self, X, train_size):
        """Randomly partition the rows of X into a training and a test part.

        Author: Harshith Shankar Tarikere Ravikumar.

        Args:
            X: Data frame (or array) whose first axis indexes the samples.
            train_size: Fraction of the rows, between 0 and 1 inclusive, that
                goes into the training part.

        Returns:
            Tuple ``(X_train, X_test)``. The rows are chosen at random but
            keep their original relative order inside each part.

        Raises:
            ValueError: If train_size is outside the range [0, 1].
        """
        if not 0 <= train_size <= 1:
            raise ValueError("train_size must be between 0 and 1")

        n_rows = X.shape[0]
        n_train = int(round(train_size * n_rows))

        # Pick exactly n_train random rows. Thresholding random numbers against
        # their own percentile with a strict "<" can never select every row,
        # so train_size=1.0 would still leave one sample in the test part.
        split = np.zeros(n_rows, dtype=bool)
        split[np.random.permutation(n_rows)[:n_train]] = True

        X_train = X[split]
        X_test = X[~split]

        return X_train, X_test

    def normalize_df(self, df):
        """Min-max scale every column of df to the range [0, 1].

        Author: Shyam Kumar Sodankoor.

        Each column is scaled with its own minimum and maximum. A column whose
        values are all equal has a zero range; it is mapped to 0 instead of
        producing NaN from a 0/0 division.

        Args:
            df: Data frame of numeric values.

        Returns:
            The normalized data frame.
        """
        lowest = df.min()
        value_range = df.max() - lowest
        # Dividing by 1 for constant columns leaves their (all zero) numerator.
        value_range = np.where(value_range == 0, 1, value_range)
        df_norm = (df - lowest) / value_range
        return df_norm

    def split_data(self, filePath, column_names, seperator, feature_starting_index, train_data_size=0.67, transpose_required=True):
        """Read a delimited file and return normalized train and test frames.

        Author: Harshith Shankar Tarikere Ravikumar.

        The file is expected to hold a set of columns of non-required data
        (like sample_id), followed by the features, followed by the label as
        the last column. It may also be stored transposed. To load separate
        train and test files, call this once with ``train_data_size=1.0`` and
        use only the first returned frame, and once with ``train_data_size=0``
        and use only the second.

        Args:
            filePath: Path (or file-like object) passed to ``read_csv``.
            column_names: List of names for all columns after the optional
                transpose.
            seperator: Field separator of the file.
            feature_starting_index: Position of the first feature column.
            train_data_size: Fraction of the samples used for training.
            transpose_required: Transpose the table after reading when True.

        Returns:
            Tuple ``(data_train, data_test)``; each frame holds the normalized
            feature columns (as floats) followed by the label column.
        """
        df = ps.read_csv(filePath, sep=seperator, header=None)
        if transpose_required:
            df = df.T

        df.columns = column_names

        data = df.iloc[:, feature_starting_index:]

        # Build new frames instead of assigning into a slice of df: this avoids
        # chained-assignment problems and gives the features a real float
        # dtype (after a transpose every column is of object dtype).
        features = data.iloc[:, 0:-1].astype(float)
        features = self.normalize_df(features)
        data = ps.concat([features, data.iloc[:, -1:]], axis=1)

        data_train, data_test = self.shuffle_split_data(data, train_data_size)

        return data_train, data_test

In [25]:
class LogisticRegression:
    """Multi-class logistic regression trained with batch gradient descent.

    One binary classifier is trained for every pair of class labels
    (one-vs-one) and the pairwise predictions are combined by voting.

    Author: Harshith Shankar Tarikere Ravikumar.
    """

    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=False):
        """Store the training hyper-parameters.

        Args:
            lr: Learning rate of the gradient descent update.
            num_iter: Number of gradient descent iterations per class pair.
            fit_intercept: Prepend a bias column of ones to the features.
            verbose: Stored on the instance; training itself prints nothing.
        """
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose

    def add_theta0(self, X):
        """Return X as a float array with a leading column of ones (the bias).

        Author: Harshith Shankar Tarikere Ravikumar.
        """
        X = np.asarray(X, dtype=float)
        theta0 = np.ones((X.shape[0], 1))
        return np.concatenate((theta0, X), axis=1)

    def sigmoid_function(self, z):
        """Return the element-wise sigmoid ``1 / (1 + exp(-z))`` of z.

        Author: Harshith Shankar Tarikere Ravikumar.
        """
        # Clip so that exp() cannot overflow a float64 for extreme inputs.
        z = np.clip(np.asarray(z, dtype=float), -500, 500)
        return 1 / (1 + np.exp(-z))

    def data_processing(self, data, label_name):
        """Prepare one binary training set per pair of classes.

        Author: Harshith Shankar Tarikere Ravikumar.

        Fills ``self.classes_dict``: the keys are the 2-tuples of all class
        combinations, the values are copies of the rows of ``data`` belonging
        to either class, with the label column recoded to 0 for the first
        class of the tuple and 1 for the second.

        Args:
            data: Data frame that holds only the feature columns and the
                label column.
            label_name: Name of the label column.
        """
        import itertools

        self.label_name = label_name
        self.labels = data[label_name]
        # dict.fromkeys keeps first-seen order, so the class order (and with
        # it the pair keys) is the same on every run, unlike set().
        self.unique_labels = list(dict.fromkeys(self.labels))
        self.classes_list = list(itertools.combinations(self.unique_labels, 2))
        self.classes_dict = {}

        for binclass in self.classes_list:
            in_pair = (self.labels == binclass[0]) | (self.labels == binclass[1])
            # Work on an explicit copy and recode in a single step: chained
            # indexed assignment may silently modify a temporary, and two
            # sequential replacements can clobber numeric labels.
            subset = data[in_pair].copy()
            subset[label_name] = np.where(subset[label_name] == binclass[1], 1, 0)
            self.classes_dict[binclass] = subset

    def fit(self):
        """Train one binary classifier per class pair with gradient descent.

        Author: Harshith Shankar Tarikere Ravikumar.

        For every class pair the features X and the 0/1 outputs y are taken
        from ``self.classes_dict`` (see ``data_processing``), the bias column
        is added when ``fit_intercept`` is set, theta starts at zero and is
        updated ``num_iter`` times by subtracting ``lr`` times the gradient of
        the prediction ``sigmoid(X . theta)``. The result is stored in
        ``self.theta_dict`` keyed by the class pair; ``self.theta`` keeps the
        weights of the last pair trained.
        """
        self.theta_dict = {}

        for binclass in self.classes_list:
            subset = self.classes_dict[binclass]
            # Select the features by dropping the label column by name, and
            # convert once to float arrays so the loop runs on numeric dtypes.
            X = np.asarray(subset.drop(columns=self.label_name), dtype=float)
            y = np.asarray(subset[self.label_name], dtype=float)

            if self.fit_intercept:
                X = self.add_theta0(X)

            # weights initialization
            theta = np.zeros(X.shape[1])

            for _ in range(self.num_iter):
                h = self.sigmoid_function(np.dot(X, theta))
                gradient = np.dot(X.T, (h - y)) / y.size
                theta = theta - self.lr * gradient

            self.theta_dict[binclass] = theta
            self.theta = theta

    def predict(self, X):
        """Predict the class label of every row of X.

        Author: Shyam Kumar Sodankoor.

        The bias column is added when ``fit_intercept`` is set. For each class
        pair the probabilities ``sigmoid(X . theta)`` are stored in
        ``self.new_dict`` and their rounded 0/1 values in ``self.preds_dict``
        (both keyed by the class pair); the final label per sample is then
        chosen by ``oneVsone``.

        Args:
            X: Feature rows whose labels are to be predicted.

        Returns:
            List with one predicted label per row of X.
        """
        self.preds_dict = {}
        self.new_dict = {}

        X = np.asarray(X, dtype=float)

        if not self.theta_dict:
            # Training saw fewer than two classes, so there is nothing to
            # vote on: every sample gets the single known label.
            return list(self.unique_labels[:1]) * X.shape[0]

        if self.fit_intercept:
            X = self.add_theta0(X)

        for binclass, theta in self.theta_dict.items():
            preds_prob = self.sigmoid_function(np.dot(X, theta))
            self.preds_dict[binclass] = preds_prob.round()
            self.new_dict[binclass] = preds_prob

        predictions = self.oneVsone()
        return predictions

    def oneVsone(self):
        """Combine the pairwise predictions into one label per sample.

        Author: Shyam Kumar Sodankoor.

        Every pairwise classifier in ``self.preds_dict`` votes for one of its
        two classes and the class with the most votes wins. When two or more
        classes share the highest vote count, the classifiers between those
        tied classes are compared and the class predicted by the most
        confident one (probability closest to 0 or 1) is returned.

        Returns:
            List with one predicted label per sample.
        """
        if not self.preds_dict:
            return []

        pairs = list(self.preds_dict)
        n_samples = len(self.preds_dict[pairs[0]])
        predictions = []

        for i in range(n_samples):
            votes = dict.fromkeys(self.unique_labels, 0)
            for pair in pairs:
                winner = pair[0] if self.preds_dict[pair][i] == 0.0 else pair[1]
                votes[winner] += 1

            max_vote = max(votes.values())
            tied = [label for label, count in votes.items() if count == max_vote]

            pred_class = tied[0]
            if len(tied) > 1:
                # Tie-break: among classifiers whose two classes are both
                # tied, trust the one whose probability is closest to 0 or 1.
                best_pair = None
                best_margin = None
                for pair in pairs:
                    if pair[0] in tied and pair[1] in tied:
                        prob = self.new_dict[pair][i]
                        margin = min(prob, 1 - prob)
                        if best_margin is None or margin < best_margin:
                            best_margin = margin
                            best_pair = pair
                if best_pair is not None:
                    if self.new_dict[best_pair][i] > 0.5:
                        pred_class = best_pair[1]
                    else:
                        pred_class = best_pair[0]

            predictions.append(pred_class)

        return predictions

    def accuracy(self, predicted_labels, actual_labels):
        """Return the fraction of predicted labels equal to the actual labels.

        Author: Shyam Kumar Sodankoor.

        Args:
            predicted_labels: Labels returned by ``predict``.
            actual_labels: True labels of the same samples, in the same order.

        Raises:
            ValueError: If the two inputs do not have the same shape.
        """
        # Compare as arrays so plain lists work as well as pandas objects.
        predicted = np.asarray(predicted_labels)
        actual = np.asarray(actual_labels)
        if predicted.shape != actual.shape:
            raise ValueError("predicted_labels and actual_labels must have the same length")
        return (predicted == actual).mean()

In [26]:
# Preprocessing helper object, named 'prepro'.
prepro = preprocessing()

# Logistic Regression object, named 'model', with lr = 0.1 and num_iter = 300.
model = LogisticRegression(num_iter=300, lr=0.1)

In [27]:
# Train and evaluate on 10 different random divisions of the data, collecting
# the accuracy of each division so that the average can be reported.
model_prediction = []

# Column layout of the data file: an id column, the features, then the label.
columns = [
    "sample_id",
    "length",
    "width",
    "thickness",
    "surface_area",
    "mass",
    "compactness",
    "hardness",
    "shell_top_radius",
    "water_content",
    "carbohydrate_content",
    "variety",
]

for i in range(10):
    data_train, data_test = prepro.split_data(
        filePath="/ hazelnuts.txt",
        column_names=columns,
        seperator="\t",
        feature_starting_index=1,
        train_data_size=0.67,
        transpose_required=True,
    )

    # The label is the last column; everything before it is a feature.
    X_train, y_train = data_train.iloc[:, 0:-1], data_train.iloc[:, -1]
    X_test, y_test = data_test.iloc[:, 0:-1], data_test.iloc[:, -1]

    model.data_processing(data_train, label_name="variety")
    model.fit()

    # preds_dict holds the predicted labels for the test rows of this division.
    preds_dict = model.predict(X_test)
    split_accuracy = model.accuracy(preds_dict, y_test)
    model_prediction.append(split_accuracy)

print("Logistic Regression repeated with 10 different random divisions of data : ", model_prediction)
print("Logistic Regression with an average accuracy  : ", np.mean(model_prediction))

FileNotFoundError: [Errno 2] File b'/hazelnuts.txt' does not exist: b'/hazelnuts.txt'

In [None]:
# classification report helper from scikit-learn
from  sklearn.metrics import classification_report

In [None]:
# Print the classification report for the most recent train/test division:
# y_test and preds_dict still hold the actual and predicted labels of the
# last iteration of the evaluation loop.
print(classification_report(y_test, preds_dict))