# Supervised learning algorithms

In this document I am going to build an easy practice dataset, and practice classification using different supervised learning algorithms.

Things that I want to implement here:
- build easy dataset myself -- Check :)
- use sklearn to split train and test data (60, 20, 20) -- Check :)
- use the k-fold cross validation tactic to verify the statistical robustness of my algorithm's performance
- test performance

- implement k-nearest neighbour classification -- Check :)
- implement a decision tree
- implement a neural network
- implement a bayes classifier

Well, let's get started. Wish me luck and endurance! :)


In [24]:
# import packages
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import math

In [25]:
# Build mock dataset: 25 rows with two numeric features and one category label

feature1 = [
    1, 2, 3, 2, 3,
    3, 3, 3, 4, 5,
    5, 6, 6, 6, 6,
    7, 8, 8, 8, 8,
    8, 8, 9, 9, 9,
]
feature2 = [
    7, 2, 5, 3, 1,
    1, 4, 3, 3, 2,
    6, 2, 5, 7, 4,
    8, 5, 1, 4, 9,
    6, 7, 8, 7, 9,
]
outcome = [
    "sunflower", "tulip", "tulip", "tulip", "tulip",
    "sunflower", "sunflower", "tulip", "tulip", "sunflower",
    "tulip", "sunflower", "tulip", "sunflower", "cactus",
    "sunflower", "cactus", "cactus", "sunflower", "sunflower",
    "cactus", "sunflower", "cactus", "cactus", "cactus",
]

# The three lists become the columns of one frame, in this order:
# feature1, feature2, outcome
mock_data = pd.DataFrame(
    {
        "feature1": feature1,
        "feature2": feature2,
        "outcome": outcome,
    }
)

In [26]:
# Separate the output criterion from the input features.
# NOTE: the names are the reverse of the usual convention -- here `X` holds the
# "outcome" labels and `y` holds the feature columns. The later cells rely on
# exactly this naming.

X = mock_data['outcome']
y = mock_data.drop(columns='outcome')

In [27]:
# get separate train and test data (60% training, 40% held out)
# random_state pins the shuffle: without it every run produces a different split,
# so the optimized k and the printed predictions are not reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


# split the held-out data in half again (20% / 20% of the full data):
# one half for the cross validation tests, one half for the final test
X_test_for_cross_validation, X_final_test, y_test_for_cross_validation, y_final_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


# K-nearest neighbour algorithm

input needed:
- k (how many nearest neighbours) (either given or optimized within algorithm)
- old data training set including categories, new data point without category

pseudo-code:

    def KNearestNeighbourClassifier(train_data, k, test_data):
        for each row in train_data:
            calc distance from test_data
        get the k smallest distances
        get categories of the train_data rows that have those k smallest distances
        get most numerous category among those k train_data rows
        return(category)

In [28]:
# the classifier needs the features and the outcomes of the training rows in one frame;
# the rows are matched on their shared index labels
training_data = y_train.join(X_train, how="left")
training_data
test_data = [12, 10]


In [10]:
# define class
class KNearestNeighbourClassifier:
    """
    k-nearest-neighbour classifier on top of a pandas training frame.

    A new datapoint gets the category that is most common among the k training
    rows closest to it (Euclidean distance over all feature columns).

    Parameters
    ---------------
    trainDataDF: pd.DataFrame with numeric feature columns and an "outcome" column holding the category of each row
    k: number of nearest neighbours to use; if None, k is chosen by optimizeK()
    testDataDF: pd.DataFrame whose leading columns are the features of the validation cases (only needed when k is None)
    trueLabelsTestDataDF: true categories of the validation cases, in the same row order as testDataDF (only needed when k is None)
    """

    # constructor
    def __init__(self, trainDataDF, k=None, testDataDF=None, trueLabelsTestDataDF=None):
        self.trainDataDF = trainDataDF
        self.testDataDF = testDataDF
        self.trueLabelsTestDataDF = trueLabelsTestDataDF
        if k is None:
            self.k = self.optimizeK()
        else:
            self.k = k

    def __str__(self):
        return "K Nearest Neighbour Classifier Object"

    # define methods

    def optimizeK(self):
        """
        Find the k that classifies the validation cases best.

        Every k from 1 to the number of training rows is tried; the accuracy of a k is
        the share of validation cases whose predicted category equals the true label.

        Returns
        ---------------
        optimizedK: the smallest k that reaches the highest accuracy

        Raises
        ---------------
        ValueError: if the validation data or labels are missing or empty, if the training
                    data is empty, or if data and labels differ in length
        """
        if self.testDataDF is None or self.trueLabelsTestDataDF is None:
            raise ValueError("optimizing k needs both testDataDF and trueLabelsTestDataDF")
        if len(self.trainDataDF) == 0:
            raise ValueError("trainDataDF must contain at least one row")

        # accept the labels as Series/list, or as a one-column data frame
        trueLabels = self.trueLabelsTestDataDF
        if isinstance(trueLabels, pd.DataFrame):
            trueLabels = trueLabels.iloc[:, 0]
        trueLabels = list(trueLabels)

        # the leading columns of the validation frame are read by position as its features
        numberOfFeatures = len(self.__featureColumns())
        testPoints = [row[:numberOfFeatures] for row in self.testDataDF.to_numpy().tolist()]
        if not testPoints:
            raise ValueError("testDataDF must contain at least one row")
        if len(trueLabels) != len(testPoints):
            raise ValueError("testDataDF and trueLabelsTestDataDF must have the same length")

        # the distance ranking of a validation case does not depend on k,
        # so it is computed once per case instead of once per (case, k) pair
        rankedLabelsPerCase = [self.__rankedNeighbourLabels(point) for point in testPoints]

        # calculate the accuracy for each possible k, save it in a list
        accuracyForEachK = []
        for potentialK in range(1, len(self.trainDataDF) + 1):
            correctClassifiedCounter = sum(
                1
                for rankedLabels, trueLabel in zip(rankedLabelsPerCase, trueLabels)
                if self.__majorityVote(rankedLabels, potentialK) == trueLabel
            )
            accuracyForEachK.append(correctClassifiedCounter / len(testPoints))

        # list position p belongs to k = p + 1; index() returns the first (smallest) best k
        optimizedK = accuracyForEachK.index(max(accuracyForEachK)) + 1
        return optimizedK

    def predict(self, testDataList, k=None):
        """
        Return the predicted category for a new datapoint, using k-NN logic.

        Parameters
        ---------------
        testDataList: list with one number per feature column of the training data
        k: number of closest datapoints that the category should be derived from;
           if None, the k stored on the classifier is used

        Returns
        ---------------
        category: most common category among the k nearest training rows; if several
                  categories are equally common, the one whose nearest row is closest wins

        Raises
        ---------------
        ValueError: if k is smaller than 1 or larger than the number of training rows, or
                    if testDataList does not have one value per feature column
        """
        # Either use the k passed to this function, or the one prepared when instantiating a classifier
        chosenK = self.k if k is None else k
        return self.__majorityVote(self.__rankedNeighbourLabels(testDataList), chosenK)

    # helper function: every column of the training data except "outcome" is a feature
    def __featureColumns(self):
        return [column for column in self.trainDataDF.columns if column != "outcome"]

    # helper function: categories of all training rows, ordered from nearest to farthest from the new datapoint
    def __rankedNeighbourLabels(self, testDataList):
        featureColumns = self.__featureColumns()
        newPoint = list(testDataList)
        if len(newPoint) != len(featureColumns):
            raise ValueError(
                "expected " + str(len(featureColumns)) + " feature values, got " + str(len(newPoint))
            )

        # rows are handled by position, not by index label, so a training frame
        # with repeated index labels is still classified row by row
        trainPoints = self.trainDataDF[featureColumns].to_numpy().tolist()
        labels = self.trainDataDF["outcome"].tolist()
        distances = [self.__getEuclideanDistance(trainPoint, newPoint) for trainPoint in trainPoints]

        # sorted() is stable: equally distant rows keep their order in the training data
        nearestFirst = sorted(range(len(labels)), key=distances.__getitem__)
        return [labels[position] for position in nearestFirst]

    # helper function: most common category among the first k entries of a nearest-first list
    @staticmethod
    def __majorityVote(rankedLabels, k):
        if not 1 <= k <= len(rankedLabels):
            raise ValueError(
                "k must be between 1 and the number of training rows (" + str(len(rankedLabels)) + ")"
            )
        nearestLabels = rankedLabels[:k]
        # max() keeps the first of several equally frequent categories, i.e. the nearest one
        return max(nearestLabels, key=nearestLabels.count)

    # helper function: euclidean distance between two points with the same number of coordinates
    @staticmethod
    def __getEuclideanDistance(trainDataCoordinates, testDataCoordinates):
        squaredDifferences = [
            (trainCoordinate - testCoordinate) ** 2
            for trainCoordinate, testCoordinate in zip(trainDataCoordinates, testDataCoordinates)
        ]
        return math.sqrt(sum(squaredDifferences))



# test: two new datapoints to classify
test_data_1 = [0, 0]
test_data_2 = [12, 10]

# model1 gets validation data and no k, so it optimizes k itself;
# model2 is given a fixed k
model1 = KNearestNeighbourClassifier(
    trainDataDF=training_data,
    testDataDF=y_test_for_cross_validation,
    trueLabelsTestDataDF=X_test_for_cross_validation,
)
model2 = KNearestNeighbourClassifier(trainDataDF=training_data, k=15)

# both models classify the same point; test_data_2 is defined but not used here
print(model1.predict(test_data_1))
print(model2.predict(test_data_1))

tulip
sunflower


# Decision Tree Algorithm

input needed:
- best: categorical data
- old data training set including categories, new data point without category

pseudo-code:


In [22]:
# Build mock dataset: 25 rows with two categorical features and one category label

categoricalFeature1 = [
    "book", "book", "movie", "movie", "movie",
    "movie", "movie", "movie", "book", "movie",
    "book", "book", "movie", "book", "book",
    "book", "movie", "sleep", "book", "movie",
    "sleep", "book", "sleep", "sleep", "book",
]
categoricalFeature2 = [
    "male", "female", "female", "female", "male",
    "male", "female", "female", "male", "female",
    "male", "female", "female", "male", "female",
    "female", "male", "female", "female", "male",
    "female", "male", "female", "female", "female",
]
outcome = [
    "sunflower", "tulip", "tulip", "tulip", "tulip",
    "sunflower", "sunflower", "tulip", "tulip", "sunflower",
    "tulip", "sunflower", "tulip", "sunflower", "cactus",
    "sunflower", "cactus", "cactus", "sunflower", "sunflower",
    "cactus", "sunflower", "cactus", "cactus", "cactus",
]

# The three lists become the columns of one frame, in this order:
# categoricalFeature1, categoricalFeature2, outcome
mock_data = pd.DataFrame(
    {
        "categoricalFeature1": categoricalFeature1,
        "categoricalFeature2": categoricalFeature2,
        "outcome": outcome,
    }
)


# Separate the output criterion from the input features.
# NOTE: the names are the reverse of the usual convention -- here `X` holds the
# "outcome" labels and `y` holds the feature columns.

X = mock_data['outcome']
y = mock_data.drop(columns='outcome')

# get separate train and test data (60% training, 40% held out)
# random_state pins the shuffle: without it every run produces a different split,
# so results computed from the split are not reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


# split the held-out data in half again (20% / 20% of the full data):
# one half for the cross validation tests, one half for the final test
X_test_for_cross_validation, X_final_test, y_test_for_cross_validation, y_final_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


In [23]:
# the training frame needs the features and the outcomes side by side;
# the rows are matched on their shared index labels
training_data = y_train.join(X_train, how="left")
training_data
#test_data = [12,10]

Unnamed: 0,categoricalFeature1,categoricalFeature2,outcome
14,book,female,cactus
7,movie,female,tulip
23,sleep,female,cactus
24,book,female,cactus
18,book,female,sunflower
6,movie,female,sunflower
11,book,female,sunflower
0,book,male,sunflower
4,movie,male,tulip
21,book,male,sunflower


# Old programming stuff

In [None]:
# stuff from kNN classifier


def EuclideanDistance(int1, int2):
    """Return the Euclidean distance between two numbers on a line, as a float."""
    difference = int1 - int2
    # the square root of the squared difference is the size of the difference
    return math.sqrt(difference * difference)


# helper function: calculate euclidean distance with 2 coordinates of the train and two coordinates of the test data 
### todo: can I generalize the function to take in any input coordinate length?!
def getEuclideanDistance(trainData1Coordinate, trainData2Coordinate, testData1Coordinate, testData2Coordinate):
    """
    Return the Euclidean distance between a train point and a test point in two dimensions.

    The first and third argument are the two points' first coordinates, the second
    and fourth argument their second coordinates.
    """
    differenceFirstCoordinate = trainData1Coordinate - testData1Coordinate
    differenceSecondCoordinate = trainData2Coordinate - testData2Coordinate
    squaredDistance = differenceFirstCoordinate ** 2 + differenceSecondCoordinate ** 2
    return math.sqrt(squaredDistance)


def kNearestNeighbourClassifier(trainDataDF, k, testDataList):
    """
    Return the predicted category for a new datapoint, using k-NN logic on old datapoints that have categories.

    Parameters
    ---------------
    trainDataDF: pd.DataFrame whose first two columns are the numeric input features and which has an "outcome" column with the category
    k: number of closest datapoints that the category should be derived from
    testDataList: list with two numbers (that represent the feature 1 and 2 attribute)

    Returns
    ---------------
    category: most common category among the k nearest training rows; if several
              categories are equally common, the one whose nearest row is closest wins

    Raises
    ---------------
    ValueError: if k is smaller than 1 or larger than the number of training rows
    """
    numberOfRows = len(trainDataDF)
    if not 1 <= k <= numberOfRows:
        raise ValueError("k must be between 1 and the number of training rows (" + str(numberOfRows) + ")")

    # .iloc picks the feature columns by position; plain row[0] on a string-labelled
    # row relies on a positional fallback that pandas has deprecated
    firstFeatureValues = trainDataDF.iloc[:, 0].tolist()
    secondFeatureValues = trainDataDF.iloc[:, 1].tolist()
    categories = trainDataDF["outcome"].tolist()

    # calculate distance (helper function) from every old data point to the new datapoint;
    # rows are tracked by position, so repeated index labels cannot overwrite each other
    distances = [
        getEuclideanDistance(firstValue, secondValue, testDataList[0], testDataList[1])
        for firstValue, secondValue in zip(firstFeatureValues, secondFeatureValues)
    ]

    # positions of the k smallest distances; sorted() is stable, so equally
    # distant rows keep their order in the training data
    kMinPositions = sorted(range(numberOfRows), key=distances.__getitem__)[:k]

    # retrieve the categories of the k nearest rows, nearest first
    categoriesOfMinDistanceEntries = [categories[position] for position in kMinPositions]

    # retrieve most common category from k min distance cases, and return it as predicted category
    category = max(categoriesOfMinDistanceEntries, key=categoriesOfMinDistanceEntries.count)
    return category


# how to determine an optimized k

def optimizeK(trainDataDF, testDataDF, trueLabelsTestDataDF):
    """
    Find the k for which kNearestNeighbourClassifier classifies the test cases best.

    Parameters
    ---------------
    trainDataDF: pd.DataFrame passed on unchanged to kNearestNeighbourClassifier as training data
    testDataDF: pd.DataFrame whose first two columns are the features of the test cases
    trueLabelsTestDataDF: true categories of the test cases (Series or list), in the same row order as testDataDF

    Returns
    ---------------
    optimizedK: the smallest k (between 1 and the number of training rows) that reaches the highest accuracy

    Raises
    ---------------
    ValueError: if the training or test data is empty, or if test data and labels differ in length
    """
    trueLabels = list(trueLabelsTestDataDF)
    numberOfCases = len(testDataDF)
    if len(trainDataDF) == 0 or numberOfCases == 0:
        raise ValueError("trainDataDF and testDataDF must each contain at least one row")
    if len(trueLabels) != numberOfCases:
        raise ValueError("testDataDF and trueLabelsTestDataDF must have the same length")

    # .iloc picks the two feature columns by position; plain row[0] on a string-labelled
    # row relies on a positional fallback that pandas has deprecated
    testPoints = testDataDF.iloc[:, [0, 1]].to_numpy().tolist()

    accuracyForEachK = []
    # loop over all possible k
    for k in range(1, len(trainDataDF) + 1):
        # build a list with the predictions for each test case, given the k
        listOfPredictions = [kNearestNeighbourClassifier(trainDataDF, k, point) for point in testPoints]
        # calculate the accuracy for each k, save it in a list
        correctClassifiedCounter = sum(
            1 for trueLabel, prediction in zip(trueLabels, listOfPredictions) if trueLabel == prediction
        )
        accuracyForEachK.append(correctClassifiedCounter / numberOfCases)

    # list position p belongs to k = p + 1; index() returns the first (smallest) best k
    optimizedK = accuracyForEachK.index(max(accuracyForEachK)) + 1
    return optimizedK
