In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np

In [None]:
# Load the iris dataset and gather it into one DataFrame: the feature columns
# are named by iris.feature_names and a 'target' column holds iris.target.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)

In [None]:
# One Hot Encoder
def oneHot(y, numClasses=None):
    """
    One-hot encode a vector of integer class labels.

    Parameters:
        y: Array-like of non-negative integer class labels (NumPy array, list,
           pandas Series, ...). It is flattened, so a column vector of shape
           (n, 1) is treated like a 1-D vector of length n.
        numClasses: Number of columns in the result. Defaults to max(y) + 1.
           Pass it explicitly to give separately encoded sets (e.g. train and
           test) the same width even when one of them lacks the highest class.

    Returns:
        Float array of shape (len(y), numClasses) in which row i has a 1 in
        column y[i] and 0 elsewhere. Empty input yields an array with 0 rows.

    Raises:
        ValueError: If a label is not integral, is negative, or is not smaller
        than numClasses.
    """
    labels = np.asarray(y).ravel()

    # max() is undefined on an empty array, so handle that case up front.
    if labels.size == 0:
        return np.zeros((0, 0 if numClasses is None else numClasses))

    # Labels are used as column indices, so they must be exact integers.
    if not np.issubdtype(labels.dtype, np.integer):
        asInt = labels.astype(np.intp)
        if not np.array_equal(asInt, labels):
            raise ValueError("oneHot expects integer class labels")
        labels = asInt

    # A negative label would silently index columns from the end.
    if labels.min() < 0:
        raise ValueError("oneHot expects non-negative class labels")

    if numClasses is None:
        numClasses = int(labels.max()) + 1
    elif labels.max() >= numClasses:
        raise ValueError("oneHot found a label that is >= numClasses")

    yEncoded = np.zeros((labels.size, numClasses))
    # Pair each row index with its label to set exactly one cell per row.
    yEncoded[np.arange(labels.size), labels] = 1
    return yEncoded

In [None]:
class StandarScaler:
    """
    Standardize features by removing the mean and scaling to unit variance.

        z = (x - mean) / std

    The per-column mean and standard deviation are learned by ``fit`` and
    stored in ``self.mean`` and ``self.std``. The standard deviation is
    computed with ``pandas.DataFrame.std``, whose default is the sample
    estimate (ddof=1).
    """
    def __init__(self):
        pass

    def fit(self, X):
        """
        Learn the per-column mean and standard deviation of X.

        Parameters:
            X: 2-D array-like or DataFrame with one column per feature.

        Returns:
            self, so calls can be chained.
        """
        frame = pd.DataFrame(X)
        self.mean = frame.mean(axis=0).to_numpy()
        self.std = frame.std(axis=0).to_numpy()
        return self

    def transform(self, X):
        """
        Standardize X with the statistics learned by ``fit``.

        The input is left untouched; a new object is returned (a DataFrame
        when X is a DataFrame, otherwise a NumPy array).

        Parameters:
            X: Data with the same columns as the data passed to ``fit``.

        Returns:
            The standardized data.
        """
        # A constant column has std == 0; divide by 1 instead so it maps to 0
        # rather than NaN/inf.
        scale = np.where(self.std == 0, 1.0, self.std)
        # Build a new object instead of using -= and /=, which would overwrite
        # the caller's data and fail on integer arrays.
        return (X - self.mean) / scale

    def fitTransform(self, X):
        """
        Fit on X and return the standardized copy of X.
        """
        self.fit(X)
        return self.transform(X)

In [None]:
# Split a dataset into a train and test set
def splitTrainTest(dataset, split=0.60, seed=None):
    """
    Randomly split a DataFrame into a training part and a test part.

    Rows are selected by position, so the split works for any index (not only
    the default 0..n-1 index).

    Parameters:
        dataset: pandas DataFrame to split. It is not modified.
        split: Fraction of rows assigned to the training set, between 0 and 1
            (default 0.60, i.e. a 60/40 train/test split). The number of
            training rows is ceil(split * len(dataset)).
        seed: Optional seed for a reproducible split. When None, NumPy's
            global random state is used, so a prior np.random.seed(...) call
            still controls the result.

    Returns:
        train: The randomly selected rows of ``dataset``, in the order they
            were drawn, keeping their original index labels.
        test: The remaining rows in their original order, taken from
            ``dataset.reset_index()``: the index is positional and the
            original row labels are kept in an extra leading column (named
            "index" when the original index has no name).

    Raises:
        ValueError: If ``split`` is not between 0 and 1.
    """
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got {}".format(split))

    nRows = len(dataset)
    # Ceiling: the training set takes the smallest whole number of rows that
    # is at least split * nRows.
    trainCount = int(np.ceil(split * nRows))

    # A single permutation gives a uniform sample without replacement and
    # avoids growing a DataFrame one row at a time.
    if seed is None:
        shuffled = np.random.permutation(nRows)
    else:
        shuffled = np.random.RandomState(seed).permutation(nRows)
    trainPositions = shuffled[:trainCount]

    train = dataset.iloc[trainPositions].copy()
    # After reset_index() the index labels equal the row positions, so the
    # sampled positions can be dropped by label.
    test = dataset.reset_index().drop(index=trainPositions)

    return train, test

In [4]:
#loss
def lossFunction(probs, target):
    """
    Binary cross-entropy loss for predictions against actual targets.

    Each prediction contributes -log(p) when its target equals 1 and
    -log(1 - p) otherwise; the contributions are averaged. Scalars and arrays
    are both accepted, and for a single scalar pair the average is just that
    pair's loss.

    Parameters:
        probs: Predicted probability of the positive class (scalar or
            array-like).
        target: Actual target values (scalar or array-like broadcastable
            against ``probs``). Any value other than 1 is treated as the
            negative class.

    Returns:
        loss: Average cross-entropy loss as a float.
    """
    probs = np.asarray(probs, dtype=float)
    target = np.asarray(target)

    # Pick the probability the model assigned to the class that actually
    # occurred, element by element, instead of branching on a single value.
    trueClassProb = np.where(target == 1, probs, 1 - probs)

    return np.mean(-np.log(trueClassProb))


def lossFunc(y, yG, eps=1e-15):
    """
    Calculate the cross-entropy loss between one-hot targets and predictions.

    The loss increases as the predicted probability diverges from the actual
    label.

    Parameters:
        y: One-hot encoded targets (array-like), same shape as ``yG``.
        yG: Predicted class probabilities (array-like).
        eps: Lower bound applied to ``yG`` before taking the logarithm, so a
            predicted probability of exactly 0 gives a large finite loss
            instead of inf, and 0 * log(0) does not turn the sum into NaN.

    Returns:
        loss: -sum(y * log(yG)) divided by ``yG.shape[0]``. For 2-D input of
        shape (nSamples, nClasses) that is the average loss per sample; for
        1-D input the divisor is the length of the vector.
    """
    y = np.asarray(y)
    yG = np.asarray(yG, dtype=float)

    rows = yG.shape[0]
    # Only values below eps are changed by the clip.
    safeProbs = np.clip(yG, eps, None)
    loss = -np.sum(y * np.log(safeProbs))

    return loss / float(rows)


# Quick check of lossFunc on one 3-element target vector (y) and one
# 3-element prediction vector (yG); the resulting loss is printed.
# Both arrays are 1-D, so yG.shape[0] inside lossFunc is 3 and the summed
# term is divided by 3.
y = np.array([0,0,1]) 
yG = np.array([0.1,0.1,0.8])

print(lossFunc(y, yG))

0.07438118377140324


In [None]:
#fit - gradient descent 