In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

In [2]:
def splitTrainTest(dataset, y, split=0.60):
    """ 
    Separating the dataset into 2 parts: Training Dataset (to train the model) & Test Dataset (to evaluate the performance of the model)
    The rows assigned to each dataset are randomly selected (to ensure that the model is objective).
    randrange() generate a random integer in the range between 0 and the size of the list.

    Parameters:
        dataset: The dataset to split as a list of lists
        split: Split percentage. (default split = 60%) --> A 60/40 for train/test
        
    Returns:
        train: 60% of the dataset
        test: The rows that remain in the copy of the dataset are then returned as the test dataset. (40%)
    """

    #calculate how many rows the training set requires
    xTrain = pd.DataFrame()
    yTrain = pd.DataFrame()
    trainSize = split * len(dataset)
    datasetCopy = dataset.copy()
    yCopy = y.copy()

    #add index column
    datasetCopy.reset_index(inplace=True)
    datasetCopy = datasetCopy.rename(columns={"index": "index"})
    yCopy.reset_index(inplace=True)
    yCopy = yCopy.rename(columns={"index": "index"})

    idxRan = len(datasetCopy)
    while len(xTrain) < trainSize: # while until the train dataset contains the target number of rows.
        randomIndex = np.random.choice(datasetCopy.index, 1, replace=False) #select random rows

        datasetCopy = datasetCopy.drop(datasetCopy["index"][randomIndex]) #remove random rows from the datasetCopy
        yCopy = yCopy.drop(yCopy["index"][randomIndex]) #remove random rows from the datasetCopy

        xTrain = pd.concat([xTrain, dataset.loc[randomIndex]]) #add rows to train dataset
        yTrain = pd.concat([yTrain, y.loc[randomIndex]]) #add rows to train dataset
        idxRan = idxRan - 1
    
    datasetCopy = datasetCopy.drop(labels=["index"], axis=1)
    yCopy = yCopy.drop(labels=["index"], axis=1)
    return xTrain, datasetCopy, yTrain, yCopy

In [3]:
def getData():
    """
    Args:
        file (str, optional): Location of Iris.csv file. Defaults to 'Iris.csv'.
        binary_version (bool, optional): Select if binary labels are used. Defaults to True.
            target variable will select the positive label.

    Returns:
        array: Features and labels in the las column.
        dict: Encodig of labels to string.
    """
    iris = datasets.load_iris()
    df = pd.DataFrame(iris.data, columns = iris.feature_names)
    y = pd.DataFrame(iris.target, columns = ["Target"])

    xTrain, xTest, yTrain, yTest = splitTrainTest(df,y)

    return xTrain, xTest, yTrain, yTest

In [4]:
class StandarScaler:
    """ 
    Standardize features by removing the mean and scaling to unit variance. 
    z = (x - MEAN) / DESV EST 
    """
    def __init__(self):
        pass

    def fit(self, X):
        X = pd.DataFrame(X)
        self.mean = X.mean(axis = 0).to_numpy()
        self.std = X.std(axis = 0).to_numpy()
        
    def transform(self, X):
        X -= self.mean
        X /= self.std
        return X 
    
    def fitTransform(self, X):
        self.fit(X)
        df = self.transform(X)
        return df 

In [5]:
def oneHot(y):
    """
    Converts the training data into a series of ones and zeros for the classes given.
    """
    yEncoded = np.zeros(shape=(y.size, int(y.max()[0])+1))
    y = y.Target.tolist()
    for i in range(len(y)): # rows
        yEncoded[i,y[i]] = 1 # sub fila, sub columna que diga y
    return yEncoded

In [15]:
class MultiLogisticReg():
    def __init__(self):
        pass

    def softMaxFunc(self, z):
        sigm = []
        for i in range(len(z)):
            lista = np.exp(z[i])/sum(np.exp(z[i]))
            sigm.append(lista.tolist())
        return sigm
    
    def getWeights(self,X,y):
        cantTargs = np.shape(y)[1]
        feats = np.shape(X)[1] # columnas 
        self.weight = np.random.rand(feats, cantTargs) # array FEATURE * CLASES

    def fit(self,X,y, rounds = 1000, lRate=0.01):
        """
        weighted sum of the inputs plus a bias term 
        For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ 
        handle multinomial loss; SOFTMAX
        """
        self.X = X 
        self.y = y
        
        self.getWeights(X,y)
        weight = self.weight # a optimizar # random initialization of w a

        weight = self.gradientDescent(rounds, weight, lRate, y)
            
    def gradientDescent(self, rounds, weight, lRate, y):
        losses = []
        rows = y.shape[0]
        count = 0
        while count < rounds:
            z = np.dot(self.X, weight) # (w·x)
            yG = self.softMaxFunc(z) # yˆ = σ(w·x) Ya son probs

            loss = self.lossFunction(z, y) # loss entre ygorrito y y train
            losses.append(loss)

            gradient = 1/ rows * np.dot(self.X.T, (y - yG))

            #updtate a lo weights
            weight -= lRate * gradient
            count += 1 

        self.lossSteps = losses
        self.weight = weight
        return weight

    def lossFunction(self, z, y):
        """
        Calculate cross-entropy loss
        The loss increases as the predicted probability diverge from the actual label.

        Parameters:
        yG:
        y:

        Returns:
        loss: Average cross entropy loss
        """

        # Y must be one-hot encoded
        rows = y.shape[0]
        loglLoss = 1/rows * (np.trace(np.dot(z, y.T)) + np.sum(np.log(np.sum(np.exp(z), axis=1))))
        return loglLoss

    def predict(self, X):
        z = np.dot(X, self.weight)
        P = self.softMaxFunc(z)
        predictions = np.argmax(P, axis=1)
        return predictions

    def predictProba(self, X):
        z = np.dot(X, self.weight)
        probs = self.softMaxFunc(z)
        return probs


In [7]:
xTrain, xTest, yTrain, yTest = getData()

In [9]:
ss = StandarScaler()

xTrain = ss.fitTransform(xTrain)
xTest = ss.fitTransform(xTest)

In [10]:
yTrain = oneHot(yTrain)
yTest = oneHot(yTest)

In [16]:
mlr = MultiLogisticReg()

In [17]:
mlr.fit(xTrain, yTrain)

In [18]:
yPred = mlr.predict(xTest)
yProbs = mlr.predictProba(xTest)

In [None]:
def accuracy(yTest, yPred):
    """
    Classification accuracy is a ratio of the number of correct predictions out of all predictions that were made.
    accuracy = TP+TN / FP+FN+TP+TN
    """
    # -- OPCION 1 --
    #TRUE positive
    TP = sum((yTest == 1) & (yPred == 1))

    #FALSE positive
    FP = sum((yTest == 0) & (yPred == 1))

    #FALSE positive
    FN = sum((yTest == 1) & (yPred == 0))

    #TRUE negative
    TN = sum((yTest == 0) & (yPred == 0))
    
    return (TP + TN)/(FP + FN + TP + TN)

    # -- OPCION 2 --
    correct = 0
    for i in range(len(yTest)):
        if yTest[i] == yPred[i]:
            correct += 1
    # return correct / float(len(yTest)) 

In [None]:
def precision(yTest, yPred):
    """
    Precision is the ratio between the true positives and all the points that are classified as positives.
    precision = TP/(TP + FP)
    """

    #TRUE positive
    TP = sum((yTest == 1) & (yPred == 1))

    #FALSE positive
    FP = sum((yTest == 0) & (yPred == 1))

    return TP / (TP + FP)

    # -- OPCION 2 --
    tp2 = 0
    fp2 = 0
    for i in range(len(yTest)):
        if yTest[i] == yPred[i]:
            tp2 += 1
            
        if yTest[i] != yPred[i]:
            fp2 += 1 
    # return float(tp2 / (tp2 + fp2))

In [None]:
def recall(yTest, yPred):
    """
    Recall is the measure of the model correctly identifying true positives. 
    recall = TP/(TP + FN)
    """

    #TRUE positive
    TP = sum((yTest == 1) & (yPred == 1))

    #FALSE negative
    FN = sum((yTest == 1) & (yPred == 0))

    return TP / float(TP + FN)

    # -- OPCION 2 --
    tp2 = 0
    fn2 = 0
    for i in range(len(yTest)):
        if yTest[i] == yPred[i]:
            tp2 += 1
            
        if yTest[i] != yPred[i]:
            fn2 += 1
    # return float(tp2 / (tp2 + fn2))

In [None]:
def F1score(yTest, yPred):
    """
    F1 score is the combination of precision and recall. 
    F1 score = (2 * precision * recall) / (precision + recall)
    """

    Precision = precision(yTest, yPred)
    Recall = recall(yTest, yPred)

    return (2 * Precision * Recall) / (Precision + Recall)

In [8]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

sc = StandardScaler()
X_train = sc.fit_transform(xTrain)
X_test = sc.transform(xTest)

oh = OneHotEncoder()
# y_Train =  yTrain #oh.fit_transform(yTrain).toarray()
# y_Test =  oh.fit_transform(yTest).toarray()

classifier = LogisticRegression(random_state = 0, solver='lbfgs', multi_class='auto')
classifier.fit(X_train, yTrain)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_probs = classifier.predict_proba(X_test)
y_pred

  y = column_or_1d(y, warn=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [19]:
yPred

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])