# Ce se întamplă în cazul clasificarii binare daca se modifică pragul de decizie din 0.5 în alte valori. Cum se poate aprecia calitatea clasificatorului pentru diferite valori ale pragului?

#### Raspuns: Daca pragul este mai mare (de exemplu 0.8) atunci sansele ca algoritmul sa prezica corect label-ul pozitiv sunt mai mici. Aceasta situatie poate conduce de exemplu la clasificarea unor pacienti bolnavi ca fiind sanatosi, ceea ce nu este de dorit. Daca pragul este mai mic, sansele ca cei bolnavi sa fie clasificati corect sunt mai mari. (nu este atat de grav daca pacientii sanatosi sunt considerati bolnavi) 

In [17]:
def clasificationPerformance(ground_truth, computed_values):
    """
    Returneaza TN (True Negative), FP(False Positive), FN(False Negative), TP(True Positive)
    """
    TN = 0
    FP = 0
    FN = 0
    TP = 0
    
    for i in range(0, len(ground_truth)):
        #consideram malign = positive, benign = negative 
        if ground_truth[i] == "M":
            if computed_values[i] == "M":
                TP += 1
            else:
                FP += 1
        else:
            if computed_values[i] == "B":
                TN += 1
            else:
                FN += 1
    return TN, FP, FN, TP


def getAccuracy(TN, FP, FN, TP):
    """ 
    accuracy represents the overall performance of classification model:
    (TP+TN)/(TN+FP+FN+TP)
    """
    return (TP+TN)/(TN+FP+FN+TP)

def getPrecision(FP, TP):
    """
    precision indicates how accurate the positive predictions are 
    TP/(TP+FP)
    """
    return TP/(TP+FP)

def getRecall(TP, FN):
    """ 
    recall indicates the coverage of actual positive sample
    TP/(TP+FN)
    """
    return TP/(TP+FN)


In [18]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import csv
import os
import matplotlib.pyplot as plt
import numpy as np 
from sklearn import linear_model
import pandas as pd 
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from scipy.special import expit
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
def readData():
    X = breast_cancer_wisconsin_diagnostic.data.features 
    y = breast_cancer_wisconsin_diagnostic.data.targets 
    data = {
        "radius": preprocessing.normalize([X["radius1"]])[0],
        "texture": preprocessing.normalize([X["texture1"]])[0],
        "diagnosis": y["Diagnosis"]
    }
    dataFrame = pd.DataFrame(data)
    return dataFrame

def plotDataDistribution(dataFrame):
    fig, axes = plt.subplots(nrows=3, figsize=(15,15))
    axes[0].hist(dataFrame["radius"])
    axes[0].set_title("Radius")
    axes[1].hist(dataFrame["texture"])
    axes[1].set_title("Texture")
    axes[2].hist(dataFrame["diagnosis"])
    axes[2].set_title("Diagnosis")
    plt.show()

def plotData(dataFrame):
    ind_malign = [i for i in range(0, dataFrame.shape[0]) if dataFrame["diagnosis"].iloc[i] == 'M'] 
    ind_benign = [i for i in range(0, dataFrame.shape[0]) if dataFrame["diagnosis"].iloc[i] == 'B']
    radius_malign = [dataFrame["radius"].iloc[i] for i in ind_malign]
    radius_benign = [dataFrame["radius"].iloc[i] for i in ind_benign]

    texture_malign = [dataFrame["texture"].iloc[i] for i in ind_malign]
    texture_benign = [dataFrame["texture"].iloc[i] for i in ind_benign]

    fig, axes = plt.subplots()
    axes.plot(radius_malign, texture_malign, 'ro')
    axes.plot(radius_benign, texture_benign, 'go')
    
    axes.set_xlabel('Radius')
    axes.set_ylabel('Texture')
    axes.set_title('Relation between Radius and Texture')

    plt.show()

def getTrainingAndValidationSets(dataFrame):
    dataSize = dataFrame.shape[0]
    
    trainingIndexSet = np.random.choice(range(dataSize), size=int(0.8 * dataSize), replace=False)
    validationIndexSet = [i for i in range(dataSize) if i not in trainingIndexSet]

    trainingInputSet = [[dataFrame["radius"].iloc[index], dataFrame["texture"].iloc[index]] for index in trainingIndexSet]
    trainingOutputSet = [dataFrame["diagnosis"].iloc[index] for index in trainingIndexSet]

    validationInputSet = [[dataFrame["radius"].iloc[index], dataFrame["texture"].iloc[index]] for index in validationIndexSet]
    validationOutputSet = [dataFrame["diagnosis"].iloc[index] for index in validationIndexSet]

    return trainingInputSet, trainingOutputSet, validationInputSet, validationOutputSet


def plotTrainingAndValidationSets(dataFrame):
    trainingInputSet, trainingOutputSet, validationInputSet, validationOutputSet = getTrainingAndValidationSets(dataFrame)
    ind_malign_test = [i for i in range(0, len(trainingInputSet)) if trainingOutputSet[i] == 'M'] 
    ind_benign_test = [i for i in range(0, len(trainingInputSet)) if trainingOutputSet[i] == 'B']

    ind_malign_antrenament = [i for i in range(0, len(validationInputSet)) if validationOutputSet[i] == 'M'] 
    ind_benign_antrenament = [i for i in range(0, len(validationInputSet)) if validationOutputSet[i] == 'B'] 

    radius_malign_test = [trainingInputSet[i][0] for i in ind_malign_test]
    radius_benign_test = [trainingInputSet[i][0] for i in ind_benign_test]

    radius_malign_antrenament = [validationInputSet[i][0] for i in ind_malign_antrenament]
    radius_benign_antrenament = [validationInputSet[i][0] for i in ind_benign_antrenament]

    texture_malign_test = [trainingInputSet[i][1] for i in ind_malign_test]
    texture_benign_test = [trainingInputSet[i][1] for i in ind_benign_test]

    texture_malign_antrenament = [validationInputSet[i][1] for i in ind_malign_antrenament]
    texture_benign_antrenament = [validationInputSet[i][1] for i in ind_benign_antrenament]

    fig, axes = plt.subplots()
    axes.plot(radius_malign_test, texture_malign_test, 'ro')
    axes.plot(radius_benign_test, texture_benign_test, 'go')
    axes.plot(radius_malign_antrenament, texture_malign_antrenament, 'r^')
    axes.plot(radius_benign_antrenament, texture_benign_antrenament, 'g^')
    
    axes.set_xlabel('Radius')
    axes.set_ylabel('Texture')
    axes.set_title('Relation between Radius and Texture')

    plt.show()
    

def getRegressorFromLibrary(dataFrame):
    trainingInputSet, trainingOutputSet, _, _ = getTrainingAndValidationSets(dataFrame)
    X = [el for el in trainingInputSet]
    regressor = linear_model.SGDClassifier();
    regressor.fit(X, trainingOutputSet)
    return regressor

def getErrors(dataFrame):
    _,_, validationInputSet, validationOutputSet = getTrainingAndValidationSets(dataFrame)
    regressor = getRegressorFromLibrary(dataFrame)
    computedValidationOutputs = regressor.predict(validationInputSet)

    TN, FP, FN, TP = clasificationPerformance(validationOutputSet, computedValidationOutputs)
    accuracy = getAccuracy(TN, FP, FN, TP)
    precision = getPrecision(FP, TP)
    recall = getRecall(TP, FN)
    return accuracy, precision, recall


In [19]:
class MyRegressor:
    def __init__(self) -> None:
        self.coeficienti = []
    
    def computeValue(self, line):
        number = 0
        for i in range(0, len(line)):
            number += line[i] * self.coeficienti[i]
        return expit(number)
    
    def train(self, input, output, learning_rate=0.01, epochs=1000):
        newInput = [[1] + line for line in input]
        
        for feature in range(0, len(newInput[-1])):
            self.coeficienti.append(0.0)

        for epoch in range(0, epochs):
            error = [0 for i in range(0, len(newInput[-1]))]
            for i in range(0, len(newInput)):
                computedValue = self.computeValue(newInput[i])
                if output[i] == "M":
                    err = 1 - computedValue
                else:
                    err = 0 - computedValue

                for j in range(0, len(newInput[-1])):
                    error[j] += err * newInput[i][j]
            
            for coef_index in range(0,len(self.coeficienti)):
                self.coeficienti[coef_index] = self.coeficienti[coef_index] - learning_rate * error[coef_index]/len(newInput)         

    def predict1(self, input):
        output = []
        for line in input:
            value = self.computeValue([1] + line)
            label = None
            if value > 0.5:
                label = "M"
            else:
                label = "B"                
            output.append(label)
        return output
    
    def predict2(self, input):
        output = []
        for line in input:
            value = self.computeValue([1] + line)
            label = None
            if value > 0.8:
                label = "M"
            else:
                label = "B"                
            output.append(label)
        return output
    

def getMyRegressor(dataFrame):
    trainingInputSet, trainingOutputSet, _, _ = getTrainingAndValidationSets(dataFrame)
    X = [el for el in trainingInputSet]
    regressor = MyRegressor()
    regressor.train(X, trainingOutputSet)
    return regressor

def getErrors2(dataFrame):
    _,_, validationInputSet, validationOutputSet = getTrainingAndValidationSets(dataFrame)
    regressor = getMyRegressor(dataFrame)
    computedValidationOutputs = regressor.predict1(validationInputSet)

    TN, FP, FN, TP = clasificationPerformance(validationOutputSet, computedValidationOutputs)
    accuracy = getAccuracy(TN, FP, FN, TP)
    precision = getPrecision(FP, TP)
    recall = getRecall(TP, FN)
    return accuracy, precision, recall

def getErrors22(dataFrame):
    _,_, validationInputSet, validationOutputSet = getTrainingAndValidationSets(dataFrame)
    regressor = getMyRegressor(dataFrame)
    computedValidationOutputs = regressor.predict2(validationInputSet)

    TN, FP, FN, TP = clasificationPerformance(validationOutputSet, computedValidationOutputs)
    accuracy = getAccuracy(TN, FP, FN, TP)
    precision = getPrecision(FP, TP)
    recall = getRecall(TP, FN)
    return accuracy, precision, recall

def predictCancerUsingMyRegressor1(dataFrame, input):
    regressor = getMyRegressor(dataFrame)
    output = regressor.predict1(input)
    return output

def predictCancerUsingMyRegressor2(dataFrame, input):
    regressor = getMyRegressor(dataFrame)
    output = regressor.predict2(input)
    return output

def predictCancerUsingLibraryRegressor(dataFrame, input):
    regressor = getRegressorFromLibrary(dataFrame)
    output = regressor.predict(input)
    return output


In [20]:
dataFrame = readData()
input = [[18, 10]]
myOutput = predictCancerUsingMyRegressor1(dataFrame,input)
myOutput2 = predictCancerUsingMyRegressor2(dataFrame, input)
libraryOutput = predictCancerUsingLibraryRegressor(dataFrame, input)
print("Threshold 0.5: ", myOutput)
print("Threshold 0.8: ", myOutput2)

Threshold 0.5:  ['M']
Threshold 0.8:  ['M']


In [21]:
regressor2 = getMyRegressor(dataFrame)
w0, w1, w2 = regressor2.coeficienti[0], regressor2.coeficienti[1], regressor2.coeficienti[2]
print("LEARNED MODEL: f(x) = ", w0, " + ", w1, " * x1", " + ", w2, " * x2")

accuracy2, precision2, recall2 = getErrors2(dataFrame)
print("THRESHOLD 0.5")
print("Accuracy = ", accuracy2)
print("Precision = ", precision2)
print("Recall = ", recall2)

accuracy22, precision22, recall22 = getErrors22(dataFrame)
print("THRESHOLD 0.8")
print("Accuracy = ", accuracy22)
print("Precision = ", precision22)
print("Recall = ", recall22)

LEARNED MODEL: f(x) =  3.554730954094316  +  0.10944370390955513  * x1  +  0.12862091702695408  * x2
THRESHOLD 0.5
Accuracy =  0.4298245614035088
Precision =  1.0
Recall =  0.4298245614035088
THRESHOLD 0.8
Accuracy =  0.39473684210526316
Precision =  1.0
Recall =  0.39473684210526316


# Rezolvarea unei probleme de regresie/clasificare prin: folosirea validarii încrucișate (K-fold cross validation) - pb cu happiness dupa pib

In [22]:
import csv
import os
import matplotlib.pyplot as plt
import numpy as np 
from sklearn import linear_model
import pandas as pd 
from sklearn.metrics import mean_squared_error


#Ce îi poate face pe oameni fericiți? - dupa PIB
def readData(dataPath: str):
    df = pd.read_csv(dataPath, delimiter=',', header='infer')
    df = df.dropna()
    return df

#split data frame in k sets
def splitDataInKSets(dataFrame, k):
    size = dataFrame.shape[0]
    arr = np.array_split(range(size),k) 
    input = [[dataFrame["Economy..GDP.per.Capita."].iloc[i] for i in index] for index in arr]
    output = [[dataFrame["Happiness.Score"].iloc[i] for i in index] for index in arr]
    return input, output

def getErrors(computed_output, validation_output):
    computedError = mean_squared_error(validation_output, computed_output)
    return computedError

def trainRegressor(regressor, dataFrame, k):
    errors = []
    input, output = splitDataInKSets(dataFrame, k)
    for i in range(0, k):
        validationInputSet = input[i]
        validationOutputSet = output[i]
        trainingInputSet = []
        trainingOutputSet = []
        for j in range(0, k):
            if j != i:
                trainingInputSet += input[j]
                trainingOutputSet += output[j]
        regressor.partial_fit([[trainingInputSet[ind]] for ind in range(0, len(trainingInputSet))], trainingOutputSet)
        computed_output = regressor.predict([[validationInputSet[ind]] for ind in range(0, len(validationInputSet))])
        errors.append(getErrors(computed_output, validationOutputSet))
    return errors

In [23]:
dataFrame = readData("2017.csv")
regressor = linear_model.SGDRegressor()
errors = trainRegressor(regressor, dataFrame, 6)
print(errors)
overallError = sum(errors) / len(errors)
print("Overall error = ", overallError)

[10.240350012252453, 2.29935450966073, 0.4673785604497108, 0.5256477901443344, 0.5536900002234595, 0.399878092935935]
Overall error =  2.4143831609444377


# Investigarea diferitelor funcții de loss - pt pb 1 - PIB

- **Squared Error** 

- **Huber** 
   
- **Epsilon Insensitive** 
   

In [24]:
from sklearn import metrics

def getTrainingAndValidationSets(dfWorldHappiness):
    dataSize = dfWorldHappiness.shape[0]
    
    trainingIndexSet = np.random.choice(range(dataSize), size=int(0.8 * dataSize), replace=False)
    validationIndexSet = [i for i in range(dataSize) if i not in trainingIndexSet]

    trainingInputSet = [dfWorldHappiness["Economy..GDP.per.Capita."].iloc[index] for index in trainingIndexSet]
    trainingOutputSet = [dfWorldHappiness["Happiness.Score"].iloc[index] for index in trainingIndexSet]

    validationInputSet = [dfWorldHappiness["Economy..GDP.per.Capita."].iloc[index] for index in validationIndexSet]
    validationOutputSet = [dfWorldHappiness["Happiness.Score"].iloc[index] for index in validationIndexSet]

    return trainingInputSet, trainingOutputSet, validationInputSet, validationOutputSet

def getRegressor(dataFrame, loss_type):
    trainingInputSet, trainingOutputSet, _, _ = getTrainingAndValidationSets(dataFrame)
    X = [[el] for el in trainingInputSet]
    regressor = linear_model.SGDRegressor(loss=loss_type)
    regressor.fit(X, trainingOutputSet)
    return regressor

def main():
    dataFrame = readData("2017.csv")
    _,_,validationInput,validationOutput = getTrainingAndValidationSets(dataFrame)
    for loss_type in linear_model.SGDRegressor().loss_functions:
        regressor = getRegressor(dataFrame, loss_type)
        computedOutput = regressor.predict([[validationInput[i]] for i in range(0, len(validationInput))]) 
        err = metrics.r2_score(validationOutput, computedOutput)
        print("Loss type: ", loss_type, " Error = ", err)

main()    

Loss type:  squared_error  Error =  0.2979756476653449
Loss type:  huber  Error =  -0.04272638685058938
Loss type:  epsilon_insensitive  Error =  0.3118348914352309
Loss type:  squared_epsilon_insensitive  Error =  0.3423477410307654
