# 1. Load cleaned dataset

In [None]:
#downloaded the dataset from the UCI repository archive 
#(https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease)

#decompressed the RAR file 
#turned the arff file into csv (using Python converter)

# now loading full csv into the notebook

import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned CSV (header on the first row, malformed lines skipped) and
# discard its first column.
# NOTE(review): the first column is presumably a saved row index - confirm.
data = (
    pd.read_csv('data/cleaned.csv', header=0, on_bad_lines='skip')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

In [None]:
# Display the loaded frame to eyeball its contents.
data

# 2. Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target variable: the 'class' column.
y = data['class']

# Predictors: every remaining column, taken as a new frame so `data` itself
# is left untouched.
x = data.drop('class', axis=1)

In [None]:
# Hold out 33% of the rows for testing; the share is kept modest because the
# dataset has relatively few records. The fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# 3. Fitting classification models and first performance assessment 

In [None]:
import timeit
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score

## 3.1 Linear methods

### 3.1.1  Logistic regression

In [None]:
#learns the probability of a sample belonging to a certain class 

# discriminative model 
#(=directly models the posterior probability P(y|x) by learning the input-to-output mapping, minimising the error)
#                      #posterior=update of prob of event A happening given new info as event B happening

In [None]:
# Time the whole fit + predict cycle (the same is done for every model).
start = timeit.default_timer()

# Default logistic regression; fit() returns the estimator itself, so the
# instance can be created and trained in one expression.
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out test split.
predictionsLR = logreg.predict(X_test)

stop = timeit.default_timer()
print('Time: ', stop - start)

predictionsLR

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split
# (plotted below with actual labels on the rows and predicted labels on the columns).
cmLR = confusion_matrix(y_test,predictionsLR)
cmLR

In [None]:
#LOW PRIORITY
#TODO: Definire un'unica funzione per tutte le metriche invece che definirle una per una ogni volta 

"""
def matrix_metrix(real_values,pred_values):
    CM = confusion_matrix(real_values,pred_values) #get confusion matrix
    
    TN = CM[0][0]     #confusion matrix entries and n° of samples
    FN = CM[1][0] 
    TP = CM[1][1]
    FP = CM[0][1]
    tot = TN+FN+TP+FP
    
    #performance metrics with 2 matrix entries
    Prevalence = round( (TP+FP) /tot,2)
    Accuracy   = round( (TP+TN) / tot,4)
    Precision  = round( TP / (TP+FP),4 )
    NPV        = round( TN / (TN+FN),4 ) 
    FDR        = round( FP / (TP+FP),4 )
    FOR        = round( FN / (TN+FN),4 ) 
    check_Pos  = Precision + FDR
    check_Neg  = NPV + FOR
    
    #performance metrics with more than 2 entries -> more comprehensive metrics
    Recall     = round( TP / (TP+FN),4 )
    FPR        = round( FP / (TN+FP),4 ) #false positive rate
    FNR        = round( FN / (TP+FN),4 ) #false negative rate
    TNR        = round( TN / (TN+FP),4 ) #true negative rate 
    check_Pos2 = Recall + FNR
    check_Neg2 = FPR + TNR
    
    LRPos      = round( Recall/FPR, 4 )   #positive likelihood
    LRNeg      = round( FNR / TNR ,4 )   #negative likelihood 
    
    DOR        = round( LRPos/LRNeg)
    F1         = round ( 2 * ((Precision*Recall)/(Precision+Recall)),4)
    #FBeta      = round ( (1+beta**2)*((Precision*Recall)/((beta**2 * Precision)+ Recall)) ,4)
    MCC        = round ( ((TP*TN)-(FP*FN))/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))  ,4)
    BM         = Recall+TNR-1
    MK         = Precision+NPV-1
    mat_met = pd.DataFrame({
        'Metric':['TP','TN','FP','FN','Prevalence','Accuracy','Precision','NPV','FDR','FOR','check_Pos','check_Neg','Recall','FPR','FNR','TNR','check_Pos2','check_Neg2','LR+','LR-','DOR','F1','MCC','BM','MK'], #,'FBeta'    
        'Value':[TP,TN,FP,FN,Prevalence,Accuracy,Precision,NPV,FDR,FOR,check_Pos,check_Neg,Recall,FPR,FNR,TNR,check_Pos2,check_Neg2,LRPos,LRNeg,DOR,F1,MCC,BM,MK]}) #FBeta
    return (mat_met)
    
"""

In [None]:
#DOR = (TP / (TP+FN)/FP / (TN+FP))/(FN / (TP+FN)/TN / (TN+FP))

In [None]:
#Accuracy
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
# Mean accuracy of the fitted model on the held-out test split.
accuracyLR = round(logreg.score(X_test, y_test),4)
print(accuracyLR)


#COMPREHENSIVE METRICS

#MCC/phi coefficient
#essentially a correlation coefficient between -1 (inverse prediction) and 1 (perfect prediction),
#with 0 corresponding to an average random prediction
#takes into account true and false positives and negatives
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html
MCCLR = round(matthews_corrcoef(y_test, predictionsLR),4)
print(MCCLR)


#DOR (diagnostic odds ratio)
#DOR = (TP/FN) / (FP/TN) = (TP*TN) / (FP*FN)
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = actual class, columns = predicted class), which is also the
# convention compute_metrics relies on.
TNLR = cmLR[0][0]
FPLR = cmLR[0][1]
FNLR = cmLR[1][0]
TPLR = cmLR[1][1]

#DOR = (TPLR*TNLR)/(FPLR*FNLR)   # undefined when FP or FN is 0
#print(DOR)


#F1_score = harmonic mean of precision and recall (0=worst, 1=best)
#F1=2*(precision*recall)/(precision+recall)
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
# average=None returns one F1 value per class instead of a single aggregate.
F1LR = f1_score(y_test, predictionsLR,
                average=None)
print(F1LR)

In [None]:
def compute_metrics(cm):
    """Summarise a binary confusion matrix as a small metrics table.

    Parameters
    ----------
    cm : 2x2 array-like
        Confusion matrix in scikit-learn layout ``[[TN, FP], [FN, TP]]``
        (rows = actual class, columns = predicted class).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Metric`` and ``Value``, with Accuracy, Precision,
        Recall, F1 and MCC, each rounded to 4 decimals.

    Notes
    -----
    Any ratio whose denominator is zero (e.g. precision when nothing was
    predicted positive) is reported as 0.0 instead of raising or yielding
    NaN; this mirrors scikit-learn's default behaviour for these metrics.
    F1 is computed from the raw counts, not from the already-rounded
    precision and recall, so no rounding error is compounded.
    """
    import math

    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]
    tot = TN + FN + TP + FP

    def _ratio(num, den):
        # Safe division: an undefined ratio is reported as 0.0.
        return float(num) / float(den) if den else 0.0

    accuracy = _ratio(TP + TN, tot)
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    # F1 = 2TP / (2TP + FP + FN), the count form of the harmonic mean.
    F1 = _ratio(2 * TP, 2 * TP + FP + FN)
    # Factors are cast to float before multiplying so large counts cannot
    # overflow a fixed-width integer type.
    mcc_den = math.sqrt(float(TP + FP) * float(TP + FN) * float(TN + FP) * float(TN + FN))
    MCC = _ratio(TP * TN - FP * FN, mcc_den)

    mat_met = pd.DataFrame({
        'Metric':['Accuracy','Precision','Recall','F1','MCC'],
        'Value':[round(accuracy,4), round(precision,4), round(recall,4), round(F1,4), round(MCC,4)]})
    return (mat_met)

In [None]:
# Metrics table for the logistic-regression confusion matrix.
compute_metrics(cmLR)

In [None]:
# Heatmap of the logistic-regression confusion matrix, titled with the
# accuracy and MCC computed in the metrics cell.
plt.figure(figsize=(9,9))
sns.heatmap(cmLR, annot=True, linewidths=.5, square=True, cmap='Blues_r');
plt.xlabel('Predicted label');
plt.ylabel('Actual label');
all_sample_title = f'Logistic Regression \n Accuracy Score: {round(accuracyLR,4)} \n MCC score: {round(MCCLR,4)}'
plt.title(all_sample_title, size=15);

In [None]:
# Larger-font version of the logistic-regression heatmap.
sns.set(font_scale=1.4)
plt.figure(figsize=(9,9))
# Annotation size shrinks with the number of classes so the counts fit the cells.
sns.heatmap(cmLR, annot=True,annot_kws={"size": 35 / np.sqrt(len(cmLR))}, linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# The MCC shown in the title is the computed MCCLR rather than a typed-in
# constant, so it cannot drift from the data.
all_sample_title = 'Logistic Regression \n Accuracy Score: {0}\n MCC Score: {1}\n'.format(round(accuracyLR,4),round(MCCLR,4))
plt.title(all_sample_title, size = 17,fontweight="bold");

In [None]:
#molto alto 

#errore è effettivamente sul test quindi sta performando molto bene 
#che è un po' sospetto ma vedo prima tutti gli altri

#assumptions della logreg: 
#- independence of errors, 
#- linearity in the logit for continuous variables, 
#- absence of multicollinearity
#- lack of strongly influential outliers

### 3.1.2  Naive Bayes 

In [None]:
#Naive Bayes classifier assumes that the effect of a particular feature in a class is independent of other features

#1. calculates prior probability for a given class label 
#2. calculate conditional probability with each attribute for each class
#3. multiply same class conditional probability
#4. multiply prior probability with step 3 probability
#5. sees which class has higher probability, higher probability class belongs to given input set step

# generative model
# (= models the joint distribution of the feature X and the targetY, 
#    and then predicts the posterior probability given as P(y|x))

In [None]:
#import the model
from sklearn.naive_bayes import GaussianNB

start = timeit.default_timer()

# Gaussian Naive Bayes with default settings; fit() returns the estimator,
# so creation and training are chained.
NB = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test split.
predictionsNB = NB.predict(X_test)

stop = timeit.default_timer()
print('Time: ', stop - start)

predictionsNB

In [None]:
# Confusion matrix of the Naive Bayes predictions on the test split.
cmNB = confusion_matrix(y_test,predictionsNB)
cmNB

In [None]:
# Metrics table for the Naive Bayes confusion matrix.
compute_metrics(cmNB)

In [None]:
# Test-set accuracy and MCC of the Naive Bayes model. The MCC is computed
# from the predictions instead of being typed in by hand.
accuracyNB = NB.score(X_test, y_test)
MCCNB = matthews_corrcoef(y_test, predictionsNB)

plt.figure(figsize=(9,9))
sns.heatmap(cmNB, annot=True, linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Naive Bayes \nAccuracy Score: {0} \n MCC score: {1}'.format(round(accuracyNB,4), round(MCCNB,4))
plt.title(all_sample_title, size = 15)
plt.show()

In [None]:
# Larger-font version of the Naive Bayes heatmap.
sns.set(font_scale=1.4)
plt.figure(figsize=(9,9))
sns.heatmap(cmNB, annot=True,annot_kws={"size": 35 / np.sqrt(len(cmNB))}, linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# MCC is computed from the predictions rather than hard-coded in the title.
all_sample_title = 'Naive Bayes \n Accuracy Score: {0}\n MCC Score: {1}\n'.format(round(accuracyNB,4),round(matthews_corrcoef(y_test, predictionsNB),4))
plt.title(all_sample_title, size = 17,fontweight="bold");

In [None]:
#anche questo molto alto 
#ma probabilmente è solo che i modelli lineari si prestano particolarmente a questo dataset 
#che è piccolo, non high dimensional e abbastanza omogeneo (anche per come ho gestito i missing values)

## 3.2 Non linear: KNN

In [None]:
#does not make any assumption on the data distribution (non parametric)

#KNN can be summarized as below:
#1.Computes the distance between the new data point with every training example.
#2.For computing the distance measures such as Euclidean distance, Hamming distance or Manhattan distance will be used.
#3.Model picks K entries in the database which are closest to the new data point.
#4.Then it does the majority vote i.e the most common class/label among those K entries will be the class of the new data point.

In [None]:
#import the model
from sklearn.neighbors import KNeighborsClassifier

# Try k = 1..25 and record the test accuracy for each value.
# NOTE(review): k is being selected on the test split, so the accuracy later
# reported for the chosen k is optimistic; consider cross-validating on the
# training split instead.
k_range = range(1,26)
scores = {}        # k -> accuracy
scores_list = []   # accuracies in k order, used for plotting
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,y_train)
    predKNN = knn.predict(X_test)
    # Score once and reuse the value for both containers.
    acc_k = metrics.accuracy_score(y_test,predKNN)
    scores[k] = acc_k
    scores_list.append(acc_k)

In [None]:
%matplotlib inline

plt.plot(k_range, scores_list)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')

#best appears to be at 3

In [None]:

start = timeit.default_timer()

# KNN with the k chosen from the sweep above; fit() returns the estimator.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train,y_train)

# Predicted labels for the held-out test split.
predKNN_3k = knn.predict(X_test)

stop = timeit.default_timer()
print('Time: ', stop - start)

predKNN_3k

In [None]:
# Confusion matrix of the k=3 KNN predictions on the test split.
cmKNN = metrics.confusion_matrix(y_test,predKNN_3k)
cmKNN

In [None]:
# Metrics table for the KNN confusion matrix.
compute_metrics(cmKNN)

In [None]:
# Test-set accuracy and MCC of the k=3 KNN model. The MCC is computed from
# the predictions instead of being typed in by hand.
accuracyKNN = knn.score(X_test, y_test)
MCCKNN = matthews_corrcoef(y_test, predKNN_3k)

plt.figure(figsize=(9,9))
sns.heatmap(cmKNN, annot=True, linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'KNN \n Accuracy Score: {0} \n MCC Score: {1}'.format(round(accuracyKNN,4), round(MCCKNN,4))
plt.title(all_sample_title, size = 15);

In [None]:
# Larger-font version of the KNN heatmap.
sns.set(font_scale=1.4)
plt.figure(figsize=(9,9))
sns.heatmap(cmKNN, annot=True,annot_kws={"size": 35 / np.sqrt(len(cmKNN))}, linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# MCC is computed from the predictions rather than hard-coded in the title.
all_sample_title = 'K-Nearest Neighbors \n Accuracy Score: {0}\n MCC Score: {1}\n'.format(round(accuracyKNN,4),round(matthews_corrcoef(y_test, predKNN_3k),4))
plt.title(all_sample_title, size = 17,fontweight="bold");

In [None]:
#particolarmente basso rispetto agli altri 

#probabilmente appunto è la questione della linearità perchè pure i tree performano peggio 

## 3.3 Tree based methods

Tend to overfit

### 3.3.1 Decision Tree

In [None]:
#DECISION TREE

#import model
from sklearn.tree import DecisionTreeClassifier

start = timeit.default_timer()

# Single decision tree with a fixed seed; fit() returns the estimator, so
# creation and training are chained.
dt = DecisionTreeClassifier(random_state=33).fit(X_train, y_train)

# Predicted labels for the held-out test split.
predDT = dt.predict(X_test)

stop = timeit.default_timer()
print('Time: ', stop - start)

predDT

In [None]:
# Confusion matrix of the decision-tree predictions on the test split.
cmDT = metrics.confusion_matrix(y_test,predDT)
cmDT

In [None]:
# Metrics table for the decision-tree confusion matrix.
compute_metrics(cmDT)

In [None]:
# Test-set accuracy and MCC of the decision tree. The MCC is computed from
# the predictions instead of being typed in by hand.
accuracyDT = dt.score(X_test, y_test)
MCCDT = matthews_corrcoef(y_test, predDT)

plt.figure(figsize=(9,9))
sns.heatmap(cmDT, annot=True, linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Decision Tree \n Accuracy Score: {0} \n MCC score: {1}'.format(round(accuracyDT,4),round(MCCDT,4))
plt.title(all_sample_title, size = 15);

In [None]:
# Larger-font version of the decision-tree heatmap.
sns.set(font_scale=1.4)
plt.figure(figsize=(9,9))
sns.heatmap(cmDT, annot=True,annot_kws={"size": 35 / np.sqrt(len(cmDT))}, linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# MCC is computed from the predictions rather than hard-coded in the title.
all_sample_title = 'Decision Tree \n Accuracy Score: {0}\n MCC Score: {1}\n'.format(round(accuracyDT,4),round(matthews_corrcoef(y_test, predDT),4))
plt.title(all_sample_title, size = 17,fontweight="bold");

In [None]:
# Metrics table for the decision-tree confusion matrix (shown again next to the plots).
compute_metrics(cmDT)

In [None]:
#con i tree va peggio - coerente con il peggioramento del KNN

In [None]:
#!Pip install graphviz
#!pip install pydotplus

In [None]:
from six import StringIO

In [None]:
from sklearn.tree import export_graphviz
#from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus

# Write the fitted tree as DOT text into an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    dt,
    out_file=dot_data,
    feature_names=list(X_train.columns),
    class_names=['0','1'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Parse the DOT text, save the drawing to disk and show it inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('DecisionTree.png')
Image(graph.create_png())

### 3.3.2 Bagging Decision Tree (Ensemble learning I)

In [None]:
# taking bootstraps from the training data (=bagging)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
start = timeit.default_timer()

# Bagged decision trees. random_state is fixed so the bootstrap samples, and
# therefore the reported results, are reproducible across runs (like the
# seeded split and the single decision tree above).
bg = BaggingClassifier(DecisionTreeClassifier(),
                      max_samples = 0.5,  # each estimator is trained on 50% of the training rows
                      max_features = 1.0, # 1.0 = every feature is available to each estimator
                      n_estimators = 10,  # number of decision trees in the ensemble
                      random_state = 33)

bg.fit(X_train, y_train)

# Predicted labels for the held-out test split.
predBG = bg.predict(X_test)

stop = timeit.default_timer()
print('Time: ', stop - start)

predBG

In [None]:
# Confusion matrix of the bagging-ensemble predictions on the test split.
cmBG = metrics.confusion_matrix(y_test,predBG)
cmBG

In [None]:
# Accuracy of the bagging ensemble itself. It was previously read from `dt`,
# which reported the single decision tree's accuracy under the bagging title.
accuracyBG = bg.score(X_test, y_test)

plt.figure(figsize=(9,9))
sns.heatmap(cmBG, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score for Tree with Bagging: {0}'.format(accuracyBG)
plt.title(all_sample_title, size = 15);

In [None]:
#potrei anche risparmiare questi altri modelli di tree dato che già capisco che tree is not the way to go
#ma voglio vedere se è così peggiorata perchè è solo un tree o con gli ensemble migliora

#perchè i tree hanno la tendenza a overfittare in generale

### 3.3.3 Boosted Decision Tree (Ensemble learning II)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
start = timeit.default_timer()

# AdaBoost over shallow decision trees. random_state is fixed so the run is
# reproducible (like the seeded split and the single decision tree above).
adb = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10,
                                                max_depth=4),
                                               n_estimators=10,
                                               learning_rate=0.6,
                                               random_state=33)

adb.fit(X_train, y_train)

# Predicted labels for the held-out test split.
predBS = adb.predict(X_test)

stop = timeit.default_timer()
print('Time: ', stop - start)
predBS

In [None]:
# Confusion matrix of the AdaBoost predictions on the test split.
cmBS = metrics.confusion_matrix(y_test,predBS)
cmBS # author's note: everything classified correctly

In [None]:
# Accuracy of the AdaBoost ensemble itself. It was previously read from `dt`,
# which reported the single decision tree's accuracy under the boosting title.
accuracyBS = adb.score(X_test, y_test)

plt.figure(figsize=(9,9))
sns.heatmap(cmBS, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score for Tree with Boosting (AdaBoost): {0}'.format(accuracyBS)
plt.title(all_sample_title, size = 15);

In [None]:
#ormai è accanimento terapeutico ma a questo punto vedo anche con random forest

#se facessi più valutazioni sugli iperparametri forse migliorerebbe 

### 3.3.4 Random Forest (Ensemble learning III)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
start = timeit.default_timer()

# Random forest of 30 trees limited to depth 9. random_state is fixed so the
# run is reproducible (like the seeded split and the single decision tree above).
rf = RandomForestClassifier(n_estimators=30, max_depth=9, random_state=33)

rf.fit(X_train, y_train)

# Predicted labels for the held-out test split.
predRF = rf.predict(X_test)

stop = timeit.default_timer()
print('Time: ', stop - start)

predRF

In [None]:
# Confusion matrix and accuracy of the random forest on the test split.
cmRF = metrics.confusion_matrix(y_test,predRF)
# Accuracy was previously read from `dt`, which reported the single decision
# tree's accuracy under the random-forest title.
accuracyRF = rf.score(X_test, y_test)

plt.figure(figsize=(9,9))
sns.heatmap(cmRF, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score for Tree with Random Forests: {0}'.format(accuracyRF)
plt.title(all_sample_title, size = 15);

In [None]:
#va beh sembra che con quelli lineari vada meglio

## 3.4 Support vector machines

In [None]:
#kernel (=transforms an input data space into the required form) trick to handle nonlinear input spaces 
#(to transform the input space to a higher dimensional space so that 
#then one can easily separate the classes using linear separation)

#The classifier separates data points using a hyperplane with the largest amount of margin. 
#That's why an SVM classifier is also known as a discriminative classifier. 


In [None]:
from sklearn import svm

start = timeit.default_timer()

# Linear-kernel support vector classifier; fit() returns the estimator, so
# creation and training are chained.
vect = svm.SVC(kernel='linear').fit(X_train, y_train)

# Predicted labels for the held-out test split.
predvect = vect.predict(X_test)

stop = timeit.default_timer()
print('Time: ', stop - start)

predvect

In [None]:
# Confusion matrix, accuracy and MCC of the linear SVM on the test split.
# The MCC is computed from the predictions instead of being typed in by hand.
cmSVM = metrics.confusion_matrix(y_test,predvect)
accuracySVM = vect.score(X_test, y_test)
MCCSVM = matthews_corrcoef(y_test, predvect)

sns.set(font_scale=1.4)
plt.figure(figsize=(9,9))
sns.heatmap(cmSVM, annot=True,annot_kws={"size": 35 / np.sqrt(len(cmSVM))}, linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Support Vector Machines \n Accuracy Score: {0}\n MCC Score: {1}\n'.format(round(accuracySVM,4),round(MCCSVM,4))
plt.title(all_sample_title, size = 17,fontweight="bold");

In [None]:
# Display the SVM confusion matrix.
cmSVM

In [None]:
# Metrics table for the SVM confusion matrix.
compute_metrics(cmSVM)

In [None]:
#questo performa bene ma più tempo

In [None]:
#hyperparameters:
#-kernel
#-regularization
#–gamma

In [None]:
#advantages:
#–good accuracy and faster predictions wrt NB
#-less memory usage because it uses a subset of training points in the decision phase
#-works well with high dimensional space (here the features are not too many so the advantage is not extremely 
# harnessed but still good performance)

#disadvantages:
#-not suitable for large datasets because of high training time (non è questo il caso)
#–sensitive to the type of kernel used 

# 4. Visualization of the tetrahedron

# Plotly

Built on top of the Plotly Javascript library (plotly.js), Plotly is an open-source plotting library that enables the creation of interactive web-based visualizations. I use Plotly for three main reasons:  
- extreme customization
- possibility to display visualizations within Jupyter Notebooks, to save them to standalone html files but also to serve them as part of analytical web-applications using Dash (https://dash.plotly.com/installation) 
<br>

An alternative could have been ipyvolume, which is also open source.

#Python's visualisation landscape
#using markdown ![viz](viz_landscape.jpeg)



In [None]:
import numpy as np
import plotly.graph_objects as go
import plotly.express as px 


In [None]:
# Recall the confusion matrices computed for the models above.
cmLR

In [None]:
# Naive Bayes confusion matrix.
cmNB

In [None]:
# SVM confusion matrix.
cmSVM

In [None]:
# KNN confusion matrix.
cmKNN

In [None]:
# Decision-tree confusion matrix.
cmDT 

In [None]:
# Hand-made confusion matrix standing in for an undesirable classification,
# used to show where a poorly performing model lands inside the tetrahedron.
cm_badc=np.array([[10,30],
         [55,5]])           # fictitious confusion matrix
cm_badc

In [None]:
#PARTIAL RESULT 

#TETRAEDRO MESH CON PUNTI DEI MODELLI



# Semi-transparent tetrahedron; the vertices are listed in the order O, C, B, A.
fig = go.Figure()
fig.add_trace(
    go.Mesh3d(
        x=[0, 0, 0, 1],
        y=[0, 0, 1, 0],
        z=[0, 1, 0, 0],
        # i, j, k are vertex indices: each (i, j, k) triple is one of the
        # four triangular faces of the tetrahedron.
        i=[0, 0, 0, 1],
        j=[1, 2, 3, 2],
        k=[2, 3, 1, 3],
        # Per-vertex intensity, interpolated over the faces and mapped
        # through the colour scale.
        intensity=[0, 0.33, 0.66, 1],
        colorscale=[[0, 'gold'],
                    [0.5, 'mediumturquoise'],
                    [1, 'magenta']],
        colorbar_title='z',
        opacity=0.5,  # for transparency
        name='y',
        showscale=True,
    )
)


#PRIMA QUESTIONE 
#aggiungere il punto per la confusion matrix dello use case specifico 
#-> risolto, non metto tutti i modelli ma tipo i due migliori e uno meno performante (da confusion matrix esempio)

#Logistic-regression model
# Coordinates are confusion-matrix entries divided by the number of test
# samples. scikit-learn lays the binary matrix out as [[TN, FP], [FN, TP]]
# (the convention compute_metrics uses), so TP is cm[1][1] and TN is cm[0][0].
N = sum(sum(cmLR))   # total number of samples in the matrix
xLR = [cmLR[1][1]/N] #TP/N
yLR = [cmLR[0][0]/N] #TN/N
zLR = [cmLR[0][1]/N] #FP/N


fig.add_trace(go.Scatter3d(mode='markers',
                           x= xLR,
                           y= yLR,
                           z= zLR,
                           marker = dict(color='green', size=5), showlegend=False))

""" intanto commentato per mantenere viz più veloce mentre faccio tutto il resto
#Modello DT
xDT = [cmDT[0][0]/N] #TP/N
yDT = [cmDT[1][1]/N] #TN/N
zDT = [cmDT[0][1]/N] #FP/N


fig.add_trace(go.Scatter3d(mode='markers', 
                           x= xDT, 
                           y= yDT, 
                           z= zDT, 
                           marker = dict(color='green', size=5), showlegend=False))

#se poi voglio mettere tutti gli altri modelli li aggiungo qui 
#Modello NB

#Modello KNN

#Modello SVM 
"""

# All fitted models give fairly good confusion matrices, so a fictitious
# matrix from an inaccurate classification is plotted for contrast.
# It is normalised by its OWN total (not by the test-set size N of the fitted
# models), so the point stays inside the tetrahedron whatever its size.
# Layout follows scikit-learn: [[TN, FP], [FN, TP]].
N_badc = cm_badc.sum()
xB = [cm_badc[1][1]/N_badc] #TP/N
yB = [cm_badc[0][0]/N_badc] #TN/N
zB = [cm_badc[0][1]/N_badc] #FP/N
fig.add_trace(go.Scatter3d(mode='markers',
                           x= xB,
                           y= yB,
                           z= zB,
                           marker = dict(color='red', size=5), showlegend=False))






#SECONDA QUESTIONE
#devo capire come distinguere le due sfumature per avere ABC e AOB verdi/blu
#                                                        AOC e BOC arancione/gialli
#le sfumature tral'altro lungo OC e lungo AB 
#-> gradiente come funzione lineare dell'indicatore


#devo generare ogni possibile confusion matrix per ogni combinazione
#di TP,TN,FP (diviso n)
#calcolare mcc associato a ciascuna di questa matrice 
#rendere il colore 

#potrei dividere le due mesh (verde/blu e giallo/rosso)

#verosimilmente discretizzo (1M points diceva) e per ogni punto calcolo MCC
#http://al-roomi.org/3DPlot/index.html


#piano passante per 3 punti
#ax+by+cz+d=0
#x+y+z-1=0 equazione del piano nostro http://al-roomi.org/3DPlot/index.html

#-> la seconda questione poi la risolvo sotto (con meno punti di 1M perchè ci mette la vita altrimenti)

#TODO: unire punto di performance dei modelli a tetraedro con tutte le confusion matrices sotto


# Render the tetrahedron together with the points added above.
fig.show()


#THIS RESULT IS GOOD BUT I WANT TO COLOR THE TETRAHEDRON ACCORDING TO THE PERFORMANCE METRIC (MCC, DOR, etc)

In [None]:
import math

In [None]:
#HERE I GENERATE THE DATA FOR THE GRADIENT COLOR OF THE TETRAHEDRON  
#CIOè MI CREO IL DATAFRAME CON LE 3 COORDINATE E L'ASSOCIATO MCC


#devo capire come distinguere le due sfumature per avere ABC e AOB verdi/blu
#                                                        AOC e BOC arancione/gialli
#le sfumature tral'altro lungo OC e lungo AB 
#-> gradiente come funzione lineare dell'indicatore

#start = timeit.default_timer()
#devo generare ogni possibile confusion matrix per ogni combinazione
#di TP,TN,FP (diviso n)
#calcolare mcc associato a ciascuna di questa matrice 
#rendere il colore 

#potrei dividere le due mesh (verde/blu e giallo/rosso)

#verosimilmente discretizzo (1M points diceva) 
#e per ogni punto calcolo MCC


#piano passante per 3 punti
#ax+by+cz+d=0
#x+y+z-1=0 equazione del piano nostro http://al-roomi.org/3DPlot/index.html


# Discretise each coordinate of the unit cube into 100 evenly spaced values
# (deliberately few points: a finer grid makes the cell much slower).
xd = np.linspace(0,1, num=100)
yd = np.linspace(0,1, num=100)
zd = np.linspace(0,1, num=100)

# Every (x, y, z) combination of the grid, one point per row.
all_p_array = np.array(np.meshgrid(xd, yd, zd)).T.reshape(-1,3)

column_values = ['x','y','z']
all_p_df = pd.DataFrame(data=all_p_array,
                        columns=column_values)

# Keep only the points strictly below the plane x + y + z = 1, i.e. those
# with z < 1 - x - y. The explicit copy makes `filtered` an independent
# frame, so adding the MCC column below does not trigger pandas'
# SettingWithCopyWarning.
filtered = all_p_df.query('z<1-x-y').copy()

# MCC from confusion-matrix fractions (scikit-learn works on predictions, not
# on matrix entries, hence the explicit formula):
#   MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
# with x = TP, y = TN, z = FP and FN = 1 - FP - TP - TN. The metric is
# scale invariant, so working with fractions instead of counts is fine.
tp = filtered['x'].to_numpy()
tn = filtered['y'].to_numpy()
fp = filtered['z'].to_numpy()
fn = 1 - fp - tp - tn

mcc_num = tp*tn - fp*fn
# Clip guards against a tiny negative product caused by floating-point
# round-off, which would make the square root undefined.
mcc_den = np.sqrt(np.clip((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn), 0, None))
# When a whole row or column of the matrix is zero the MCC is undefined
# (0/0); 0 is used there, as the limit tends to 0.
mcc = np.divide(mcc_num, mcc_den, out=np.zeros_like(mcc_num), where=mcc_den > 0)
filtered['MCC'] = np.round(mcc, 3)

filtered


In [None]:
#UTILIZZANDO QUESTO DATAFRAME MI PLOTTO IL MODEL ON THE BASIS OF THE OBTAINED RESULTS (i.e. filtered dataframe)

#SO FAR I ONLY HAVE TETRAHEDRON WITH THE MODELS AND THE TETRAHEDRON WITH THE GRADIENT SEPARATELY
#I NEED TO COMBINE THE TWO VISUALIZATIONS IN ONE SINGLE VIEW


#----------- Plot the models 




# Very faint tetrahedron used as a backdrop for the model points; the
# vertices are listed in the order O, C, B, A.
fig = go.Figure()
fig.add_trace(
    go.Mesh3d(
        x=[0, 0, 0, 1],
        y=[0, 0, 1, 0],
        z=[0, 1, 0, 0],
        # i, j, k are vertex indices: each (i, j, k) triple is one of the
        # four triangular faces of the tetrahedron.
        i=[0, 0, 0, 1],
        j=[1, 2, 3, 2],
        k=[2, 3, 1, 3],
        # Per-vertex intensity, interpolated over the faces and colour-coded.
        intensity=[0, 0.33, 0.66, 1],
        opacity=0.1,  # for transparency
        name='y',
        showscale=True,
    )
)


#PRIMA QUESTIONE 
#aggiungere il punto per la confusion matrix dello use case specifico 
#-> risolto, non metto tutti i modelli ma tipo i due migliori e uno meno performante (da confusion matrix esempio)

#Logistic-regression model
# Coordinates are confusion-matrix entries divided by the number of test
# samples. scikit-learn lays the binary matrix out as [[TN, FP], [FN, TP]]
# (the convention compute_metrics uses), so TP is cm[1][1] and TN is cm[0][0].
N = sum(sum(cmLR))   # total number of samples in the matrix
xLR = [cmLR[1][1]/N] #TP/N
yLR = [cmLR[0][0]/N] #TN/N
zLR = [cmLR[0][1]/N] #FP/N


fig.add_trace(go.Scatter3d(mode='markers',
                           x= xLR,
                           y= yLR,
                           z= zLR,
                           marker = dict(color='green', size=5), showlegend=False))


# All fitted models give fairly good confusion matrices, so a fictitious
# matrix from an inaccurate classification is plotted for contrast.
# It is normalised by its OWN total (not by the test-set size N of the fitted
# models), so the point stays inside the tetrahedron whatever its size.
# Layout follows scikit-learn: [[TN, FP], [FN, TP]].
N_badc = cm_badc.sum()
xB = [cm_badc[1][1]/N_badc] #TP/N
yB = [cm_badc[0][0]/N_badc] #TN/N
zB = [cm_badc[0][1]/N_badc] #FP/N
fig.add_trace(go.Scatter3d(mode='markers',
                           x= xB,
                           y= yB,
                           z= zB,
                           marker = dict(color='red', size=5), showlegend=False))
fig.show()


#---------- COSTRUZIONE DEL TETRAEDRO CON LA SFUMATURA
#---------- Da qui sotto solo per il tetraedro con tutte le confusion matrix e indicatore 

#SECONDA QUESTIONE
#devo capire come distinguere le due sfumature per avere ABC e AOB verdi/blu
#                                                        AOC e BOC arancione/gialli
#le sfumature tral'altro lungo OC e lungo AB 
#-> gradiente come funzione lineare dell'indicatore


#devo generare ogni possibile confusion matrix per ogni combinazione
#di TP,TN,FP (diviso n)
#calcolare mcc associato a ciascuna di questa matrice 
#rendere il colore 

#potrei dividere le due mesh (verde/blu e giallo/rosso)

#verosimilmente discretizzo (1M points diceva) e per ogni punto calcolo MCC
#http://al-roomi.org/3DPlot/index.html


#piano passante per 3 punti
#ax+by+cz+d=0
#x+y+z-1=0 equazione del piano nostro http://al-roomi.org/3DPlot/index.html

#-> la seconda questione poi la risolvo sotto (con meno punti di 1M perchè ci mette la vita altrimenti)

#TODO: unire punto di performance dei modelli a tetraedro con tutte le confusion matrices sotto


# One marker per sampled confusion matrix in `filtered`, coloured by its MCC.
fig = px.scatter_3d(filtered, x='x', y='y', z='z', opacity=0.3, color='MCC')

# Enlarge and outline the markers of the traces matched by the selector.
# NOTE(review): the selector only matches traces whose mode is 'circle-open';
# the scatter traces built above presumably have mode 'markers', in which
# case this call changes nothing - confirm the intended selector.
fig.update_traces(
    marker={'size': 12, 'line': {'width': 2, 'color': 'DarkSlateGrey'}},
    selector={'mode': 'circle-open'},
)

fig.show()



#OK NOW THE THING IS THAT I WANT TO COMBINE THESE TWO VIZ 

In [None]:
# From here on, the heavy 3D visualisations are disabled by default so the
# notebook does not take too long to load when it is opened.
# Switch them on when needed via the flag in each cell (the longer-term idea
# is to move the "points on the tetrahedron" visualisation into its own .py).

# Set to True to render the MCC-coloured tetrahedron with the LogReg, DT and
# "bad classifier" confusion matrices overlaid as black points.
SHOW_MCC_WITH_LR_DT_BAD = False

if SHOW_MCC_WITH_LR_DT_BAD:
    # Goal: plot the tetrahedron coloured by the chosen metric (MCC here) and
    # add one point for each confusion matrix that is supplied.
    # Ideally this becomes an application where the user picks the metric,
    # enters a confusion matrix, and gets the resulting picture.
    start = timeit.default_timer()  # time this cell only, not since an earlier cell

    fig = px.scatter_3d(filtered,
                        x='x',
                        y='y',
                        z='z',
                        opacity=0.5,
                        color='MCC')

    # NOTE(review): 'circle-open' is a marker symbol, not a trace `mode`, so this
    # selector probably matches no trace and the call has no effect - confirm intent.
    fig.update_traces(marker=dict(size=12,
                                  line=dict(width=2,
                                            color='DarkSlateGrey')),
                      selector=dict(mode='circle-open'))

    # One black point per classifier. The models' matrices are all fairly good,
    # so cm_badc (an example of what an inaccurate classification would give)
    # is added to show the difference.
    # Each matrix is normalised by its own total instead of reusing LogReg's N.
    # NOTE(review): the [0][0]->TP, [1][1]->TN, [0][1]->FP mapping is taken from the
    # original comments; confirm it matches how the confusion matrices were built.
    for cm in (cmLR, cmDT, cm_badc):
        total = np.sum(cm)
        fig.add_trace(go.Scatter3d(mode='markers',
                                   x=[cm[0][0] / total],  # TP/N
                                   y=[cm[1][1] / total],  # TN/N
                                   z=[cm[0][1] / total],  # FP/N
                                   marker=dict(color='black', size=5),
                                   showlegend=False))

    fig.show()

    stop = timeit.default_timer()
    print('Time: ', stop - start)

    # TODO: this is the result I want, but the points need to be more visible.

In [None]:
# Set to True to render the MCC-coloured tetrahedron with only the LogReg
# confusion matrix overlaid as a black point. Disabled by default because the
# 3D figure is slow to load.
SHOW_MCC_WITH_LR_ONLY = False

if SHOW_MCC_WITH_LR_ONLY:
    fig = px.scatter_3d(filtered,
                        x='x',
                        y='y',
                        z='z',
                        opacity=0.2,
                        color='MCC')

    # NOTE(review): 'circle-open' is a marker symbol, not a trace `mode`, so this
    # selector probably matches no trace and the call has no effect - confirm intent.
    fig.update_traces(marker=dict(size=12,
                                  line=dict(width=2,
                                            color='DarkSlateGrey')),
                      selector=dict(mode='circle-open'))

    # LogReg point: confusion-matrix entries divided by the matrix total N.
    # NOTE(review): the [0][0]->TP, [1][1]->TN, [0][1]->FP mapping is taken from the
    # original comments; confirm it matches how cmLR was built.
    total = np.sum(cmLR)
    fig.add_trace(go.Scatter3d(mode='markers',
                               x=[cmLR[0][0] / total],  # TP/N
                               y=[cmLR[1][1] / total],  # TN/N
                               z=[cmLR[0][1] / total],  # FP/N
                               marker=dict(color='black', size=5),
                               showlegend=False))

    fig.show()

In [None]:
# Set to True to render the MCC-coloured tetrahedron with all five models
# overlaid, to judge their relative distance. Disabled by default because the
# 3D figure is slow to load (author's original note: 19 seconds).
SHOW_MCC_WITH_ALL_MODELS = False

if SHOW_MCC_WITH_ALL_MODELS:
    start = timeit.default_timer()  # time this cell only, not since an earlier cell

    fig = px.scatter_3d(filtered,
                        x='x',
                        y='y',
                        z='z',
                        opacity=0.2,
                        color='MCC')

    # NOTE(review): 'circle-open' is a marker symbol, not a trace `mode`, so this
    # selector probably matches no trace and the call has no effect - confirm intent.
    fig.update_traces(marker=dict(size=12,
                                  line=dict(width=2,
                                            color='DarkSlateGrey')),
                      selector=dict(mode='circle-open'))

    # One black point per model, in the order LogReg, NB, KNN, SVM, DT.
    # Each matrix is normalised by its own total instead of reusing LogReg's N.
    # NOTE(review): the [0][0]->TP, [1][1]->TN, [0][1]->FP mapping is taken from the
    # original comments; confirm it matches how the confusion matrices were built.
    for cm in (cmLR, cmNB, cmKNN, cmSVM, cmDT):
        total = np.sum(cm)
        fig.add_trace(go.Scatter3d(mode='markers',
                                   x=[cm[0][0] / total],  # TP/N
                                   y=[cm[1][1] / total],  # TN/N
                                   z=[cm[0][1] / total],  # FP/N
                                   marker=dict(color='black', size=5),
                                   showlegend=False))

    fig.show()

    stop = timeit.default_timer()
    print('Time: ', stop - start)

    # TODO: this is the result I want, but the points need to be more visible.

In [None]:
#merge the two visualisations 
#with DASH 


# First application: global landscape for classifiers 

Depending on my specific interests and application, I can plot the gradient for alternative metrics, which makes it possible to appreciate the different global behaviour of each numerical indicator. Moreover, the coloured tetrahedron makes it easy to understand why the information provided by the Matthews Correlation Coefficient ($MCC$) is more trustworthy than that of other performance metrics (e.g. $F_1$-score, accuracy).
 


In [None]:
# Same tetrahedron, coloured by the F1 score instead of the MCC.
# Ideally this becomes a Dash application where the user selects the metric
# used to colour the tetrahedron, adds a confusion matrix, and the app
# visualises it.

import warnings  # kept so the name stays available to any later cell that uses it

# The previous global warnings.filterwarnings('ignore') only served to hide the
# "setting a value on a slice" warning. Building the column with assign()
# below produces a new DataFrame, so that warning is no longer triggered and
# other warnings stay visible.

# F1 score
# F1 = 2*TP / (2*TP + FP + FN)
# Notice: independent of TN (number of samples correctly classified as negative).
# The formula uses x as the TP share and z as the FP share; the FN share is
# what is left: 1 - z - x - y.
fn_share = 1 - filtered['z'] - filtered['x'] - filtered['y']
f1_values = (2*filtered['x']) / (2*filtered['x'] + filtered['z'] + fn_share)

# Vectorised column arithmetic replaces the row-by-row apply; values are
# rounded to 3 decimals as before.
filtered = filtered.assign(F1=f1_values.round(3))

fig = px.scatter_3d(filtered,
                    x='x',
                    y='y',
                    z='z',
                    opacity=0.2,
                    color='F1')

# NOTE(review): 'circle-open' is a marker symbol, not a trace `mode`, so this
# selector probably matches no trace and the call has no effect - confirm intent.
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='circle-open'))

fig.show()

In [None]:
# Same tetrahedron, coloured by the accuracy score.

# accuracy = (TP + TN) / (TP + TN + FP + FN)
# x and y are used as the TP and TN shares, z as the FP share, and the FN share
# is what is left: 1 - z - x - y. The denominator therefore sums to 1 (up to
# floating-point rounding); it is kept explicit to mirror the formula.
fn_share = 1 - filtered['z'] - filtered['x'] - filtered['y']
accuracy_values = (filtered['x'] + filtered['y']) / (
    filtered['x'] + filtered['y'] + filtered['z'] + fn_share
)

# Vectorised column arithmetic replaces the row-by-row apply; assign() builds
# a new DataFrame instead of writing into a possible slice of another frame.
filtered = filtered.assign(accuracy=accuracy_values.round(3))

fig = px.scatter_3d(filtered,
                    x='x',
                    y='y',
                    z='z',
                    opacity=0.2,
                    color='accuracy')

# NOTE(review): 'circle-open' is a marker symbol, not a trace `mode`, so this
# selector probably matches no trace and the call has no effect - confirm intent.
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='circle-open'))

fig.show()
