# 1. Loading and reading of the cleaned dataset 

In [1]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import random 

from sklearn.linear_model import LogisticRegression

import math
from sklearn import svm


from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
data = pd.read_csv('data/cleaned.csv', header = 0, on_bad_lines='skip')
data = data.drop(data.columns[0], axis=1)
#there are only 25 features so no need to load only a subset of the features and no need to use chunksize

In [3]:
#starting dataset
data

Unnamed: 0,age,rbcc,pcv,hemo,bp,bgr,bu,sc,sod,pot,...,pe_no,pe_yes,sg_1.005,sg_1.010,sg_1.015,sg_1.020,sg_1.025,pcc_notpresent,pcc_present,class
0,-0.205954,0.583606,0.626585,1.043279,0.250168,-0.313799,-0.414139,-0.324225,0.040817,-0.064328,...,1,0,0,0,0,1,0,1,0,1
1,-2.640637,0.000000,-0.099558,-0.452605,-1.961241,-0.313799,-0.775077,-0.400624,0.040817,-0.064328,...,1,0,0,0,0,1,0,1,0,1
2,0.625402,0.000000,-0.946724,-1.072850,0.250168,3.719296,-0.073253,-0.209627,0.040817,-0.064328,...,1,0,0,1,0,0,0,1,0,1
3,-0.205954,-0.940620,-0.825700,-0.489090,-0.486968,-0.367217,-0.013096,0.172368,-2.843192,-0.726016,...,0,1,1,0,0,0,0,0,1,1
4,-0.027806,-0.119883,-0.462629,-0.343150,0.250168,-0.514118,-0.614660,-0.286026,0.040817,-0.064328,...,1,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,0.209724,0.231861,0.989656,1.152734,0.250168,-0.060061,-0.153461,-0.457924,1.322598,0.109800,...,1,0,0,0,0,1,0,1,0,0
382,-0.562249,1.756087,1.836822,1.444614,-0.486968,-0.928112,-0.514399,-0.324225,0.361262,-0.377759,...,1,0,0,0,0,0,1,1,0,0
383,-2.343724,0.818102,1.231703,1.189219,0.250168,-0.594246,-0.614660,-0.438824,-0.065998,-0.064328,...,1,0,0,0,0,1,0,1,0,0
384,-2.046812,1.404343,1.473751,0.605459,-1.224104,-0.407281,-0.133409,-0.362425,-0.279629,0.109800,...,1,0,0,0,0,0,1,1,0,0


# 1. defining vector of target and vector of features

In [4]:
#target variable
y = data['class']
y

#predictors
x = data.copy()
x.drop('class', inplace=True, axis=1)
x

Unnamed: 0,age,rbcc,pcv,hemo,bp,bgr,bu,sc,sod,pot,...,ba_present,pe_no,pe_yes,sg_1.005,sg_1.010,sg_1.015,sg_1.020,sg_1.025,pcc_notpresent,pcc_present
0,-0.205954,0.583606,0.626585,1.043279,0.250168,-0.313799,-0.414139,-0.324225,0.040817,-0.064328,...,0,1,0,0,0,0,1,0,1,0
1,-2.640637,0.000000,-0.099558,-0.452605,-1.961241,-0.313799,-0.775077,-0.400624,0.040817,-0.064328,...,0,1,0,0,0,0,1,0,1,0
2,0.625402,0.000000,-0.946724,-1.072850,0.250168,3.719296,-0.073253,-0.209627,0.040817,-0.064328,...,0,1,0,0,1,0,0,0,1,0
3,-0.205954,-0.940620,-0.825700,-0.489090,-0.486968,-0.367217,-0.013096,0.172368,-2.843192,-0.726016,...,0,0,1,1,0,0,0,0,0,1
4,-0.027806,-0.119883,-0.462629,-0.343150,0.250168,-0.514118,-0.614660,-0.286026,0.040817,-0.064328,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,0.209724,0.231861,0.989656,1.152734,0.250168,-0.060061,-0.153461,-0.457924,1.322598,0.109800,...,0,1,0,0,0,0,1,0,1,0
382,-0.562249,1.756087,1.836822,1.444614,-0.486968,-0.928112,-0.514399,-0.324225,0.361262,-0.377759,...,0,1,0,0,0,0,0,1,1,0
383,-2.343724,0.818102,1.231703,1.189219,0.250168,-0.594246,-0.614660,-0.438824,-0.065998,-0.064328,...,0,1,0,0,0,0,1,0,1,0
384,-2.046812,1.404343,1.473751,0.605459,-1.224104,-0.407281,-0.133409,-0.362425,-0.279629,0.109800,...,0,1,0,0,0,0,0,1,1,0


# 2. rationale, creation of M1 & M2, generation of the points

The idea here is to use the CT to visually compare the performance of a classifier in
two different resampling strategies, with and without stratification. Thus, first we build
(or find on the UCI for instance) an unbalanced dataset D, with XX samples of class 0
and YY samples of class 1

In [6]:
#definiamo XX =samples of class 0
#and       YY = samples of class 1

S = data.shape[0]

XX = data[data['class']==0].shape[0] 
print("Portion of negatives:", round(XX/S,2))
YY = data[data['class']==1].shape[0]
print("Portion of positives:", round(YY/S,2))

print("Se ho tempo ne cerco uno più sbilanciato ma intanto facciamo così")


Portion of negatives: 0.39
Portion of positives: 0.61
Se ho tempo ne cerco uno più sbilanciato ma intanto facciamo così


Then we build two sets M1, M2 of Montecarlo CV sets, each
consisting of 100 random splits of the original dataset D: M1 = {(T1i ,V 1i ) : 1 ≤i ≤100}
and M2 = {(T2i ,V 2i ) : 1 ≤ i ≤ j}, such that, for each i, Tki includes a random 75% (or a
different percentage) of D and V ki the remaining 25%, with the difference that, for k = 1
the split is completely random (not stratified), while for k = 2 the split is stratified, thus in
both T2i and V 2i the ratio between the samples of class 0 and the samples of class 1 is the
same as in the original dataset D.

Then we train a model (SVM, RF, etc) on $T^k_i$ and we apply it on $V^k_i$, computing the resulting performance (e.g., by MCC) on $V^k_i$.
Thus we end up with 100 values of MCC for the splits $M_1$ and 100 values of MCC for the splits $M_2$: now we put these points on the CT, and we should results in two clusters of points well separated, and those for the stratified case should be much better, in terms of performance, than those for the non stratified.

In [7]:

#quindi mi devo creare due set: 
# M1 - T_i,V_i top1 rispettivamente 75% e 25% -> 100 splits dello stesso dataset MA *stratified* (= mantiene proporzione tra 0 e 1 among training and validation)


# M2 - T_i,V_i top2 rispettivamente 75% e 25% -> 100s splits dello stesso dataset MA *random* (=non mantiene proporzione tra 0 e 1 among training and validation)


#èer ciascuno dei 200 datasets alleno un modello - >ottengo 200 confusion matrices -> queste confusion matrices le turn into ct coordinates 

#una volta ottenute le coordinate ci calcolo mcc e dovrò vedere che quelli dello stratified hanno tutti stesso colore 
#del MCC più alto di tutti i 1000 punti associati al random che avranno colore più basso 



In [8]:
#run Montecarlo simulation

#from sklearn.model_selection import ShuffleSplit
#from sklearn.model_selection import StratifiedShuffleSplit    #returns stratified randomized folds

#inizio con 10 split poi aumento


In [5]:
#mi definisco la funzione che poi uso a ogni fit del modello per confrontare predizioni con actual 

def generate_matrix_entries(y_actual, y_hat): #per generarmi la matrice in ogni volta che giro 
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y_actual[i]==y_hat[i]==1:                #se prediction = actual = 1 -> TP
           TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:   #se prediction = 1 but different from the actual -> FP
           FP += 1
        if y_actual[i]==y_hat[i]==0:                #se prediction = actual = 0 -> TN
           TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:   #se prediction = 0 but different from the actual -> FN
           FN += 1

    return (TN,FP,FN,TP)

## 2.1 Coordinates for the 100 models with STRATIFIED resampling  

if stratified k= 2 -> both training and validation the ratio between the samples of class 0 and the sample 1 is the same as in the original dataset ----> those for stratified case should be much better in terms of performance than non stratified

In [14]:
# M1 - T_i,V_i top1 rispettivamente 75% e 25% -> 100 splits dello stesso dataset MA *stratified* (= mantiene proporzione tra 0 e 1 among training and validation)

ss = StratifiedShuffleSplit(n_splits=2, test_size=.25, random_state=0) #stratified -> M1


In [15]:
#CON LINEAR REGRESSION 

ss_nCM =[]

ss_MCC =[]



for strat_train_index, strat_test_index in ss.split(x, y): 
    strat_train_set_x = x.loc[strat_train_index]  
    strat_test_set_x = x.loc[strat_test_index]
    
    strat_train_set_y = y.loc[strat_train_index]
    strat_test_set_y = y.loc[strat_test_index]
    
 
    logreg = LogisticRegression()


    logreg.fit(strat_train_set_x, strat_train_set_y)

    y_pred = np.array(logreg.predict(strat_test_set_x))
    
    (TN, FP, FN, TP) = generate_matrix_entries(strat_test_set_y.values, y_pred)
    
    N = sum([TN, FP, FN, TP])
    each_CM = np.array([TN, FP,FN, TP])

    each_nCM = each_CM/N
    ss_nCM.append(each_nCM)
    
    
    each_MCC = (TP*TN -FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    ss_MCC.append(each_MCC)
    
    
#ss_nCM #qui ho le due (che poi saranno 100 nCM)

PD_ss_nCM = pd.DataFrame(ss_nCM) 


SS_coords = PD_ss_nCM.drop([2],axis=1) #coordinata di FN non ci interessa

SS_coords.rename({0: 'TN', 1: 'FP', 3: 'TP'}, axis=1, inplace=True)

SS_coords['MCC']= ss_MCC
                                        
SS_coords

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,TN,FP,TP,MCC
0,0.381443,0.0,0.587629,0.937417
1,0.371134,0.010309,0.608247,0.956306


In [17]:
#CON SUPPORT VECTOR MACHINES - esce finchè sono con k=5 appena salgo crasha 
#per stratified riesce anche con 10 ma non per random 


"""
ss_nCM =[]

ss_MCC =[]



for strat_train_index, strat_test_index in ss.split(x, y): 
    strat_train_set_x = x.loc[strat_train_index]  
    strat_test_set_x = x.loc[strat_test_index]
    
    strat_train_set_y = y.loc[strat_train_index]
    strat_test_set_y = y.loc[strat_test_index]
    

    vect = svm.SVC(kernel='linear')

    vect.fit(strat_train_set_x, strat_train_set_y)

    y_pred = np.array(vect.predict(strat_test_set_x))
    
    (TN, FP, FN, TP) = generate_matrix_entries(strat_test_set_y.values, y_pred)
    
    each_CM = np.array([TN, FP,FN, TP])

    each_nCM = each_CM/N
    ss_nCM.append(each_nCM)
    
    
    each_MCC = (TP*TN -FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    ss_MCC.append(each_MCC)
    
    
#ss_nCM #qui ho le due (che poi saranno 100 nCM)

PD_ss_nCM = pd.DataFrame(ss_nCM) 


SS_coords = PD_ss_nCM.drop([2],axis=1) #coordinata di FN non ci interessa

SS_coords.rename({0: 'TN', 1: 'FP', 3: 'TP'}, axis=1, inplace=True)

SS_coords['MCC']= ss_MCC
                                        
SS_coords

"""

Unnamed: 0,TN,FP,TP,MCC
0,0.093264,0.002591,0.15285,0.956306
1,0.090674,0.005181,0.150259,0.912613
2,0.093264,0.002591,0.15544,0.978275
3,0.082902,0.012953,0.15544,0.893497
4,0.093264,0.002591,0.15285,0.956306
5,0.085492,0.010363,0.15544,0.914412
6,0.090674,0.005181,0.150259,0.912613
7,0.095855,0.0,0.147668,0.937417
8,0.093264,0.002591,0.15285,0.956306
9,0.090674,0.005181,0.15544,0.956782


In [None]:
#DOMANDA: dal dataset tolgo il test set ultimo finale? 

#Io per ora ho utilizzato il dataset iniziale intero (distinguendo features e target)
#e questo poi l'ho diviso in 100 splits in maniera stratified per M1 e random per M2
#da capire se andava tolto 

In [18]:
"""

for str_train_index, str_test_index in ss.split(x, y): #returns 2 list of all the splits for X and Y (am)
    str_train_set = x.loc[str_train_index]  #loc=rows which are having the index (vs iloc che fa columns)
    str_test_set = x.loc[str_test_index]
    
    print(len(str_test_set.value_counts())) #-> 97 perchè 25% di 386
    print(len(str_train_set.value_counts())) #-> 289 perchè 75% di 386
    
#quindi questi sono i miei 10(poi 100) splits del dataset --- qui stratified - ora stessa cosa per random

"""

'\n\nfor str_train_index, str_test_index in ss.split(x, y): #returns 2 list of all the splits for X and Y (am)\n    str_train_set = x.loc[str_train_index]  #loc=rows which are having the index (vs iloc che fa columns)\n    str_test_set = x.loc[str_test_index]\n    \n    print(len(str_test_set.value_counts())) #-> 97 perchè 25% di 386\n    print(len(str_train_set.value_counts())) #-> 289 perchè 75% di 386\n    \n#quindi questi sono i miei 10(poi 100) splits del dataset --- qui stratified - ora stessa cosa per random\n\n'

## 2.2 Coordinates for the 100 models with RANDOM resampling  

if random k=1 -> the split is random (does not keep the ratio betwen the sample of class 0 and samples of class 1 among training and validation)

In [19]:
# M2 - T_i,V_i top2 rispettivamente 75% e 25% -> 100s splits dello stesso dataset MA *random* (=non mantiene proporzione tra 0 e 1 among training and validation)

rs = ShuffleSplit(n_splits=10, test_size=.25, random_state=0) #random  -> M2


In [2]:
"""

for rand_train_index, rand_test_index in rs.split(x, y): #returns 2 list of all the splits for X and Y (am)
    rand_train_set = x.loc[rand_train_index]  #loc=rows which are having the index (vs iloc che fa columns)
    rand_test_set = x.loc[rand_test_index]
    
    print(len(rand_test_set.value_counts())) #-> 97 perchè 25% di 386
    print(len(rand_train_set.value_counts())) #-> 289 perchè 75% di 386 
    
#quindi questi sono i miei 10(poi 100) splits del dataset --- qui randomici - now off to modelli

"""

'\n\nfor rand_train_index, rand_test_index in rs.split(x, y): #returns 2 list of all the splits for X and Y (am)\n    rand_train_set = x.loc[rand_train_index]  #loc=rows which are having the index (vs iloc che fa columns)\n    rand_test_set = x.loc[rand_test_index]\n    \n    print(len(rand_test_set.value_counts())) #-> 97 perchè 25% di 386\n    print(len(rand_train_set.value_counts())) #-> 289 perchè 75% di 386 \n    \n#quindi questi sono i miei 10(poi 100) splits del dataset --- qui randomici - now off to modelli\n\n'

In [None]:
rs_nCM =[]

rs_MCC =[]



for rand_train_index, rand_test_index in rs.split(x, y): 
    rand_train_set_x = x.loc[rand_train_index]  
    rand_test_set_x = x.loc[rand_test_index]
    
    rand_train_set_y = y.loc[rand_train_index]
    rand_test_set_y = y.loc[rand_test_index]
    

    vect = svm.SVC(kernel='linear')

    vect.fit(rand_train_set_x, rand_train_set_y)

    y_pred = np.array(vect.predict(rand_test_set_x))
    
    (TN, FP, FN, TP) = generate_matrix_entries(rand_test_set_y.values, y_pred)
    
    each_CM = np.array([TN, FP,FN, TP])

    each_nCM = each_CM/N
    rs_nCM.append(each_nCM)
    
    
    each_MCC = (TP*TN -FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    rs_MCC.append(each_MCC)
    
    
    
rs_nCM #qui ho le due (che poi saranno 100 nCM)

PD_rs_nCM = pd.DataFrame(rs_nCM) 


RS_coords = PD_rs_nCM.drop([2],axis=1) #coordinata di FN non ci interessa

RS_coords.rename({0: 'TN', 1: 'FP', 3: 'TP'}, axis=1, inplace=True)

RS_coords['MCC']= rs_MCC
                                        
RS_coords

# 3. Plot dei risultati nel tetraedro

Una volta che ottengo le coordinate e gli MCC per ciascun punto qui è già pronto il codice per plottare il risultato, il problema è riuscire a ottenere le 200 confusion matices e(e quindi 200 punti nel tetraedro)

In [None]:
#QUESTO POI HA GIà PRONTO IL CODICE PER PLOTTARE IL RISULTATO 


fig = go.Figure(data=go.Scatter3d(mode = 'markers',
    x= coords['TP'], 
    y= coords['TN'], 
    z=coords['FP'],
    marker=dict(
        size=3, #pallini per quelli assocaiti al threshold quindi in realtà
        color=coord['MCC'],
        colorscale='Viridis',
    ),
    line=dict(
        color='green',
        width=3
    )
))


fig.update_layout(
    width=800,
    height=700,
    autosize=False
)


fig.add_trace(go.Figure(data=[
    go.Mesh3d(
        x=[0, 0, 0, 1], 
        y=[0, 0, 1, 0],
        z=[0, 1, 0, 0],

        color ='lightgrey',
        
        opacity = 0.2, #for transparency
        
        # i, j and k sono i vertici dei triangoli
        # here we represent the 4 triangles of the tetrahedron surface
        i=[0, 0, 0, 1],
        j=[1, 2, 3, 2],
        k=[2, 3, 1, 3],
        name='y',
        showscale=True
    )
]).data[0])



fig.show()

Eventually, I should obtain that the points for the 100 models trained on the 100 splits of the dataset are: \n
* better clustered for the case in which te 100 splits have been stratified (i.e same proportion of 0/1 among training and validation) 
* less clustered for the random case 
