In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [9]:
glass = pd.read_csv('glass.csv')
glass.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [3]:
# Dividir o dados para treinamento e teste
train, test = train_test_split(glass, test_size=.4, random_state=123)

print('train {0}'.format(train.shape))
print('test {0}'.format(test.shape))

X_train = train.drop(['Type'], axis=1)
y_train = train['Type']

X_test = test.drop(['Type'], axis=1)
y_test = test['Type']

print('X_train {0}'.format(X_train.shape))
print('y_train {0}'.format(y_train.shape))
print('X_test {0}'.format(X_test.shape))
print('y_test {0}'.format(y_test.shape))

train (128, 10)
test (86, 10)
X_train (128, 9)
y_train (128,)
X_test (86, 9)
y_test (86,)


In [4]:
model = XGBClassifier()
model.fit(X_train, y_train)
# make predictions for test data
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.71      0.74      0.72        23
           2       0.67      0.90      0.76        29
           3       0.00      0.00      0.00         6
           5       0.75      0.38      0.50         8
           6       1.00      0.50      0.67         4
           7       0.81      0.81      0.81        16

   micro avg       0.71      0.71      0.71        86
   macro avg       0.66      0.55      0.58        86
weighted avg       0.68      0.71      0.68        86



In [5]:
# K-fold cross validation
# k=10 sobre o set de treinamento, bagging
X = train.drop(['Type'], axis=1)
y = train.Type
scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
print('accuracy:', scores)
print('avg accuracy:', scores.mean())



accuracy: [0.75       0.73333333 0.73333333 0.78571429 0.71428571 0.5
 0.75       0.7        0.6        0.6       ]
avg accuracy: 0.6866666666666666


In [6]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
# avaliando o algoritmo
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.67      0.70      0.68        23
           2       0.68      0.72      0.70        29
           3       0.40      0.33      0.36         6
           5       0.83      0.62      0.71         8
           6       0.60      0.75      0.67         4
           7       0.93      0.88      0.90        16

   micro avg       0.71      0.71      0.71        86
   macro avg       0.69      0.67      0.67        86
weighted avg       0.71      0.71      0.71        86



In [7]:
# K-fold cross validation
# k=10 sobre o set de treinamento, clf
X = train.drop(['Type'], axis=1)
y = train.Type
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('accuracy:', scores)
print('avg accuracy:', scores.mean())

accuracy: [0.75       0.46666667 0.8        0.64285714 0.42857143 0.33333333
 0.66666667 0.7        0.6        0.5       ]
avg accuracy: 0.5888095238095238




In [16]:
from sklearn.metrics import confusion_matrix

# Haciendo la matriz de confusion binaria para la clase i
def MultiClassConfusionMatrix(y_true,y_pred):
    
    C= np.unique(y_true)
    D=len(C)
    
    # Matriz de confusion general 
    CM=confusion_matrix(y_true, y_pred)
    #print('###### General Confusion Matrix #####')
    #print(CM)
        
    accuracy=np.zeros(D)
    precision=np.zeros(D)
    recall=np.zeros(D)
    specificity=np.zeros(D)
    clase = []#np.zeros(D, dtype = int)
    
    
    for i in range(D):
        #atrib=np.array(C)
        #print('aquiii')
        #print(C)
        atributo=C[i]
        row_i=CM[i,:]
        col_i=CM[:,i]
        
        row_i_without_i=np.delete(row_i,i,0)
        #print(row_i_without_i)
        col_i_without_i=np.delete(col_i,i,0)
        del_row_i=np.delete(CM,i,0)
        del_col_i=np.delete(del_row_i,i,1)
        
        VP=CM[i,i]
        #print(VP)
        FN=np.sum(row_i_without_i)
        #print(VN)
        FP=np.sum(col_i_without_i)
        VN=np.sum(del_col_i)
#         print('VP VN FP FN', VP, VN,FP,FN )
        
        CM_new=[[VP,FN],[FP,VN]]
        #print(CM_new)
        CM_new=np.array(CM_new) # casting

        # calculando las medidas de desempenho
        div1=VP+VN+FP+FN
        #print(div1)
        div2=VP+FP
        div3=VP+FN
        div4=VN+FP
        
        accuracy[i]=save_value((VP+VN),div1)
        precision[i]=save_value(VP,div2)
        recall[i]=save_value(VP,div3)
        specificity[i]=save_value(VN,div4)        
        clase.append(atributo)
        
        
        #print('###### Confusion Matrix para clase ',atributo, ' #####')
        #print(CM_new)    
        
    Table = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'specificity': specificity, 'clase': clase}
    df = pd.DataFrame(data=Table)
    print(df)
    aux = df.sum(axis = 0, skipna = True)
    #print('df.shape')
    #print(df.shape[0])
    mean = aux[0] / df.shape[0]
    print('   mean accuracy: ', mean)
    #print(accuracy.shape)
#     return accuracy
    #print(df)

In [17]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


glass = pd.read_csv('glass.csv')
glass_data = glass.values
#X = df.iloc[:, 0:4]
#y = df.iloc[:, 4]

y_glass = glass_data[:, 9]
x_glass = glass_data[:, 0:9]
#y_glass = glass.iloc[:, 10]
#x_glass = glass.iloc[:, 1:10]
#x_glass_data = x_glass.values
#y_glass_data = y_glass.values


best_accuracy_f = 0
best_pred_f     = []
best_target_f   = []

best_accuracy_t = 0
best_pred_t     = []
best_target_t   = []

kf = KFold(n_splits = 10, shuffle = True)

for train_index, test_index in kf.split(x_glass):
    #x_train, x_test = x_glass.iloc[train_index], x_glass.iloc[test_index]
    #y_train, y_test = y_glass.iloc[train_index], y_glass.iloc[test_index]
    
    x_train, x_test = x_glass[train_index], x_glass[test_index]
    y_train, y_test = y_glass[train_index], y_glass[test_index]
    
    model = XGBClassifier()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    current_accuracy = accuracy_score(y_test, y_pred)
    if current_accuracy > best_accuracy_f:
        best_accuracy_f = current_accuracy
        best_pred_f     = y_pred
        best_target_f   = y_test
    
    
    t_clf = DecisionTreeClassifier(criterion = 'gini')
    t_clf.fit(x_train, y_train)
    y_pred_t = t_clf.predict(x_test)        
    current_accuracy_t = accuracy_score(y_test, y_pred_t)
    
    if current_accuracy_t > best_accuracy_t:
        best_accuracy_t = current_accuracy        
        best_pred_t     = y_pred_t
        best_target_t   = y_test

        
print('SALIDA RANDOM FOREST')
MultiClassConfusionMatrix(best_target_f, best_pred_f)

print('SALIDA ONLY TREE')
MultiClassConfusionMatrix(best_target_t, best_pred_t)

SALIDA RANDOM FOREST


NameError: name 'save_value' is not defined