In [24]:
from sklearn import tree
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

In [22]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [34]:
datasets = {
    "postop" : pd.read_csv("post-operative.data", header=None),
    "cmc" : pd.read_csv("cmc.data", header=None),
    "adult" : pd.read_csv("adult.data", header=None)
}

for key in datasets:
    
    dataset = datasets[key]
    

    # Ignore whitespace in obs
    for col in dataset.columns:
        if dataset[col].dtype == type(object):
            le = preprocessing.LabelEncoder()
            dataset[col] = dataset[col].str.strip()  
            dataset[col] = le.fit_transform(dataset[col])
    
    print(key)
    print(dataset.head())
    
    if(key == "adult"):
        train = datasets["adult"]
        test = pd.read_csv("adult.test", header=None)
        for col in test.columns:
            if test[col].dtype == type(object):
                le = preprocessing.LabelEncoder()
                test[col] = dataset[col].astype(str).str.strip()  
                test[col] = le.fit_transform(test[col])
    else:
        train, test = train_test_split(dataset, test_size=0.2)
    
    train_X, train_y = train.values[:,:-1], train.values[:,-1]
    test_X, test_y = test.values[:, :-1], test.values[:,-1]
    
    clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=42)
    clf = clf.fit(train_X, train_y)
    
    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(train_X, train_y)
    
    score = clf.score(test_X, test_y)
    print("ID3 : ", score)
    
    knnScore = knn.score(test_X, test_y)
    print("KNN : ", knnScore)
    
    id3Pred = clf.predict(test_X)
    knnPred = knn.predict(test_X)
    
    plot_confusion_matrix(test_y, id3Pred, classes=[[1,2,3,4]], normalize=True,
                      title='Normalized confusion matrix - Decision-Tree-Scikit - {}'.format(key))
    plt.show()
    
    plot_confusion_matrix(test_y, knnPred, classes=np.unique(dataset.values[:,-1]), normalize=True,
                      title='Normalized confusion matrix - KNN-Scikit - {}'.format(key))
    plt.show()

postop
   0  1  2  3  4  5  6  7  8
0  2  1  0  2  0  1  1  3  0
1  2  0  0  0  0  1  1  2  2
2  0  1  0  0  0  1  0  2  0
3  2  1  1  0  0  2  0  3  0
4  2  2  0  0  0  1  1  2  0
ID3 :  0.8333333333333334
KNN :  0.7777777777777778


TypeError: only integer scalar arrays can be converted to a scalar index