> Name: 張宸愷
> ID: 0710018

In [144]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

#-------------------------#
# Some parameters
do_PCA = True     # when True, each fold is projected with PCA before training
bin_5_classf = 1  # 0: binary pass/fail target, 1: five-level (I..V) target
PCA_dim = 2       # number of principal components kept when do_PCA is True

#-------------------------#

df = pd.read_csv("data/student-mat.csv", sep=";")  # read csv. I used the mathematics dataset

cats = df.select_dtypes(include=[object])  # select categorical features
num = df.select_dtypes(include=[int])  # select numerical features
# One-hot encode with an explicit integer dtype. pandas >= 2.0 produces bool
# dummy columns by default; mixed with the int columns below they would make
# data_orig.to_numpy() an object array, which scikit-learn does not accept as
# class labels.
cats = pd.get_dummies(cats, dtype=int)

# combine numerical and categorical data
# NOTE: later cells take the last column as the target, which assumes G3 is the
# last integer column in the CSV.
data_orig = pd.concat([cats, num], axis=1)

# Replace the raw final grade G3 with a class index; `classes` lists the
# display names in index order.
if bin_5_classf == 0:

    classes = ["pass", "fail"]
    # A grade of exactly 10 is a pass, consistent with level IV (10-11) being
    # the lowest passing level in the five-level scheme below.
    data_orig['G3'] = data_orig['G3'].apply(
        lambda x: 0 if x >= 10 else 1)

elif bin_5_classf == 1:
    classes = ["I", "II", "III", "IV", "V"]

    def conv(x):
        """Map a raw grade to its five-level class index (0 = I ... 4 = V)."""
        if x >= 16:
            return 0
        if x >= 14:
            return 1
        if x >= 12:
            return 2
        if x >= 10:
            return 3

        return 4

    data_orig['G3'] = data_orig['G3'].apply(conv)

else:
    # Fail here rather than with a NameError on `classes` further down.
    raise ValueError(
        "bin_5_classf must be 0 (binary) or 1 (five-level), got %r" % (bin_5_classf,))



# Plain NumPy matrix of the encoded frame (binary or five-level target,
# depending on bin_5_classf); later cells slice the last column off as the target.
data = data_orig.to_numpy()
# No manual shuffling here: every KFold below is created with shuffle=True.
clss = len(classes)  # number of target classes
feature_num = data.shape[1]-1  # every column except the last one


In [145]:
# Decision tree

def r2np(r: dict):
    """Turn a ``classification_report(..., output_dict=True)`` dict into an array.

    Row ``i`` holds ``[precision, recall, accuracy]`` for the class whose key in
    the report is ``str(i)``; the overall accuracy is repeated on every row.
    The number of rows is ``len(classes)`` (module-level list of class names).

    Raises:
        KeyError: if the report lacks one of the class keys or ``"accuracy"``.
    """
    n_classes = len(classes)
    rows = [
        [r[str(label)]["precision"], r[str(label)]["recall"], r["accuracy"]]
        for label in range(n_classes)
    ]
    # reshape keeps the (n_classes, 3) shape even when there are no rows
    return np.array(rows, dtype=float).reshape(n_classes, 3)


def decision_t(X_train:np.ndarray, y_train: np.ndarray, X_test:np.ndarray, y_test: np.ndarray):
    """Train a default decision tree on one split and score it on the held-out part.

    Returns:
        (report, matrix, model): the ``classification_report`` dict, the
        confusion matrix of ``y_test`` against the predictions, and the fitted
        ``DecisionTreeClassifier``.
    """
    model = tree.DecisionTreeClassifier()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    report = classification_report(y_test, predicted, output_dict=True)
    matrix = confusion_matrix(y_test, predicted)
    return report, matrix, model


kfold = KFold(n_splits=3, shuffle=True)
T_cls = None  # ends up holding the tree trained on the last fold
rs = np.zeros((len(classes), 3))
cs = np.zeros((clss, clss))
# n_folds counts the folds actually run, so the averages below stay correct
# if n_splits is changed.
for n_folds, (train_index, test_index) in enumerate(kfold.split(data), start=1):
    train, test = data[train_index], data[test_index]
    X_train, y_train, X_test, y_test = train[:,0:-1], train[:,-1], test[:,0:-1], test[:,-1]
    pca = PCA(n_components=PCA_dim)
    if do_PCA:
        # fit the projection on the training fold only, then reuse it on the test fold
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    r, confm, T_cls = decision_t(X_train, y_train, X_test, y_test)
    rs += r2np(r)
    cs += confm
# get average
rs = rs/n_folds
cs = cs/n_folds


# show data
rs = pd.DataFrame(rs, index=classes, columns=[
                  'precision', 'recall', 'accuracy'])
print("report classification")
print(rs)
cs = pd.DataFrame(cs, index=["true " + x for x in classes],
                  columns=["pred " + x for x in classes])
print("")
print("confusion matrix")
print(cs)


report classification
     precision    recall  accuracy
I     0.753429  0.766667  0.615159
II    0.541999  0.597222  0.615159
III   0.425926  0.447012  0.615159
IV    0.525694  0.519319  0.615159
V     0.785815  0.742848  0.615159

confusion matrix
             pred I    pred II  pred III    pred IV     pred V
true I    10.333333   2.666667  0.333333   0.000000   0.000000
true II    2.666667  12.000000  5.000000   0.333333   0.000000
true III   0.333333   5.333333  9.333333   5.333333   0.333333
true IV    0.000000   1.666667  6.000000  17.666667   9.000000
true V     0.000000   0.000000  1.000000  10.666667  31.666667


In [146]:
# Random forest

from sklearn.ensemble import RandomForestClassifier

#-------------------------#
# some parameters
num_trees = 50  # number of trees passed to RandomForestClassifier


#-------------------------#

def random_f(X_train:np.ndarray, y_train: np.ndarray, X_test:np.ndarray, y_test: np.ndarray):
    """Train a random forest on one split and score it on the held-out part.

    The forest size comes from the module-level ``num_trees`` setting.

    Returns:
        (report, matrix, model): the ``classification_report`` dict, the
        confusion matrix of ``y_test`` against the predictions, and the fitted
        ``RandomForestClassifier``.
    """
    R_cls = RandomForestClassifier(n_estimators=num_trees)
    R_cls.fit(X_train, y_train)
    y_pred = R_cls.predict(X_test)
    r = classification_report(y_test, y_pred, output_dict=True)
    c = confusion_matrix(y_test, y_pred)
    # Return the fitted model (previously None), matching decision_t and knn_r.
    return r, c, R_cls

kfold = KFold(n_splits=3, shuffle=True)
R_cls = None  # receives the third value returned by random_f for the last fold
rs = np.zeros((len(classes), 3))
cs = np.zeros((clss, clss))
# n_folds counts the folds actually run, so the averages below stay correct
# if n_splits is changed.
for n_folds, (train_index, test_index) in enumerate(kfold.split(data), start=1):
    train, test = data[train_index], data[test_index]
    X_train, y_train, X_test, y_test = train[:,0:-1], train[:,-1], test[:,0:-1], test[:,-1]
    pca = PCA(n_components=PCA_dim)
    if do_PCA:
        # fit the projection on the training fold only, then reuse it on the test fold
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    r, confm, R_cls = random_f(X_train, y_train, X_test, y_test)
    rs += r2np(r)
    cs += confm
# average over the folds
rs = rs / n_folds
cs = cs / n_folds


# show data
rs = pd.DataFrame(rs, index=classes, columns=[
                  'precision', 'recall', 'accuracy'])
print("report classification")
print(rs)
cs = pd.DataFrame(cs, index=["true " + x for x in classes],
                  columns=["pred " + x for x in classes])
print("")
print("confusion matrix")
print(cs)





report classification
     precision    recall  accuracy
I     0.770940  0.729167  0.663351
II    0.596686  0.622807  0.663351
III   0.504926  0.514245  0.663351
IV    0.578078  0.605470  0.663351
V     0.817413  0.786905  0.663351

confusion matrix
            pred I    pred II   pred III    pred IV     pred V
true I    9.666667   3.333333   0.333333   0.000000   0.000000
true II   3.000000  12.333333   4.000000   0.666667   0.000000
true III  0.000000   4.333333  11.000000   5.000000   0.333333
true IV   0.000000   0.666667   5.666667  20.333333   7.666667
true V    0.000000   0.000000   0.000000   9.333333  34.000000


In [147]:
from sklearn.neighbors import KNeighborsClassifier

#-----------------------#
# some parameters
K_n = 3  # n_neighbors passed to KNeighborsClassifier in knn_r

#-----------------------#


def knn_r(X_train:np.ndarray, y_train: np.ndarray, X_test:np.ndarray, y_test: np.ndarray):
    """Train a k-nearest-neighbours classifier on one split and score it.

    The neighbour count comes from the module-level ``K_n`` setting.

    Returns:
        (report, matrix, model): the ``classification_report`` dict, the
        confusion matrix of ``y_test`` against the predictions, and the fitted
        ``KNeighborsClassifier``.
    """
    model = KNeighborsClassifier(n_neighbors=K_n)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    report = classification_report(y_test, predicted, output_dict=True)
    matrix = confusion_matrix(y_test, predicted)
    return report, matrix, model
    

kfold = KFold(n_splits=3, shuffle=True)
knn = None  # ends up holding the classifier trained on the last fold
rs = np.zeros((len(classes), 3))
cs = np.zeros((clss, clss))
# n_folds counts the folds actually run, so the averages below stay correct
# if n_splits is changed.
for n_folds, (train_index, test_index) in enumerate(kfold.split(data), start=1):
    train, test = data[train_index], data[test_index]
    X_train, y_train, X_test, y_test = train[:,0:-1], train[:,-1], test[:,0:-1], test[:,-1]
    pca = PCA(n_components=PCA_dim)
    if do_PCA:
        # fit the projection on the training fold only, then reuse it on the test fold
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    r, confm, knn = knn_r(X_train, y_train, X_test, y_test)
    rs += r2np(r)
    cs += confm
# average over the folds
rs = rs / n_folds
cs = cs / n_folds


# show data
rs = pd.DataFrame(rs, index=classes, columns=[
                  'precision', 'recall', 'accuracy'])
print("report classification")
print(rs)
cs = pd.DataFrame(cs, index=["true " + x for x in classes],
                  columns=["pred " + x for x in classes])
print("")
print("confusion matrix")
print(cs)








report classification
     precision    recall  accuracy
I     0.835017  0.614286  0.627882
II    0.538604  0.644276  0.627882
III   0.404101  0.404539  0.627882
IV    0.565876  0.518492  0.627882
V     0.819258  0.823954  0.627882

confusion matrix
            pred I    pred II  pred III    pred IV     pred V
true I    8.000000   4.666667  0.666667   0.000000   0.000000
true II   1.666667  13.000000  4.666667   0.666667   0.000000
true III  0.000000   5.333333  8.333333   6.666667   0.333333
true IV   0.000000   2.666667  6.333333  17.666667   7.666667
true V    0.000000   0.000000  0.666667   7.000000  35.666667


# Questions

## Q1 Decision Tree
> Show the prediction and reasoning of one arbitrary sample in the testing set. - 10%



In [None]:

# Label the tree's split features with what it was actually trained on.
# With do_PCA set, the inputs are principal components, not original columns.
if do_PCA:
    tree_feature_names = ["PC" + str(i + 1) for i in range(PCA_dim)]
else:
    # the last column is the target and is never a model input
    tree_feature_names = list(data_orig.columns[:-1])

fig = plt.figure(figsize=(25, 20), dpi=100)
_ = tree.plot_tree(T_cls, feature_names=tree_feature_names,
                   class_names=classes,
                   filled=True)
