# COGS 118A Final Project
Binh Nguyen

In [4]:
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import string
import csv
%matplotlib inline

Datasets acquired from UCI Machine Learning Repository:
https://archive.ics.uci.edu/ml/datasets

## Load datasets


### 1. LETTER Dataset
https://archive.ics.uci.edu/ml/datasets/letter+recognition

In [24]:
# Size = 20000x17
# N = 20000
# D = 16 features
data_letter = np.genfromtxt('letter-recognition.data', delimiter = ',', dtype=None, encoding=None)
n_letter = np.size(data_letter)
d_letter = 16
X_letter = np.zeros((n_letter,d_letter))

# Create X Matrix from numpy.void type
for i in range(n_letter):
    for j in range(d_letter):
        X_letter[i][j] = data_letter[i][j+1]
        
# Create y vector with labels for each letter
y_letter = []
for i in range(n_letter):
    y_letter.append(data_letter[i][0])

# Convert letters to positive and negative values

# A-M as positive (+) = 1
positive = list(string.ascii_uppercase)[0:13]

# N-Z as negative (-) = 0
negative = list(string.ascii_uppercase)[13:26]

y_letter = np.asarray(y_letter)

for i in range(len(y_letter)):
    for j in range(len(positive)):
        if y_letter[i] == positive[j]:
            y_letter[i] = 1
        elif y_letter[i] == negative[j]:
            y_letter[i] = 0

y_letter = y_letter.astype(int)

### 2. Indian Pines dataset
http://www.ehu.eus/ccwintco/index.php?title=Hyperspectral_Remote_Sensing_Scenes


In [None]:
# 145 x 145 x 200 multi-dimensional array
# 145 x 145 pixel images
# 200 samples
# class 11: Soybean-minmill as positive, rest as negative
mat = sio.loadmat('Indian_pines_corrected.mat')
values_ip = mat.values()
X_ip = values_ip[1][0:][0:]

# Load ground truth (y labels)
mat2 = sio.loadmat('Indian_pines_gt.mat')
values2_ip = mat2.values()
y_ip = values2_ip[2][0:][0:]

# Convert class 11 (Soybean-mintill) labels in ground truth as positive
y_ip[y_ip[0:] == 11] = 1
y_ip[y_ip[0:] != 1] = 0

# Convert 3-D array to 2-D array
X_ip = X_ip.transpose(2,0,1).reshape(21025, -1)
y_ip = y_ip.reshape(-1)

### Yeast dataset
https://archive.ics.uci.edu/ml/datasets/Yeast

In [None]:
# n = 1484 datapoints
# d = 8 features
data_yeast = np.genfromtxt('yeast.data.csv', delimiter = ',', dtype=None, encoding=None)
n_yeast = np.size(data_yeast)
d_yeast = 8
X_yeast = np.zeros((n_yeast,d_yeast))

# Create X Matrix from numpy.void type
for i in range(n_yeast):
    for j in range(d_yeast):
        X_yeast[i][j] = data_yeast[i][j+1]
        
# Create y vector with labels for each letter
y_yeast = []
for i in range(n_yeast):
    y_yeast.append(data_yeast[i][d_yeast+1])
    
not_nuclear = ['CYT','MIT','ME3','ME2','ME1','EXC','VAC','POX','ERL']

y_yeast = np.asarray(y_yeast)

for i in range(len(y_yeast)):
    for j in range(len(not_nuclear)):
        if y_yeast[i] == 'NUC':
            y_yeast[i] = 1
        elif y_yeast[i] == not_nuclear[j]:
            y_yeast[i] = 0

y_yeast = y_yeast.astype(int)

## Model Functions

### K-Fold Cross validation

In [49]:
def CV(X, y, folds, test_size, model, values):
    ''' 
    K-Fold Cross Validation:
    values = parameters to test in 1-dimensional array
        -e.g. values = [1, 10, 100, 100]
    X = training data
        -e.g X = X_letter
    folds = k number of folds
        -e.g. folds = 5 % for 5-fold CV
    test_size = percent of training data to be tested
        -e.g. test_size = 0.2 % 20% of training data as validation set
    model = scikit.learn classifier function
        -e.g. model= BAG_DT(values, max_samp, max_feat)
            % For bagging decision tree
    '''
    n = len(X)
    kf = KFold(n_splits = folds)
    splits = kf.get_n_splits()
    optimal_p = values[0]
    count = 0
    avg_acc = np.zeros(len(values))
    avg_train_acc = np.zeros(len(values))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size)
    
    # Iterate over K-Fold cross validation
    for i in values:
        fold = 0
        score_test = np.zeros(splits)
        score_train = np.zeros(splits)
        print "\np =", i
        
    # Test Validation error for each fold
        for train_idx, test_idx in kf.split(X_train):
            fold += 1
            X_train_set, X_test_set = X_train[train_idx], X_train[test_idx]
            y_train_set, y_test_set = y_train[train_idx], y_train[test_idx]
            
            clf = model
            clf = clf.fit(X_train_set, y_train_set)
            
            # Save accuracy to vector
            score_train[fold - 1] = clf.score(X_train_set, y_train_set)
            score_test[fold - 1] = clf.score(X_test_set, y_test_set)
            
        avg_train_acc[count] = np.average(score_train)
        avg_acc[count] = np.average(score_test)
        
        print "Avg training accuracy: %f" % (avg_train_acc[count])
        print "Avg validation accuracy %f" % (avg_acc[count])
        count += 1
    
    index_optimal_p = avg_acc.tolist().index(max(avg_acc))
    optimal_p = values[index_optimal_p]
    
    print "\nOptimal p:", optimal_p
    print "Best validation accuracy:", np.amax(avg_acc)

### Bagging Decision Tree

In [30]:
def BAG_DT(i, max_samp, max_feat):
    clf = BaggingClassifier(
        DecisionTreeClassifier(max_depth = i),
            max_samples = max_samp,
            max_features = max_feat)
    return clf

In [47]:
values = [1, 10, 100, 1000]
CV(X_letter, y_letter, 2, 0.2, BAG_DT(i, 0.5, 0.5), values)


p = 1
Avg training accuracy: 0.984375
Avg validation accuracy 0.914187

p = 10
Avg training accuracy: 0.982500
Avg validation accuracy 0.912625

p = 100
Avg training accuracy: 0.982812
Avg validation accuracy 0.911188

p = 1000
Avg training accuracy: 0.985563
Avg validation accuracy 0.916500

Optimal p: 1000
Best validation accuracy: 0.9165000000000001


### Artificial Neural Network

In [53]:
def ANN(hidden_units, i):
    clf = MLPClassifier(hidden_units ,alpha = 1)
    return clf

In [54]:
values = [0,0.2,0.5,0.9]
hidden_units = [100]
CV(X_letter, y_letter, 2, 0.2, ANN(hidden_units, i), values)

[0, 0.2, 0.5, 0.9]

p = 0
Avg training accuracy: 0.889875
Avg validation accuracy 0.886813

p = 0.2
Avg training accuracy: 0.893188
Avg validation accuracy 0.888313

p = 0.5
Avg training accuracy: 0.886562
Avg validation accuracy 0.883938

p = 0.9
Avg training accuracy: 0.885375
Avg validation accuracy 0.879375

Optimal p: 0.2
Best validation accuracy: 0.8883125000000001
