In [1]:
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn import svm

import MLutils as ml

In [2]:
# Load data
# Feature names applied positionally to the CSV columns (the file is read with
# header=None, so it carries no header row of its own).
columns = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our',
           'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order',
           'word_freq_mail', 'word_freq_receive', 'word_freq_will', 'word_freq_people',
           'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business',
           'word_freq_email', 'word_freq_you', 'word_freq_credit', 'word_freq_your',
           'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp', 'word_freq_hpl',
           'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs',
           'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
           'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm',
           'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original',
           'word_freq_project', 'word_freq_re', 'word_freq_edu', 'word_freq_table',
           'word_freq_conference', 'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!',
           'char_freq_$', 'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest',
           'capital_run_length_total']

# Parsed with read_csv's default comma separator. The previous
# `delim_whitespace=False` argument only restated the default and is a
# deprecated keyword in recent pandas releases, so it is omitted.
# The trailing column is named 'spam' and is used as the class label downstream.
spambase = pd.read_csv('spambase.data', header=None, names=columns + ['spam'])

In [3]:
# Display data
# Rich-render the loaded frame as a visual sanity check of the column names
# and values; this runs before the normalization cell rewrites the features.
display(spambase)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [4]:
# Normalize data without labels
# Every column except the trailing 'spam' label is treated as a feature.
# The features are selected and written back by column label so that pandas
# replaces the columns outright: assigning float results through
# `.iloc[:, :-1]` tries to keep each column's existing dtype and triggers the
# incompatible-dtype FutureWarning on the integer-typed features.
# Only element 0 of ml.normalize's return value is used here.
# NOTE(review): presumably ml.normalize derives its scaling statistics from the
# rows it is given; it is applied to the full data set before the train/test
# split, so test rows would influence the scaling — confirm this is intended.
feature_columns = spambase.columns[:-1]
spambase[feature_columns] = ml.normalize(spambase[feature_columns])[0]

In [5]:
# Hold out 20% of the rows as a test set with scikit-learn's splitter;
# the fixed random_state keeps the partition reproducible between runs.
spam_train, spam_test = model_selection.train_test_split(
    spambase,
    test_size=0.2,
    random_state=0,
)

# Problem 1

## sklearn SVM Spambase

In [6]:
# Fit one SVC per kernel on the training split and print its test-split metrics.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# The last column is the label; everything before it is a feature.
train_features, train_labels = spam_train.iloc[:, :-1], spam_train.iloc[:, -1]
test_features, test_labels = spam_test.iloc[:, :-1], spam_test.iloc[:, -1]

# (printed prefix, scoring function) pairs, in the order they are reported.
scorers = (
    ('Accuracy: ', metrics.accuracy_score),
    ('Precision: ', metrics.precision_score),
    ('Recall: ', metrics.recall_score),
    ('F1: ', metrics.f1_score),
)

for kernel in kernels:
    clf = svm.SVC(kernel=kernel).fit(train_features, train_labels)
    y_pred = clf.predict(test_features)
    print('Kernel: ' + kernel)
    for prefix, scorer in scorers:
        print(prefix + str(scorer(test_labels, y_pred)))
    print('Confusion Matrix:')
    print(metrics.confusion_matrix(test_labels, y_pred))
    print()

Kernel: linear
Accuracy: 0.9120521172638436
Precision: 0.9081081081081082
Recall: 0.8772845953002611
F1: 0.8924302788844622
Confusion Matrix:
[[504  34]
 [ 47 336]]
Kernel: poly
Accuracy: 0.7752442996742671
Precision: 0.9536082474226805
Recall: 0.4830287206266319
F1: 0.6412478336221837
Confusion Matrix:
[[529   9]
 [198 185]]
Kernel: rbf
Accuracy: 0.9261672095548317
Precision: 0.9315068493150684
Recall: 0.8877284595300261
F1: 0.9090909090909091
Confusion Matrix:
[[513  25]
 [ 43 340]]

Kernel: sigmoid
Accuracy: 0.8794788273615635
Precision: 0.8636363636363636
Recall: 0.8433420365535248
F1: 0.8533685601056803
Confusion Matrix:
[[487  51]
 [ 60 323]]


## sklearn SVM Digits

In [2]:
# Load data
def _read_headerless_csv(path):
    """Read a header-less CSV into a DataFrame and print its shape."""
    frame = pd.read_csv(path, header=None)
    print(frame.shape)
    return frame


training_image = _read_headerless_csv('training_image.txt')
training_label = _read_headerless_csv('training_label.txt')
testing_image = _read_headerless_csv('testing_image.txt')
testing_label = _read_headerless_csv('testing_label.txt')

# Attach each label column to the right of its feature columns.
combined_training = pd.concat([training_image, training_label], axis='columns')
combined_testing = pd.concat([testing_image, testing_label], axis='columns')

# Stack the training rows on top of the testing rows.
# NOTE(review): each part keeps its own row index, so index values repeat in
# `digits` — confirm downstream code selects rows by position, not by label.
digits = pd.concat([combined_training, combined_testing], axis='index')

(60000, 200)
(60000, 1)
(10000, 200)
(10000, 1)


In [3]:
# Display data
# Rich-render the combined frame (training rows followed by testing rows, with
# the label as the last column) as a visual sanity check.
display(digits)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,0.1
0,0.0,0.0,-1.0,-1.0,1.0,-29.0,-10.0,-2.0,0.0,0.0,...,-9.0,0.0,0.0,0.0,0.0,8.0,-10.0,-7.0,-5.0,5
1,0.0,0.0,-1.0,-1.0,-4.0,-24.0,-8.0,-2.0,0.0,0.0,...,-2.0,-1.0,-1.0,0.0,0.0,-7.0,-1.0,4.0,8.0,0
2,0.0,0.0,-6.0,-6.0,13.0,-37.0,-1.0,5.0,0.0,0.0,...,15.0,-8.0,0.0,-2.0,0.0,16.0,6.0,14.0,2.0,4
3,0.0,0.0,0.0,0.0,-24.0,-20.0,0.0,0.0,0.0,0.0,...,-7.0,1.0,5.0,0.0,0.0,-20.0,-6.0,-12.0,-4.0,1
4,0.0,0.0,2.0,-16.0,-14.0,-24.0,2.0,-4.0,0.0,0.0,...,-13.0,-6.0,4.0,1.0,-1.0,-14.0,-6.0,4.0,-8.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,11.0,-39.0,-9.0,-1.0,0.0,0.0,...,-6.0,1.0,3.0,0.0,0.0,-4.0,-12.0,5.0,-13.0,2
9996,0.0,0.0,-4.0,-4.0,39.0,-41.0,-9.0,-1.0,0.0,0.0,...,-9.0,0.0,0.0,0.0,0.0,29.0,-7.0,17.0,-5.0,3
9997,0.0,0.0,-2.0,-4.0,-4.0,-38.0,-4.0,-6.0,-4.0,2.0,...,-2.0,-3.0,5.0,0.0,0.0,-1.0,1.0,12.0,-8.0,4
9998,0.0,0.0,-2.0,-2.0,-23.0,-1.0,-6.0,-4.0,0.0,0.0,...,-12.0,0.0,2.0,0.0,0.0,-8.0,-4.0,12.0,0.0,5


In [4]:
# Train a linear-kernel SVC on the digit training features and evaluate it on
# the separate testing files.
# The label files load as single-column DataFrames (shape (n, 1)). Flatten them
# to 1-D so SVC.fit receives y of shape (n_samples,); passing the column vector
# makes scikit-learn emit a DataConversionWarning.
y_train = training_label.to_numpy().ravel()
y_test = testing_label.to_numpy().ravel()

clf = svm.SVC(kernel='linear')
clf.fit(training_image, y_train)
y_pred = clf.predict(testing_image)
print('Kernel: linear')
print('Accuracy: ' + str(metrics.accuracy_score(y_test, y_pred)))
# Ten digit classes: macro-average gives every class equal weight.
print('Precision: ' + str(metrics.precision_score(y_test, y_pred, average='macro')))
print('Recall: ' + str(metrics.recall_score(y_test, y_pred, average='macro')))
print('F1: ' + str(metrics.f1_score(y_test, y_pred, average='macro')))
print('Confusion Matrix:')
print(metrics.confusion_matrix(y_test, y_pred))

  y = column_or_1d(y, warn=True)


Kernel: linear
Accuracy: 0.9343
Precision: 0.9338497397132699
Recall: 0.9332721073664768
F1: 0.933444419508511
Confusion Matrix:
[[ 952    0    0    3    0   11    8    1    3    2]
 [   0 1114    2    1    0    2    4    0   12    0]
 [   7    6  965    9    6    3    9    7   18    2]
 [   4    5   14  936    1   17    1    5   19    8]
 [   3    2   10    0  923    1    6    6    7   24]
 [  14    4    6   40    7  792    8    0   16    5]
 [  12    4   15    2    6   11  904    1    3    0]
 [   1    4   21   11   11    3    0  953    2   22]
 [   6    8   13   21    9   17    4    4  887    5]
 [   9    7    2   14   29    5    0   20    6  917]]


# Problem 2

## SVM Spambase

In [6]:
# Instantiate the SVM from the project-local MLutils module with no constructor
# arguments; this rebinds `clf`, replacing any earlier classifier.
clf = ml.SVM()

### Accuracy

In [9]:
# Cross-validate the SVM on the full Spambase frame and report the returned value.
spambase_cv_score = clf.cross_validate(spambase)
print('Accuracy: ' + str(spambase_cv_score))

Accuracy: 0.8723913043478262


# Problem 3

## SVM Digits

In [12]:
# Start from a fresh MLutils SVM (no constructor arguments) for the digits
# experiment; this rebinds `clf`.
clf = ml.SVM()

### Accuracy

In [13]:
# Run the SVM's `ovr` routine on the combined digits frame, then report the
# value it returns.
digits_ovr_score = clf.ovr(digits)
print('Total Accuracy: ' + str(digits_ovr_score))

Class 0 Accuracy: 0.9847142857142858
Class 1 Accuracy: 0.9814285714285714
Class 2 Accuracy: 0.9514285714285714
Class 3 Accuracy: 0.9561428571428572
Class 4 Accuracy: 0.9577142857142857
Class 5 Accuracy: 0.894
Class 6 Accuracy: 0.9158571428571428
Class 7 Accuracy: 0.8681428571428571
Class 8 Accuracy: 0.9068571428571428
Class 9 Accuracy: 0.93
Total Accuracy: 0.9346285714285715


# Problem 5

In [6]:
# Instantiate the KNN classifier from the project-local MLutils module with no
# constructor arguments; this rebinds `clf`.
clf = ml.KNN()

## Accuracy

In [7]:
# Fit on the Spambase training split (features and label in one frame), then
# score the held-out split.
clf.fit(spam_train)
knn_test_accuracy = clf.accuracy(spam_test)
print('Accuracy: ' + str(knn_test_accuracy))

Accuracy: 0.9109663409337676
