# CIS520 Group Project
## SVM
### Author: Hui Lyu

In [1]:
%matplotlib inline

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, grid_search
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.utils import resample
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from collections import OrderedDict



In [30]:
X_train = {}
X_test = {}
Y_train = {}
Y_test = {}

In [31]:
path_Xtrain = "/Users/huilyu/Documents/Penn CIS/CIS 520 Spring 2018/Project/word_pos_large/X_train/"  
path_Xtest = "/Users/huilyu/Documents/Penn CIS/CIS 520 Spring 2018/Project/word_pos_large/X_test/"
path_Ytrain = "/Users/huilyu/Documents/Penn CIS/CIS 520 Spring 2018/Project/word_pos_large/Y_train/"
path_Ytest = "/Users/huilyu/Documents/Penn CIS/CIS 520 Spring 2018/Project/word_pos_large/Y_test/"

In [32]:
# Reload upsampled dataset
all_words = os.listdir(path_Xtest)
all_words

['seem.txt', 'have.txt', 'result.txt', 'more.txt', 'be.txt', 'child.txt']

In [33]:
for word in all_words:
    X_train[word] = np.loadtxt(path_Xtrain+word, delimiter = ",")
    Y_train[word] = np.loadtxt(path_Ytrain+word, delimiter = ",")
    X_test[word] = np.loadtxt(path_Xtest+word, delimiter = ",")
    Y_test[word] = np.loadtxt(path_Ytest+word, delimiter = ",")

In [35]:
len(X_test)

6

In [37]:
# Cross-Validation for tuning parameters

Cs = [0.01, 0.1, 1, 10, 100]
gammas = [0.0001, 0.001, 0.01, 0.1, 1, 10]
degrees = [2, 3, 4, 5]

C_lin_dict = {}
C_rbf_dict = {}
C_poly_dict = {}
gamma_dict = {}
degree_dict = {}

param_grid_rbf = {'C': Cs, 'gamma' : gammas}
param_grid_poly = {'C': Cs, 'degree' : degrees}
param_grid = {'C': Cs}

# stratified k fold as default
grid_linear = grid_search.GridSearchCV(svm.SVC(kernel='linear'), param_grid, cv=3)
grid_rbf = grid_search.GridSearchCV(svm.SVC(kernel='rbf'), param_grid_rbf, cv=3)
grid_poly = grid_search.GridSearchCV(svm.SVC(kernel='poly'), param_grid_poly, cv=3)

In [38]:
# RBF SVM models
rbf_svc = {}

for word in all_words:
    grid_rbf.fit(X_train[word], Y_train[word])
    C_rbf_dict[word] = grid_rbf.best_params_['C']
    gamma_dict[word] = grid_rbf.best_params_['gamma']
    rbf_svc[word] = svm.SVC(kernel='rbf', C=C_rbf_dict[word], gamma=gamma_dict[word]).fit(X_train[word], Y_train[word])
#    rbf_svc[word] = svm.SVC(kernel='rbf', gamma=0.01).fit(trainX[word], trainY[word])

In [39]:
# Linear SVM models
lin_svc = {}

for word in all_words:
    grid_linear.fit(X_train[word], Y_train[word])
    C_lin_dict[word] = grid_linear.best_params_['C']
    lin_svc[word] = svm.SVC(kernel='linear', C=C_lin_dict[word]).fit(X_train[word], Y_train[word])
#    lin_svc[word] = svm.SVC(kernel='linear').fit(trainX[word], trainY[word])

In [40]:
# Poly degree SVM modesl
poly_svc = {}

for word in all_words:
    grid_poly.fit(X_train[word], Y_train[word])
    C_poly_dict[word] = grid_poly.best_params_['C']
    degree_dict[word] = grid_poly.best_params_['degree']
    poly_svc[word] = svm.SVC(kernel='poly', C=C_poly_dict[word], degree=degree_dict[word]).fit(X_train[word], Y_train[word])    
#    poly_svc[word] = svm.SVC(kernel='poly', degree=3).fit(trainX[word], trainY[word])

In [41]:
lin_accuracy = {}
lin_y_pred = {}
for word in all_words:
    lin_y_pred[word] = lin_svc[word].predict(X_test[word])
    lin_accuracy[word] = accuracy_score(Y_test[word], lin_y_pred[word])
lin_accuracy

{'seem.txt': 0.8983050847457628,
 'have.txt': 0.5573770491803278,
 'result.txt': 0.8235294117647058,
 'more.txt': 0.8045112781954887,
 'be.txt': 0.6524621212121212,
 'child.txt': 0.5952380952380952}

In [42]:
rbf_accuracy = {}
rbf_y_pred = {}
for word in all_words:
    rbf_y_pred[word] = rbf_svc[word].predict(X_test[word])
    rbf_accuracy[word] = accuracy_score(Y_test[word], rbf_y_pred[word])
rbf_accuracy

{'seem.txt': 0.8983050847457628,
 'have.txt': 0.5717213114754098,
 'result.txt': 0.7058823529411765,
 'more.txt': 0.7518796992481203,
 'be.txt': 0.6553030303030303,
 'child.txt': 0.7142857142857143}

In [43]:
poly_accuracy = {}
poly_y_pred = {}
for word in all_words:
    poly_y_pred[word] = poly_svc[word].predict(X_test[word])
    poly_accuracy[word] = accuracy_score(Y_test[word], poly_y_pred[word])
poly_accuracy

{'seem.txt': 0.8898305084745762,
 'have.txt': 0.5532786885245902,
 'result.txt': 0.7058823529411765,
 'more.txt': 0.7894736842105263,
 'be.txt': 0.6527777777777778,
 'child.txt': 0.7142857142857143}

In [44]:
lin_f1 = {}
rbf_f1 = {}
poly_f1 = {}
for word in all_words:
    lin_f1[word] = f1_score(Y_test[word], lin_y_pred[word], average='weighted')
    rbf_f1[word] = f1_score(Y_test[word], rbf_y_pred[word], average='weighted')
    poly_f1[word] = f1_score(Y_test[word], poly_y_pred[word], average='weighted')

  'precision', 'predicted', average, warn_for)


In [45]:
OrderedDict(sorted(lin_f1.items()))

OrderedDict([('be.txt', 0.5803507738580522),
             ('child.txt', 0.5575477154424523),
             ('have.txt', 0.4222914152438617),
             ('more.txt', 0.7378117009695957),
             ('result.txt', 0.8225140331940498),
             ('seem.txt', 0.8798493408662899)])

In [46]:
list(OrderedDict(sorted(lin_f1.items())).values())

[0.5803507738580522,
 0.5575477154424523,
 0.4222914152438617,
 0.7378117009695957,
 0.8225140331940498,
 0.8798493408662899]

In [47]:
path = "/Users/huilyu/Documents/Penn CIS/CIS 520 Spring 2018/Project/word_pos_large/"

np.savetxt(path + 'f1-LinearSVM.txt', np.array(list(OrderedDict(sorted(lin_f1.items())).values())), delimiter=',')

np.savetxt(path + 'f1-RBFSVM.txt', np.array(list(OrderedDict(sorted(rbf_f1.items())).values())), delimiter=',')

np.savetxt(path + 'f1-PolySVM.txt', np.array(list(OrderedDict(sorted(poly_f1.items())).values())), delimiter=',')

np.savetxt(path + 'accuracy-LinearSVM.txt', np.array(list(OrderedDict(sorted(lin_accuracy.items())).values())), delimiter=',')

np.savetxt(path + 'accuracy-RBFSVM.txt', np.array(list(OrderedDict(sorted(rbf_accuracy.items())).values())), delimiter=',')

np.savetxt(path + 'accuracy-PolySVM.txt', np.array(list(OrderedDict(sorted(poly_accuracy.items())).values())), delimiter=',')

In [48]:
OrderedDict(sorted(rbf_f1.items()))

OrderedDict([('be.txt', 0.5689688736904099),
             ('child.txt', 0.5952380952380951),
             ('have.txt', 0.4397614306443148),
             ('more.txt', 0.7340686274509804),
             ('result.txt', 0.653874883286648),
             ('seem.txt', 0.8798493408662899)])

In [49]:
OrderedDict(sorted(poly_f1.items()))

OrderedDict([('be.txt', 0.559531462918507),
             ('child.txt', 0.5952380952380951),
             ('have.txt', 0.41686499020292633),
             ('more.txt', 0.7191410959832013),
             ('result.txt', 0.6720042740803986),
             ('seem.txt', 0.8660223549579251)])

In [50]:
sum(lin_f1.values()) / len(lin_f1)

0.666727496595717

In [51]:
sum(rbf_f1.values()) / len(rbf_f1)

0.6452935418627895

In [52]:
sum(poly_f1.values()) / len(poly_f1)

0.6381337122301756

In [53]:
sum(rbf_accuracy.values()) / len(rbf_accuracy)

0.7162295321665356

In [54]:
sum(lin_accuracy.values()) / len(lin_accuracy)

0.7219038400560835

In [55]:
sum(poly_accuracy.values()) / len(poly_accuracy)

0.7175881210357269