# KNN Supervised Learning Without BERT

In [1]:
%load_ext autoreload
%autoreload 2

from preprocessing import prepare_data

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

## Load Data

In [2]:
data = prepare_data()
data.head()

[nltk_data] Downloading package punkt to /Users/yoon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,category,rating,label,text
1,1,5.0,CG,love exclam well made sturdi comfort i love ex...
2,1,5.0,CG,love great upgrad origin i quotat mine coupl year
3,1,5.0,CG,thi pillow save back i love look feel pillow
4,1,1.0,CG,miss inform use great product price exclam i
5,1,5.0,CG,veri nice set good qualiti we set two month


# Training

In [3]:
data.describe(include='all')


Unnamed: 0,category,rating,label,text
count,40432.0,40432.0,40432,40432
unique,,5.0,2,40368
top,,5.0,CG,div quotat quotat input quotationhidden quotat...
freq,,24559.0,20216,7
mean,0.5,,,
std,0.500006,,,
min,0.0,,,
25%,0.0,,,
50%,0.5,,,
75%,1.0,,,


In [4]:
# Convert summary to a matrix of token counts

# cv = CountVectorizer()
cv = CountVectorizer(lowercase=True)
count_vector = cv.fit_transform(data['text'])

print(count_vector.shape)
count_vector.toarray()

(40432, 27065)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
# Splitting data
x_train, x_test, y_train, y_test = train_test_split(count_vector, data['category'], test_size=0.2)


In [6]:
clf_linear = SVC(kernel='linear')
clf_poly = SVC(kernel='poly')
clf_rbf = SVC(kernel='rbf')


In [10]:
clf_linear.fit(x_train, y_train)

In [11]:
clf_poly.fit(x_train, y_train)

In [12]:
clf_rbf.fit(x_train, y_train)

In [13]:
y_pred_linear = clf_linear.predict(x_test)
y_pred_poly = clf_poly.predict(x_test)
y_pred_rbf = clf_rbf.predict(x_test)


In [14]:
print("Accuracy (y_pred_linear):",metrics.accuracy_score(y_test, y_pred_linear))
print("Precision (y_pred_linear):",metrics.precision_score(y_test, y_pred_linear))
print("Recall (y_pred_linear):",metrics.recall_score(y_test, y_pred_linear))
print("F1 (y_pred_linear):",metrics.f1_score(y_test, y_pred_linear))
print('=============')
print("Accuracy (y_pred_poly):",metrics.accuracy_score(y_test, y_pred_poly))
print("Precision (y_pred_poly):",metrics.precision_score(y_test, y_pred_poly))
print("Recall (y_pred_poly):",metrics.recall_score(y_test, y_pred_poly))
print("F1 (y_pred_poly):",metrics.f1_score(y_test, y_pred_poly))
print('=============')
print("Accuracy (y_pred_rbf):",metrics.accuracy_score(y_test, y_pred_rbf))
print("Precision (y_pred_rbf):",metrics.precision_score(y_test, y_pred_rbf))
print("Recall (y_pred_rbf):",metrics.recall_score(y_test, y_pred_rbf))
print("F1 (y_pred_rbf):",metrics.f1_score(y_test, y_pred_rbf))


Accuracy (y_pred_linear): 0.8678125386422654
Precision (y_pred_linear): 0.8596195521309896
Recall (y_pred_linear): 0.8801775147928994
F1 (y_pred_linear): 0.8697770739432331
Accuracy (y_pred_poly): 0.6572276493137134
Precision (y_pred_poly): 0.9120667522464698
Recall (y_pred_poly): 0.35034516765285995
F1 (y_pred_poly): 0.5062344139650873
Accuracy (y_pred_rbf): 0.8644738469148016
Precision (y_pred_rbf): 0.8832211289487313
Recall (y_pred_rbf): 0.8409763313609467
F1 (y_pred_rbf): 0.8615812073755998


In [18]:
for reg in [0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]: 
    clf_linear = SVC(C = reg, kernel = 'linear')
    clf_linear.fit(x_train, y_train)
    y_pred_linear = clf_linear.predict(x_test)
    print('reg: ', reg)
    print("Accuracy (y_pred_linear):",metrics.accuracy_score(y_test, y_pred_linear))
    # print("Precision (y_pred_linear):",metrics.precision_score(y_test, y_pred_linear))
    # print("Recall (y_pred_linear):",metrics.recall_score(y_test, y_pred_linear))
    # print("F1 (y_pred_linear):",metrics.f1_score(y_test, y_pred_linear))
    print('=============')



reg:  2.0
Accuracy (y_pred_linear): 0.8645975021639669
Precision (y_pred_linear): 0.8532569792412312
Recall (y_pred_linear): 0.8816568047337278
F1 (y_pred_linear): 0.8672244452528193
