## 1. Import the necessary libraries

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from joblib import dump,load
import import_ipynb
from EDASentimentAnalysis import remove_punctuation,remove_stopword
from pyvi import ViTokenizer
from sklearn.model_selection import GridSearchCV

## 2 Data preprocessing

>Read csv

In [80]:
def ReadData(path):
    df=pd.read_csv(path,encoding='utf-8')
    return df['comment'], df['label']

> Execute function

In [81]:
X_train,y_train = ReadData("../Data/trainprocessed.csv")
X_test,y_test=ReadData("../Data/testprocesssed.csv")

In [82]:
print(X_test[128])

sản_phẩm sài tạm thích camera pin trâu còn đt sài ko biết bạn nào giống mình ko sài ứng_dụng dể một tối bấm hoài mới cài nhạc chuông điện_thoại hay mất


In [83]:
print(f"X_Train size:{X_train.shape}")
print(f"y_train size {y_train.shape}")

X_Train size:(7786,)
y_train size (7786,)


> word separation 

In [84]:
def wordseparation(comment):
   tokens = comment.split()
   return tokens

> Use tfidf to represent words

In [85]:
vectorizer = TfidfVectorizer(tokenizer=wordseparation)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [86]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(7786, 11616)
(2224, 11616)


In [87]:
print(X_train_tfidf[0])

  (0, 9525)	0.20532620334127655
  (0, 3432)	0.2770147570538441
  (0, 1374)	0.2080171949252728
  (0, 1203)	0.4410525799762897
  (0, 9463)	0.27953535743972613
  (0, 1187)	0.461233073219486
  (0, 5405)	0.09277368832992802
  (0, 2099)	0.2123478573430905
  (0, 5356)	0.3720550178477184
  (0, 2502)	0.2908060151838795
  (0, 4550)	0.20100766989156033
  (0, 7308)	0.18797274037615602


In [88]:
print(X_test_tfidf[0])

  (0, 11533)	0.22741057931285663
  (0, 10878)	0.23094620044084646
  (0, 10791)	0.22312866461732825
  (0, 10753)	0.3278406155349909
  (0, 10644)	0.19484094877670352
  (0, 10596)	0.18401389447426655
  (0, 10091)	0.1382244608091237
  (0, 9348)	0.21525059105692088
  (0, 9128)	0.11580924638570829
  (0, 8532)	0.3093888771965711
  (0, 8358)	0.12903997888174068
  (0, 7308)	0.07413489274185271
  (0, 6201)	0.10761996330810683
  (0, 6067)	0.11323791848667054
  (0, 5841)	0.16645299177195036
  (0, 5749)	0.16995094303752903
  (0, 5657)	0.10365659105344005
  (0, 5468)	0.1195616325876436
  (0, 5193)	0.19993193343106636
  (0, 4760)	0.1654365281419453
  (0, 4316)	0.12938894753143007
  (0, 4284)	0.187816094151225
  (0, 2755)	0.1770243163088608
  (0, 1985)	0.18894106265872246
  (0, 807)	0.26311639709111145
  (0, 801)	0.3278406155349909


## 3.Built Model KNN

### 3.1 Use k-folk technique to find the best set of parameters

In [89]:
param_grid = {'n_neighbors': np.random.randint(1, 101, size=100)}
print(param_grid)
grid_search = GridSearchCV(KNeighborsClassifier(metric='cosine'), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

{'n_neighbors': array([ 18,  41,  77,  78,  95,  86,   9,  34,  66,  37,  75,  67,  10,
        29,  53,  51,  96,  68,  40,  85,  56,  59,  27,  54,   7,   6,
        60,   8,  19, 100,  63,  41,  23,  32,  38,  81,  57,  97,  28,
         7,  67,  84,  54,  76,  51,  71,  50,   7,  44,  34,  77,  91,
        92,  41,  89,  64,  63,  55,  80,  50,  51,  21,  72,  20,  12,
        25,  80,  73,  78,  24,  78,  37,  44,   2,  66,  59,   3,  29,
        99,  43,  81,  65,  55,  83,  12,  96,  40,   7,   1,  79,  60,
         2,  21,   3,  30,  33,  72,  10,  61,  39])}
Best parameters: {'n_neighbors': 33}
Best cross-validation score: 0.7617520114963852


### 3.2 Model KNN

In [90]:
model_knn=KNeighborsClassifier(n_neighbors=33,metric='cosine')

### 3.3 Train KNN

In [91]:
model_knn.fit(X_train_tfidf,y_train)

> dump file knn_model_sentiment.pkl

In [92]:
dump(model_knn, '../Model/knn_model_sentiment.pkl')

['../Model/knn_model_sentiment.pkl']

### 3.4. Evaluating

In [93]:
model_knn_loaded=load('../Model/knn_model_sentiment.pkl')

> predicting a sample

> Function Datapreprocessing

In [94]:
def Proprocessing(comment):
    comment= remove_punctuation(comment.lower())
    comment= remove_stopword(comment)
    comment= [ViTokenizer.tokenize(comment)]
    comment=vectorizer.transform(comment)
    return comment

> Sample

In [95]:
comment="Máy thiết kế quá đẹp ,dùng dk mấy hôm r thấy máy vẫn ổn ,tiếc là pin tụt quá nhanh ,pin sạc thì nhanh nóng 😌"
comment=Proprocessing(comment)
print(model_knn_loaded.predict(comment))

['Positive']


> predict test dataset

In [96]:
result_predict=model_knn_loaded.predict(X_test_tfidf)

> Evaluate the model through Accuracy measures,Precision,Recall,F1-score,

In [97]:
accuracy=accuracy_score(y_test,result_predict)
print(f'Accuracy: {accuracy:.4f}')
precision=precision_score(y_test,result_predict,average='weighted')
print(f"Precision: {precision:.4f}")
recall=precision_score(y_test,result_predict,average='weighted')
print(f"Recall: {recall:.4f}")
f1score=f1_score(y_test,result_predict,average='weighted')
print(f"F1-score: {f1score}")

Accuracy: 0.7666
Precision: 0.7931
Recall: 0.7931
F1-score: 0.7260883665378702
