In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the project files are read from there.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder on the mounted Drive the working directory, so the
# relative 'data/...' paths used when loading the CSVs resolve against it.
%cd '/content/drive/MyDrive/project'

/content/drive/MyDrive/project


In [None]:
# List the contents of the current working directory.
!ls

BML_giuaky.pptx  dataword_model.save   naivebayes.sav
data		 naivebayes_foody.sav  SentimentAnalysis_NB.ipynb
data_test	 naive_bayes.ipynb     wiki.vi.model.bin.gz


Load dữ liệu train và test (đã được chia sẵn thành hai file CSV)

In [None]:
import pandas as pd

# The train and test sets are stored as two separate CSV files.
df_train = pd.read_csv('data/train_foody_processed.csv')
df_test = pd.read_csv('data/test_foody_processed.csv')

# Inputs are the 'comment' column (kept as a Series); targets are the 'label'
# column converted to a NumPy array.
X_train, y_train = df_train['comment'], df_train['label'].values
X_test, y_test = df_test['comment'], df_test['label'].values

print('Training samples:',X_train.shape)
print(y_train.shape)
print('Testing samples:',X_test.shape)
print(y_test.shape)

# Every comment from both sets in one plain Python list (train first, then test).
data = X_train.tolist() + X_test.tolist()
print(len(data))

Training samples: (40000,)
(40000,)
Testing samples: (10000,)
(10000,)
50000


In [None]:
import numpy as np 
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import metrics 
from scipy import io
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler

Trích xuất đặc trưng bằng TF-IDF (TfidfVectorizer) và phân lớp bằng Multinomial Naive Bayes

In [None]:
# Word n-gram features (1- to 5-grams). An n-gram is kept only if it appears in at
# least 5 training documents and in no more than 50% of them.
# NOTE(review): use_idf=False switches IDF re-weighting off, so smooth_idf has no
# effect here and the features are sublinear term frequencies with L2 row
# normalisation - confirm this is intended for a section labelled "Tfidf".
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,5),
    max_df=0.5,
    min_df=5,
    smooth_idf=True,
    sublinear_tf=True,
    norm='l2',
    use_idf=False,
)

# Learn the vocabulary on the training comments only, then reuse it for the test set.
X = tfidf_vectorizer.fit_transform(X_train)
X_test_new = tfidf_vectorizer.transform(X_test)

# Multinomial Naive Bayes with smoothing parameter alpha=0.01; fitted in the next cell.
estimator = MultinomialNB(alpha=0.01)

In [None]:
# Train on the vectorised training comments and predict the held-out test set.
estimator.fit(X, y_train)
y_pred = estimator.predict(X_test_new)

# (label, scorer, extra keyword arguments) - evaluated and printed one row at a time.
# Precision is called without an `average` argument, so it uses scikit-learn's default.
tfidf_scorers = [
    ("Accuracy: ", metrics.accuracy_score, {}),
    ("Precision: ", metrics.precision_score, {}),
    ("Recall (macro): ", metrics.recall_score, {'average': 'macro'}),
    ("Recall(micro): ", metrics.recall_score, {'average': 'micro'}),
    ("F1-scores(macro): ", metrics.f1_score, {'average': 'macro'}),
    ("F1-scores(micro): ", metrics.f1_score, {'average': 'micro'}),
]
for metric_label, scorer, scorer_kwargs in tfidf_scorers:
    # Scores are fractions in [0, 1]; report them as percentages.
    print(metric_label, 100*scorer(y_test, y_pred, **scorer_kwargs))

Accuracy:  89.35
Precision:  87.21392093815018
Recall (macro):  89.35
Recall(micro):  89.35
F1-scores(macro):  89.34122046988884
F1-scores(micro):  89.35


Save model bằng pickle

In [None]:
import pickle

# Persist the fitted classifier, then reload it and re-score the test set as a
# round-trip sanity check.
# NOTE: only the classifier is saved here. The fitted tfidf_vectorizer is not, so it
# must be persisted separately before the saved model can score raw text.
filename = '/content/drive/MyDrive/project/tfidf.sav'

# Context managers guarantee the file is flushed and closed, even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(estimator, model_file)

# SECURITY: unpickling executes arbitrary code - only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

y_pred = loaded_model.predict(X_test_new)
print("Accuracy: ", 100*metrics.accuracy_score(y_test, y_pred))

Accuracy:  89.35


Trích xuất đặc trưng bằng BOW và phân lớp Multinomial Naive Bayes

In [None]:
# Bag-of-words features: raw counts of word 1- to 5-grams. An n-gram is kept only if
# it appears in at least 5 training documents and in no more than 50% of them.
vectorizer = CountVectorizer(ngram_range=(1,5), max_df=0.5, min_df=5)
X_train_new = vectorizer.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs; list() keeps
# `vocab` a plain list on every version.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# NOTE: this rebinds X_test_new, which until now held the TfidfVectorizer test matrix.
X_test_new = vectorizer.transform(X_test)
print('Vocab size:',len(vocab))

Vocab size: 178138


In [None]:
# Multinomial Naive Bayes on the bag-of-words counts, smoothing parameter alpha=0.01.
clf = MultinomialNB(alpha=0.01)
clf.fit(X_train_new, y_train)
y_pred = clf.predict(X_test_new)

# (label, scorer, extra keyword arguments) - evaluated and printed one row at a time.
# Precision is called without an `average` argument, so it uses scikit-learn's default.
bow_scorers = (
    ("Accuracy: ", metrics.accuracy_score, {}),
    ("Precision: ", metrics.precision_score, {}),
    ("Recall (macro): ", metrics.recall_score, dict(average='macro')),
    ("Recall(micro): ", metrics.recall_score, dict(average='micro')),
    ("F1-scores(macro): ", metrics.f1_score, dict(average='macro')),
    ("F1-scores(micro): ", metrics.f1_score, dict(average='micro')),
)
for metric_label, scorer, scorer_kwargs in bow_scorers:
    # Scores are fractions in [0, 1]; report them as percentages.
    print(metric_label, 100*scorer(y_test, y_pred, **scorer_kwargs))

Accuracy:  88.86
Precision:  85.95484826054775
Recall (macro):  88.86
Recall(micro):  88.86
F1-scores(macro):  88.84178801272284
F1-scores(micro):  88.86


Save model bằng pickle

In [None]:
# Persist the fitted bag-of-words classifier, then reload it and re-score the test
# set as a round-trip sanity check.
# NOTE: only the classifier is saved here. The fitted CountVectorizer is not, so it
# must be persisted separately before the saved model can score raw text.
filename = '/content/drive/MyDrive/project/bow.sav'

# Context managers guarantee the file is flushed and closed, even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

# SECURITY: unpickling executes arbitrary code - only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

y_pred = loaded_model.predict(X_test_new)
print("Accuracy: ", 100*metrics.accuracy_score(y_test, y_pred))

Accuracy:  88.86


Tìm tham số phù hợp bằng GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Pipeline: token counts -> TF-IDF re-weighting -> Multinomial Naive Bayes.
text_clf = Pipeline([('vect', CountVectorizer(max_df=0.5, min_df=5)), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

# Baseline fit with the default hyper-parameters. GridSearchCV clones the estimator
# it is given, so the search below does not depend on this fit.
text_clf = text_clf.fit(X_train, y_train)

# Pipeline parameters must be addressed as '<step name>__<parameter>'. The bare key
# 'ngram_range' is not a parameter of the Pipeline itself and made the search raise
# ValueError, so it is routed to the 'vect' step here.
parameters = {
    'vect__ngram_range': [(1, 2), (1, 5)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# n_jobs=-1 runs the cross-validated search on all available CPU cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)







ValueError: ignored

In [None]:
# A notebook cell echoes only its final expression, so a bare `gs_clf.best_score_`
# on its own line was silently discarded. Return both values as one tuple instead:
# (best mean cross-validated score, parameter setting that achieved it).
gs_clf.best_score_, gs_clf.best_params_
