## Import and load data

In [2]:
# Mount Google Drive at /content/drive (Colab-only); the dataset and the saved model live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!git clone -b master https://github.com/charles9n/bert-sklearn
!pip install bert-sklearn/.

In [None]:
import os
import math
import random
import csv
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model

In [None]:
# Project root on the mounted Drive. Other cells build paths as ROOT_PATH + '<relative path>',
# so the trailing slash is required.
ROOT_PATH = '/content/drive/My Drive/Colab Notebooks (1)/sentiment_analysis/'

In [5]:
# Read the training split (a UTF-16 encoded CSV), then show its dimensions and first rows.
train = pd.read_csv(os.path.join(ROOT_PATH, 'data/train_add_gen.csv'), encoding='utf-16')
print(train.shape)
train.head()

(10187, 2)


Unnamed: 0,content,ground
0,ƒê√¥i khi m√†n h√¨nh ch·∫°y ch·∫≠m...V√†o m·∫°ng nhanh n√≥...,0
1,D√πng t·ªët nh∆∞ng thi·∫øu 4g v√† b·ªô nh·ªõ h∆°i k√©m v√† m...,0
2,V√†o c√°c ·ª©ng d·ª•ng h·∫ßu nh∆∞ r·∫•t ch·∫≠m. Ch∆°i game n...,0
3,Gi·∫£m g·∫ßn 1 n·ªØa l√∫c mua. F9 ra c√≤n r·∫ª h∆°n f7 l√∫...,0
4,"ƒê√£ mua v√† s·ª≠ d·ª•ng ƒë∆∞·ª£c 2 th√°ng, r·∫•t tuy·ªát. X√†i...",1


In [None]:
# Tally how many training rows carry each value of `ground` to check class balance.
from collections import Counter
Counter(train.ground)

Counter({0: 5084, 1: 5103})

In [10]:
# Read the held-out test split (a UTF-16 encoded CSV), then show its dimensions and first rows.
test = pd.read_csv(os.path.join(ROOT_PATH, 'data/test.csv'), encoding='utf-16')
print(test.shape)
test.head()

(1552, 2)


Unnamed: 0,content,ground
0,S·∫£n ph·∫©m hay b·ªã ƒë∆°! D√πng pin 4g hao nhanh. H·ªó ...,0
1,"ƒê·∫πp nh·∫•t, c·∫•u h√¨nh cao nh·∫•t, pin tr√¢u nh·∫•t tro...",1
2,"Tr√™n c·∫£ tuy·ªát v·ªùi üòç pin tr√¢u, m∆∞·ª£t, sang tr·ªçng...",1
3,S·∫£n ph·∫©m t·∫ßm trung X√†i t·∫°m ·ªïn. N√≥i chung s·∫£n p...,0
4,oppo neo 7 th·∫≠t qu√° ƒë√£. T√≠nh nƒÉng c≈©ng nh∆∞ HƒêH...,1


## Process data

In [None]:
def text_normalize(df, tokenize=False):
    """Clean the ``content`` column of ``df`` in place.

    Steps, in order: lowercase the text; replace every token that contains a
    digit with a space; replace punctuation with spaces; collapse runs of
    whitespace into single spaces; optionally word-segment the result.

    Args:
        df: DataFrame with a text column named ``content``. The column is
            overwritten once all steps have succeeded; missing values stay
            missing.
        tokenize: When True, pass every non-missing row through
            ``ViTokenizer.tokenize``. ``ViTokenizer`` is not imported in the
            visible part of this notebook, so it must be brought into scope
            before this option is used.

    Returns:
        None. ``df`` is modified in place.
    """
    text = df['content'].str.lower()
    # regex=True is spelled out because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently skip both removals.
    # Remove numbers and words that contain numbers.
    text = text.str.replace(r'\w*\d\w*', ' ', regex=True)
    # Remove punctuation (anything that is neither a word character nor whitespace).
    text = text.str.replace(r'[^\w\s]', ' ', regex=True)
    # Collapse whitespace; .str.join keeps missing values as NaN instead of
    # raising the way ' '.join would on a non-list value.
    text = text.str.split().str.join(' ')
    if tokenize:
        text = text.map(ViTokenizer.tokenize, na_action='ignore')
    # Single assignment at the end so a failure above leaves df untouched.
    df['content'] = text

In [12]:
# Clean both splits (text_normalize rewrites the `content` column of the frame it is given),
# then spot-check the last rows of the test split.
for split in (train, test):
    text_normalize(split)
test.tail()

Unnamed: 0,content,ground
1547,m√°y m√¨nh d√πng m·ªôt th·ªùi gian ng·∫Øn nh∆∞ng ch·∫≠m s·ª≠...,0
1548,cho ƒë·∫øn h√¥m nay m√°y v·∫´n b·ªã lo·∫°n c·∫£m ·ª©ng c·∫≠p nh...,0
1549,ƒë·∫πp chu·∫©n s√†i r·∫•t √™m t√¥i r·∫•t h√†i l√≤ng v·ªÅ d√≤ng ...,1
1550,m·∫∑c d√π mua m√°y c≈© nh∆∞ng tr√¥ng nh∆∞ m√°y m·ªõi m·ªçi ...,1
1551,m√°y ch·∫°y t·ªët m√¨nh h√†i l√≤ng mua t·ª´ th√°ng nƒÉm t·ªõ...,0


In [None]:
# Model inputs are the text in `content`; targets are the labels in `ground`.
X_train, y_train = train['content'], train['ground']
X_test, y_test = test['content'], test['ground']

## BERT model

In [None]:
# Configure the bert-sklearn classifier on the multilingual cased BERT checkpoint.
# Only sequence length, batch size and epoch count are set here; every other option keeps its default.
# NOTE(review): text_normalize lowercases the text, yet this is the *cased* checkpoint — confirm that is intended.
model = BertClassifier(max_seq_length=128,
                       train_batch_size=32,
                       epochs=5,
                       bert_model='bert-base-multilingual-cased')
# Bare expression so the notebook displays the full parameter set.
model

Building sklearn text classifier...


BertClassifier(bert_config_json=None, bert_model='bert-base-multilingual-cased',
               bert_vocab=None, do_lower_case=None, epochs=5, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=None, learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
               random_state=42, restore_file=None, train_batch_size=32,
               use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)

## Train

In [None]:
%%time
# Fine-tune on the training split. The %%time cell magic has to stay on the first line of the cell.
# NOTE(review): what fit() returns is not visible here — confirm `history` holds what its name suggests.
history = model.fit(X_train, y_train)

Loading bert-base-multilingual-cased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 9169, validation data size: 1018



HBox(children=(IntProgress(value=0, description='Training  ', max=287, style=ProgressStyle(description_width='‚Ä¶

HBox(children=(IntProgress(value=0, description='Validating', max=128, style=ProgressStyle(description_width='‚Ä¶


Epoch 1, Train loss: 0.3272, Val loss: 0.2379, Val accy: 89.78%



HBox(children=(IntProgress(value=0, description='Training  ', max=287, style=ProgressStyle(description_width='‚Ä¶

HBox(children=(IntProgress(value=0, description='Validating', max=128, style=ProgressStyle(description_width='‚Ä¶


Epoch 2, Train loss: 0.1736, Val loss: 0.2120, Val accy: 91.55%



HBox(children=(IntProgress(value=0, description='Training  ', max=287, style=ProgressStyle(description_width='‚Ä¶

HBox(children=(IntProgress(value=0, description='Validating', max=128, style=ProgressStyle(description_width='‚Ä¶


Epoch 3, Train loss: 0.0955, Val loss: 0.2797, Val accy: 91.36%



HBox(children=(IntProgress(value=0, description='Training  ', max=287, style=ProgressStyle(description_width='‚Ä¶

HBox(children=(IntProgress(value=0, description='Validating', max=128, style=ProgressStyle(description_width='‚Ä¶


Epoch 4, Train loss: 0.0536, Val loss: 0.2666, Val accy: 93.03%



HBox(children=(IntProgress(value=0, description='Training  ', max=287, style=ProgressStyle(description_width='‚Ä¶

HBox(children=(IntProgress(value=0, description='Validating', max=128, style=ProgressStyle(description_width='‚Ä¶


Epoch 5, Train loss: 0.0286, Val loss: 0.3034, Val accy: 93.03%

CPU times: user 9min 14s, sys: 3min 55s, total: 13min 10s
Wall time: 13min 29s


## Test

In [None]:
from tqdm import tqdm

# Evaluate the fine-tuned model on the held-out test split.
# score() runs its own pass over the test data; its return value is kept in `accy`.
accy = model.score(X_test, y_test)

# Class probability estimates for the test examples.
y_prob = model.predict_proba(X_test)
print("class prob estimates:\n", y_prob)

# Hard label predictions.
y_pred = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order, matching the
# classification_report call below.
print("Accuracy: %0.2f%%"%(metrics.accuracy_score(y_test, y_pred) * 100))

# Display names for the two labels in the report.
# NOTE(review): assumes label 0 is negative and label 1 is positive — confirm against the dataset.
target_names = ['negative', 'positive']
print(classification_report(y_test, y_pred, target_names=target_names))

HBox(children=(IntProgress(value=0, description='Testing', max=194, style=ProgressStyle(description_width='ini‚Ä¶


Loss: 0.4273, Accuracy: 89.43%


HBox(children=(IntProgress(value=0, description='Predicting', max=194, style=ProgressStyle(description_width='‚Ä¶

class prob estimates:
 [[9.7782636e-01 2.2173658e-02]
 [8.9391734e-04 9.9910611e-01]
 [1.1498650e-03 9.9885011e-01]
 ...
 [9.5372571e-04 9.9904627e-01]
 [3.0348129e-03 9.9696523e-01]
 [3.2398552e-03 9.9676019e-01]]


HBox(children=(IntProgress(value=0, description='Predicting', max=194, style=ProgressStyle(description_width='‚Ä¶

Accuracy: 89.43%
              precision    recall  f1-score   support

    negative       0.91      0.87      0.89       758
    positive       0.88      0.92      0.90       794

    accuracy                           0.89      1552
   macro avg       0.90      0.89      0.89      1552
weighted avg       0.89      0.89      0.89      1552



In [None]:
# Peek at the first ten predicted labels.
y_pred[:10]

array([0, 1, 1, 1, 1, 0, 1, 0, 0, 1])

In [None]:
# First ten true test labels, shown as a plain array.
y_test[:10].values

array([0, 1, 1, 0, 1, 0, 1, 0, 0, 1])

In [None]:
# Show true label, predicted label and text for the first ten test rows.
# .iloc selects by position, so this works whatever index the Series carry
# (plain y_test[i] is a label lookup and only works with a default 0..n-1 index);
# zip also stops cleanly when there are fewer than ten rows.
print('Ground\tPred\tText')
for truth, pred, text in zip(y_test.iloc[:10], y_pred[:10], X_test.iloc[:10]):
    print(str(truth) + '\t' + str(pred) + '\t' + text)

Ground	Pred	Text
0	0	s·∫£n ph·∫©m hay b·ªã ƒë∆° d√πng pin hao nhanh h·ªó tr·ª£ s·∫°c nhanh ko th·∫≠t s·ª± hi·ªáu qu·∫£ ch·ª•p ·∫£nh ch·ªâ d·ª´ng ·ªü m·ª©c ch·∫≠p nh·∫≠n ƒëc ko lung linh nh∆∞ h√£ng kh√°c c√πng t·∫ßm gi√°
1	1	ƒë·∫πp nh·∫•t c·∫•u h√¨nh cao nh·∫•t pin tr√¢u nh·∫•t trong t·∫ßm gi√° r·∫•t h·ª£p v·ªõi m·ªôt ki·∫øn tr√∫c s∆∞
1	1	tr√™n c·∫£ tuy·ªát v·ªùi pin tr√¢u m∆∞·ª£t sang tr·ªçng kh√¥ng r·ªùi em n√≥ n·ª≠a b∆∞·ªõc c√¢y b√∫t d·ªÖ d√πng
0	1	s·∫£n ph·∫©m t·∫ßm trung x√†i t·∫°m ·ªïn n√≥i chung s·∫£n ph√¢m b√¨nh th∆∞·ªùng kh√¥ng c√≥ ∆∞u ƒëi·ªÉm n·ªïi b√¢t
1	1	oppo neo th·∫≠t qu√° ƒë√£ t√≠nh nƒÉng c≈©ng nh∆∞ hƒëh ph√π h·ª£p v·ªõi gi·ªõi tr·∫ª hi·ªán nay nh∆∞ng gi√° c·∫£ n√™ gi·∫£m l·∫°i m·ªôt ch√∫t n·ªØa th√¨ b√°n ch·∫°y h∆°n nhi·ªÅu
0	0	l√∫c ƒë·∫ßu m√¨nh s·ª≠ d·ª•ng c≈©ng hay b·ªã l√µi nh∆∞ng khi c·∫≠p nh·∫≠t ph·∫ßn m·ªÅm v√° l·ªói th√¨ ok v·ªõi m·ª©c gi√° t√¥i kh√¥ng th√≠ch gh√©t c√°i s√≥ng m·∫°ng r·∫•t y·∫øu nh·∫•t l√† c√°i ph·∫ßn m·ªÅm n√¢ng c·∫•p n√≥ l√†m m·∫•t ƒëi ch·ª©c nƒ

## Save and reload model

In [None]:
# Destination on Drive for the saved model; relies on ROOT_PATH ending with a slash.
savefile = ROOT_PATH + 'resource/BERT_0102.bin'

In [None]:
# Persist the fine-tuned model to `savefile` on the mounted Drive.
model.save(savefile)

In [14]:
# Reload the saved model from `savefile` and score it on the test split as a round-trip check.
BERT = load_model(savefile)

BERT.score(X_test, y_test)

Loading model from /content/drive/My Drive/Colab Notebooks (1)/sentiment_analysis/resource/BERT_0102.bin...
Defaulting to linear classifier/regressor
Building sklearn text classifier...


HBox(children=(IntProgress(value=0, description='Testing', max=194, style=ProgressStyle(description_width='ini‚Ä¶



Loss: 0.4273, Accuracy: 89.43%


89.43298969072166