In [12]:
import numpy as np
import pandas as pd
import torch
import re
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from pyvi import ViTokenizer, ViPosTagger
from statistics import mean
from sklearn.metrics import classification_report, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer

# Fetch the tokenizer and the encoder from the same "vinai/phobert-base" checkpoint
# so that token ids and embedding weights match.
PhobertTokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="vinai/phobert-base"
)
model = AutoModel.from_pretrained(
    pretrained_model_name_or_path="vinai/phobert-base"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Load the two corpus variants (presumably with / without stop words, judging by the
# file names) and work on deep copies so the frames read from disk stay untouched.
data_df_sw = pd.read_csv(filepath_or_buffer="./data/data_have_sw.csv")
data_df_no_sw = pd.read_csv(filepath_or_buffer="./data/data_no_sw.csv")
data_hs_sw_copy = data_df_sw.copy(deep=True)
data_df_no_sw_copy = data_df_no_sw.copy(deep=True)

In [13]:
# Register tqdm's pandas integration; this provides the `progress_apply` method
# that `to_vector` calls on a Series.
tqdm.pandas()

In [38]:
from tqdm import tqdm
def extract_features(s):
    """Embed one text chunk with PhoBERT and return its pooled vector.

    Parameters
    ----------
    s : str
        Text chunk to encode.

    Returns
    -------
    numpy.ndarray
        The model's ``pooler_output`` for the first (and only) sequence in the
        batch, converted to a NumPy array.
    """
    # State the truncation limit explicitly instead of relying on whatever
    # `model_max_length` the loaded tokenizer happens to carry: PhoBERT is documented
    # as supporting at most 256 sub-word tokens, and longer inputs would index past
    # the position-embedding table.
    inputs = PhobertTokenizer(
        s, padding=True, truncation=True, max_length=256, return_tensors="pt"
    )
    with torch.no_grad():
        # Only the pooled output is consumed, so per-layer hidden states are not
        # requested (the original asked for them and then discarded them).
        outputs = model(**inputs, return_dict=True)
    return outputs.pooler_output[0].numpy()
def split_content(list_tokens, chunk_size):
    """Group tokens into consecutive chunks and join each chunk with single spaces.

    Parameters
    ----------
    list_tokens : list of str
        Tokens in document order.
    chunk_size : int
        Maximum number of tokens per chunk; the last chunk may be shorter.

    Returns
    -------
    list of str
        One space-joined string per chunk (empty list for empty input).
    """
    starts = range(0, len(list_tokens), chunk_size)
    return [" ".join(list_tokens[start:start + chunk_size]) for start in starts]

def to_vector(data_frame, col1, col2):
    """Add a column holding the mean chunk embedding of every row.

    For each row, every element of ``data_frame[col1]`` is passed through
    ``extract_features`` and the resulting vectors are averaged element-wise.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to annotate; it is modified in place.
    col1 : str
        Name of the column containing an iterable of text chunks per row.
    col2 : str
        Name of the column that receives the averaged vector.

    Returns
    -------
    pandas.DataFrame
        The same ``data_frame`` object, now containing ``col2``.
    """
    def _mean_embedding(chunks):
        # One embedding per chunk, then average across chunks (axis 0).
        chunk_vectors = np.array([extract_features(chunk) for chunk in chunks])
        return np.mean(chunk_vectors, axis=0)

    # `progress_apply` requires `tqdm.pandas()` to have been called beforehand.
    data_frame[col2] = data_frame[col1].progress_apply(_mean_embedding)
    return data_frame

In [41]:
# Split every document into space-joined chunks of at most 200 tokens so that each
# chunk can be embedded separately. str() guards against non-string cells (e.g. NaN
# read from an empty CSV field), the same way the no-stop-word frame is handled.
data_hs_sw_copy["content_split"] = data_hs_sw_copy["token"].apply(
    lambda x: split_content(str(x).split(" "), 200)
)


In [45]:
data_copy_rm_sw["content_split"] = data_copy_rm_sw["token"].apply(lambda x : split_content(str(x).split(" "), 200))

In [48]:
data_embedding1 = to_vector(data_hs_sw_copy, 'content_split', 'vector')
data_embedding2 = to_vector(data_copy_rm_sw, 'content_split', 'vector')

100%|██████████| 29776/29776 [1:06:32<00:00,  7.46it/s]
100%|██████████| 29771/29771 [50:23<00:00,  9.85it/s]


In [49]:
# Stack the per-document vectors of each frame into one array and persist it.
# NOTE(review): the file names end with a space, so they do not end in ".npy" and
# NumPy appends the extension again; the load cell reads "...npy .npy" accordingly.
# The names are kept as-is so the saved and loaded paths stay in sync.
X_vector1 = np.array(data_embedding1["vector"].tolist())
X_vector2 = np.array(data_embedding2["vector"].tolist())
np.save(
    file='/content/drive/MyDrive/do_an/data/data_vector_bert1.npy ',
    arr=X_vector1,
    allow_pickle=True,
)
np.save(
    file='/content/drive/MyDrive/do_an/data/data_vector_bert_rm_sw1.npy ',
    arr=X_vector2,
    allow_pickle=True,
)

In [14]:
# Load the pre-computed document embeddings. Raw strings keep the Windows backslashes
# literal; the non-raw originals relied on the invalid escape "\d", which Python
# reports as a DeprecationWarning/SyntaxWarning. The resulting paths are unchanged.
X_train_has_sw = np.load(r'D:\do_an\data\data_vector_bert1.npy .npy')
X_train_no_sw = np.load(r'D:\do_an\data\data_vector_bert_rm_sw1.npy .npy')

In [15]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Fit the encoder once on the labels of both frames so the class -> integer mapping is
# guaranteed to be the same for the two datasets. Refitting per frame (as before) only
# gives matching codes when both frames happen to contain exactly the same label set.
le.fit(pd.concat([data_hs_sw_copy['label'], data_df_no_sw_copy['label']]).astype(str))
data_hs_sw_copy['label'] = le.transform(data_hs_sw_copy['label'].astype(str))
data_df_no_sw_copy['label'] = le.transform(data_df_no_sw_copy['label'].astype(str))

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# 80/20 shuffled split with a fixed seed, done separately for each embedding matrix
# and the label column of its corresponding frame.
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(
    X_train_has_sw,
    data_hs_sw_copy['label'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    X_train_no_sw,
    data_df_no_sw_copy['label'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [18]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the first train/test split (x_train_1 / x_test_1).
# The 'sag' solver shuffles the data randomly, so a fixed random_state is needed for
# the reported scores to be reproducible across runs.
lr_model_1 = LogisticRegression(multi_class='ovr', solver='sag', random_state=42)
lr_model_1.fit(x_train_1, y_train_1)
y_predict_1 = lr_model_1.predict(x_test_1)

print(classification_report(y_test_1, y_predict_1, labels= [0,1], digits=4))

              precision    recall  f1-score   support

           0     0.8625    0.8504    0.8564      2339
           1     0.9041    0.9124    0.9082      3617

    accuracy                         0.8880      5956
   macro avg     0.8833    0.8814    0.8823      5956
weighted avg     0.8878    0.8880    0.8879      5956



In [19]:
# Logistic regression on the second train/test split (x_train_2 / x_test_2).
# The 'sag' solver shuffles the data randomly, so a fixed random_state is needed for
# the reported scores to be reproducible across runs.
lr_model_2 = LogisticRegression(multi_class='ovr', solver='sag', random_state=42)
lr_model_2.fit(x_train_2, y_train_2)
y_predict_2 = lr_model_2.predict(x_test_2)

print(classification_report(y_test_2, y_predict_2, labels= [0,1], digits=4))

              precision    recall  f1-score   support

           0     0.7979    0.7587    0.7778      2337
           1     0.8489    0.8759    0.8622      3618

    accuracy                         0.8299      5955
   macro avg     0.8234    0.8173    0.8200      5955
weighted avg     0.8289    0.8299    0.8291      5955



In [20]:
from sklearn.svm import SVC

# Support-vector classifier, first split. fit() returns the estimator itself, so the
# construction and training are chained.
svc_model_1 = SVC(decision_function_shape='ovo', verbose=10).fit(x_train_1, y_train_1)
y_predict_1 = svc_model_1.predict(x_test_1)
print(classification_report(y_true=y_test_1, y_pred=y_predict_1, labels=[0, 1], digits=4))

# Same classifier configuration, second split.
svc_model_2 = SVC(decision_function_shape='ovo', verbose=10).fit(x_train_2, y_train_2)
y_predict_2 = svc_model_2.predict(x_test_2)
print(classification_report(y_true=y_test_2, y_pred=y_predict_2, labels=[0, 1], digits=4))

[LibSVM]              precision    recall  f1-score   support

           0     0.8810    0.8546    0.8676      2339
           1     0.9078    0.9254    0.9165      3617

    accuracy                         0.8976      5956
   macro avg     0.8944    0.8900    0.8921      5956
weighted avg     0.8973    0.8976    0.8973      5956

[LibSVM]              precision    recall  f1-score   support

           0     0.8349    0.7681    0.8001      2337
           1     0.8576    0.9019    0.8792      3618

    accuracy                         0.8494      5955
   macro avg     0.8462    0.8350    0.8396      5955
weighted avg     0.8487    0.8494    0.8481      5955



In [21]:
from sklearn import tree

# Decision tree, first split. A fixed random_state makes the fitted tree (and thus the
# reported scores) reproducible, in line with the seeded RandomForest cell.
clf_1 = tree.DecisionTreeClassifier(random_state=0)
clf_1.fit(x_train_1, y_train_1)

y_predict_1 = clf_1.predict(x_test_1)

print(classification_report(y_test_1, y_predict_1, labels= [0,1], digits=4))

# Decision tree, second split, with the same seed.
clf_2 = tree.DecisionTreeClassifier(random_state=0)
clf_2.fit(x_train_2, y_train_2)

y_predict_2 = clf_2.predict(x_test_2)

print(classification_report(y_test_2, y_predict_2, labels= [0,1], digits=4))

              precision    recall  f1-score   support

           0     0.6074    0.6310    0.6190      2339
           1     0.7552    0.7362    0.7456      3617

    accuracy                         0.6949      5956
   macro avg     0.6813    0.6836    0.6823      5956
weighted avg     0.6972    0.6949    0.6959      5956

              precision    recall  f1-score   support

           0     0.5578    0.5533    0.5555      2337
           1     0.7130    0.7167    0.7148      3618

    accuracy                         0.6526      5955
   macro avg     0.6354    0.6350    0.6352      5955
weighted avg     0.6521    0.6526    0.6523      5955



In [22]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes, first split. fit() returns the estimator itself, so the
# construction and training are chained.
gnb_1 = GaussianNB().fit(x_train_1, y_train_1)
y_predict_1 = gnb_1.predict(x_test_1)
print(classification_report(y_true=y_test_1, y_pred=y_predict_1, labels=[0, 1], digits=4))

# Gaussian naive Bayes, second split.
gnb_2 = GaussianNB().fit(x_train_2, y_train_2)
y_predict_2 = gnb_2.predict(x_test_2)
print(classification_report(y_true=y_test_2, y_pred=y_predict_2, labels=[0, 1], digits=4))

              precision    recall  f1-score   support

           0     0.6992    0.7961    0.7445      2339
           1     0.8551    0.7785    0.8151      3617

    accuracy                         0.7854      5956
   macro avg     0.7772    0.7873    0.7798      5956
weighted avg     0.7939    0.7854    0.7873      5956

              precision    recall  f1-score   support

           0     0.6543    0.6778    0.6658      2337
           1     0.7869    0.7687    0.7777      3618

    accuracy                         0.7330      5955
   macro avg     0.7206    0.7232    0.7218      5955
weighted avg     0.7349    0.7330    0.7338      5955



In [23]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (unbounded depth, seed 0), first split. fit() returns the estimator
# itself, so the construction and training are chained.
rfc_1 = RandomForestClassifier(max_depth=None, random_state=0).fit(x_train_1, y_train_1)
y_predict_1 = rfc_1.predict(x_test_1)
print(classification_report(y_true=y_test_1, y_pred=y_predict_1, labels=[0, 1], digits=4))

# Same forest configuration, second split.
rfc_2 = RandomForestClassifier(max_depth=None, random_state=0).fit(x_train_2, y_train_2)
y_predict_2 = rfc_2.predict(x_test_2)
print(classification_report(y_true=y_test_2, y_pred=y_predict_2, labels=[0, 1], digits=4))

              precision    recall  f1-score   support

           0     0.8393    0.7392    0.7861      2339
           1     0.8434    0.9085    0.8748      3617

    accuracy                         0.8420      5956
   macro avg     0.8414    0.8238    0.8304      5956
weighted avg     0.8418    0.8420    0.8399      5956

              precision    recall  f1-score   support

           0     0.8061    0.6175    0.6993      2337
           1     0.7854    0.9041    0.8405      3618

    accuracy                         0.7916      5955
   macro avg     0.7957    0.7608    0.7699      5955
weighted avg     0.7935    0.7916    0.7851      5955

