In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding,Dense,Dropout,Bidirectional,LSTM,GRU,Input,GlobalAveragePooling1D,LayerNormalization
from tensorflow.keras.optimizers import Adam,SGD
from tensorflow.keras import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer
from pyvi import ViUtils
from tqdm import tqdm
tqdm.pandas()
import fasttext
import fasttext.util

In [2]:
from tensorflow.keras.layers import Conv1D,MaxPooling1D

# Load data

In [3]:
# Read the two corpus CSVs in one pass.
# NOTE(review): judging by the file names, one variant keeps stop words and the other drops them — confirm.
data_df_sw, data_df_no_sw = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/data_have_sw.csv", "./data/data_no_sw.csv")
)

In [4]:
# Work on copies so the frames as read from disk stay untouched.
data_df_sw_copy, data_df_no_sw_copy = data_df_sw.copy(), data_df_no_sw.copy()

In [5]:
# Split the space-separated "token" strings into Python lists.
# - The input is read from the untouched source frames, so re-running this cell
#   gives the same result instead of trying to split an already-split list.
# - Both frames go through str() so a non-string cell (e.g. NaN) cannot raise.
# - The column is transformed directly rather than row by row with axis=1.
data_df_sw_copy["token"] = data_df_sw["token"].progress_apply(lambda text: str(text).split(" "))
data_df_no_sw_copy["token"] = data_df_no_sw["token"].progress_apply(lambda text: str(text).split(" "))

100%|█████████████████████████████████████████████████████████████████████████| 29776/29776 [00:00<00:00, 66393.91it/s]
100%|█████████████████████████████████████████████████████████████████████████| 29771/29771 [00:00<00:00, 82307.34it/s]


In [18]:
from gensim.models import FastText

In [20]:
# Train one gensim FastText model per corpus variant with identical
# hyper-parameters (300-d vectors, context window of 15, min word count of 5).
ftmodel_hs_sw, ftmodel_no_sw = (
    FastText(sentences=frame["token"], vector_size=300, window=15, min_count=5)
    for frame in (data_df_sw_copy, data_df_no_sw_copy)
)

In [26]:
def get_fastText_train(word, ftmodel):
    """Return the vector of ``word`` from ``ftmodel``, or a 300-d zero vector.

    Parameters
    ----------
    word : the token to look up.
    ftmodel : either an object that supports ``ftmodel[word]`` directly, or a
        model that exposes its vectors on a ``wv`` attribute (gensim >= 4
        models are not subscriptable themselves).

    Returns
    -------
    The looked-up vector, or ``np.zeros((300,))`` when the lookup fails.
    """
    # Without this, indexing a gensim 4 model raises TypeError on every call and
    # the fallback below silently turns every word into zeros.
    vectors = getattr(ftmodel, "wv", ftmodel)
    try:
        embedding = vectors[word]
    except Exception:
        # Best-effort fallback for failed lookups; unlike a bare `except`,
        # KeyboardInterrupt/SystemExit still propagate.
        embedding = np.zeros((300,))
    return embedding
def get_embedding(word):
    """Return the vector of ``word`` from the global ``fasttex_model``.

    Falls back to ``np.zeros((300,))`` when the lookup itself fails.

    Raises
    ------
    NameError
        If ``fasttex_model`` has not been loaded yet. The global is resolved
        outside the ``try`` on purpose: otherwise a missing model would be
        swallowed and every word would silently become a zero vector.
    """
    model = fasttex_model
    try:
        embedding = model[word]
    except Exception:
        # Best-effort fallback for failed lookups; unlike a bare `except`,
        # KeyboardInterrupt/SystemExit still propagate.
        embedding = np.zeros((300,))
    return embedding

def word_tokenize_(data_frame, col1, col2):
    """Store the de-duplicated tokens of column ``col1`` in column ``col2``.

    Each cell is converted with ``str()``, tokenised with ``word_tokenize`` and
    reduced to its distinct tokens; going through ``set`` means the order of the
    resulting list is not guaranteed. ``data_frame`` is modified in place and
    also returned.

    NOTE(review): ``word_tokenize`` is not imported in the visible part of the
    notebook — confirm where it comes from.
    """
    def _unique_tokens(cell):
        return list(set(word_tokenize(str(cell))))

    data_frame[col2] = data_frame[col1].apply(_unique_tokens)
    return data_frame

def convert_to_array(data_frame, col):
    """Return column ``col`` of ``data_frame`` as an array of shape ``(n_rows, 1)``.

    When the cells are lists or tuples (e.g. token lists), each cell is kept
    whole inside an object array. Handing such cells straight to ``np.array``
    would merge equal-length lists into a 2-D array, which the reshape then
    flattens into individual tokens, and recent NumPy versions raise on ragged
    lists. Columns of plain scalars are converted exactly as before.
    """
    values = data_frame[col].tolist()
    if any(isinstance(value, (list, tuple)) for value in values):
        column = np.empty((len(values), 1), dtype=object)
        for row, value in enumerate(values):
            # Element-wise assignment stores the sequence itself as one object.
            column[row, 0] = value
        return column
    return np.array(values).reshape(-1, 1)

def create_vector_train(list_token_, embed_fn=None):
    """Average the word vectors of every sentence.

    Parameters
    ----------
    list_token_ : iterable whose items hold a sentence's token list at index 0.
        It is traversed exactly once, so a lazy iterable (e.g. one wrapped in a
        progress bar) is fine.
    embed_fn : optional callable mapping a token to its vector. Defaults to
        ``get_embedding``.

    Returns
    -------
    A list with one mean vector per sentence. A sentence with no tokens yields
    ``np.zeros((300,))`` rather than the NaN scalar that averaging an empty
    array produces.
    """
    if embed_fn is None:
        embed_fn = get_embedding

    sentence_vectors = []
    for tok_sent in list_token_:
        tokens = tok_sent[0]
        if len(tokens) == 0:
            sentence_vectors.append(np.zeros((300,)))
            continue
        word_vectors = np.array([embed_fn(token) for token in tokens])
        sentence_vectors.append(np.mean(word_vectors, axis=0))
    return sentence_vectors

In [22]:
# Load the fastText binary that get_embedding() reads through the global `fasttex_model`.
# NOTE(review): presumably the pre-trained 300-d Vietnamese vectors, judging by the file name — confirm.
fasttex_model = fasttext.load_model('./fasttext/cc.vi.300.bin')



In [24]:
# Turn each frame's "token" column into a single-column array, one row per sentence.
data_train_has_sw, data_train_no_sw = (
    convert_to_array(frame, "token")
    for frame in (data_df_sw_copy, data_df_no_sw_copy)
)



In [35]:
# Embed every sentence of the stop-word corpus. The rows are handed over
# directly (no intermediate copy loop), and the progress bar wraps the
# embedding pass itself, so it reports the expensive step rather than a
# near-instant list copy.
X_train_has_sw = create_vector_train(tqdm(data_train_has_sw))

100%|███████████████████████████████████████████████████████████████████████| 29776/29776 [00:00<00:00, 1751410.72it/s]


In [36]:
# Embed every sentence of the corpus without stop words. The rows are handed
# over directly (no intermediate copy loop), and the progress bar wraps the
# embedding pass itself, so it reports the expensive step rather than a
# near-instant list copy.
X_train_no_sw = create_vector_train(tqdm(data_train_no_sw))

100%|███████████████████████████████████████████████████████████████████████| 29771/29771 [00:00<00:00, 1240486.63it/s]


In [38]:
# Sanity check: number of sentence vectors per corpus variant, one per line.
print(len(X_train_has_sw), len(X_train_no_sw), sep="\n")

29776
29771


In [52]:
# Persist the sentence vectors.
# NOTE: both file names end with a space, so np.save still appends ".npy" and the
# files on disk end in ".npy .npy". The cell that loads them relies on exactly
# that name, so the literals are kept as they are.
np.save(file='./data/data_vector_fastext_has_sw.npy ', arr=X_train_has_sw, allow_pickle=True)
np.save(file='./data/data_vector_fastext_no_sw.npy ', arr=X_train_no_sw, allow_pickle=True)

# ML: classical classifiers on the averaged fastText vectors

In [6]:
from sklearn import preprocessing
# Encode the labels of both frames with ONE shared mapping.
# Calling fit_transform separately on each frame refits the encoder every time,
# so the same label could receive different integers in the two frames if their
# label sets ever differ. Fitting once on the union rules that out.
le = preprocessing.LabelEncoder()
sw_labels = data_df_sw_copy['label'].astype(str)
no_sw_labels = data_df_no_sw_copy['label'].astype(str)
le.fit(pd.concat([sw_labels, no_sw_labels], ignore_index=True))
data_df_sw_copy['label'] = le.transform(sw_labels)
data_df_no_sw_copy['label'] = le.transform(no_sw_labels)

In [7]:
# Reload the cached sentence vectors from the same relative ./data directory the
# save cell writes to. This replaces a machine-specific absolute Windows path
# whose non-raw backslashes ("\d") are invalid escape sequences.
# The odd ".npy .npy" suffix is intentional: it is the name np.save produces
# for the trailing-space file names used when saving.
X_train_has_sw = np.load('./data/data_vector_fastext_has_sw.npy .npy')
X_train_no_sw = np.load('./data/data_vector_fastext_no_sw.npy .npy')

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# 80/20 shuffled split with a fixed seed; suffix _1 = corpus with stop words,
# suffix _2 = corpus without stop words.
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(
    X_train_has_sw,
    data_df_sw_copy['label'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    X_train_no_sw,
    data_df_no_sw_copy['label'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

In [11]:
# Logistic regression on both corpus variants.
# The `sag` solver shuffles the samples while optimising, so the seed is pinned
# to keep the reported scores repeatable from run to run.
lr_model_1 = LogisticRegression(multi_class='ovr', solver='sag', random_state=0)
lr_model_1.fit(x_train_1, y_train_1)
y_predict_1 = lr_model_1.predict(x_test_1)

print(classification_report(y_test_1, y_predict_1, labels= [0,1], digits=4))


lr_model_2 = LogisticRegression(multi_class='ovr', solver='sag', random_state=0)
lr_model_2.fit(x_train_2, y_train_2)
y_predict_2 = lr_model_2.predict(x_test_2)

print(classification_report(y_test_2, y_predict_2, labels= [0,1], digits=4))

              precision    recall  f1-score   support

           0     0.8456    0.7841    0.8137      2339
           1     0.8666    0.9074    0.8865      3617

    accuracy                         0.8590      5956
   macro avg     0.8561    0.8457    0.8501      5956
weighted avg     0.8584    0.8590    0.8579      5956

              precision    recall  f1-score   support

           0     0.8426    0.7214    0.7773      2337
           1     0.8354    0.9129    0.8724      3618

    accuracy                         0.8378      5955
   macro avg     0.8390    0.8172    0.8249      5955
weighted avg     0.8382    0.8378    0.8351      5955



In [46]:
from sklearn.svm import SVC

# Support-vector classifier on both corpus variants (fit() returns the
# estimator itself, so construction and training are chained).

# Corpus with stop words.
svc_model_1 = SVC(decision_function_shape = 'ovo', verbose=10 ).fit(x_train_1, y_train_1)
y_predict_1 = svc_model_1.predict(x_test_1)
print(classification_report(y_test_1, y_predict_1, labels= [0,1], digits=4))

# Corpus without stop words.
svc_model_2 = SVC(decision_function_shape = 'ovo', verbose=10 ).fit(x_train_2, y_train_2)
y_predict_2 = svc_model_2.predict(x_test_2)
print(classification_report(y_test_2, y_predict_2, labels= [0,1], digits=4))

[LibSVM]              precision    recall  f1-score   support

           0     0.8656    0.8260    0.8453      2339
           1     0.8907    0.9171    0.9037      3617

    accuracy                         0.8813      5956
   macro avg     0.8782    0.8715    0.8745      5956
weighted avg     0.8808    0.8813    0.8808      5956

[LibSVM]              precision    recall  f1-score   support

           0     0.8630    0.7869    0.8232      2337
           1     0.8698    0.9193    0.8938      3618

    accuracy                         0.8673      5955
   macro avg     0.8664    0.8531    0.8585      5955
weighted avg     0.8671    0.8673    0.8661      5955



In [47]:
from sklearn import tree

# Decision tree on both corpus variants.
# The seed is pinned (as the random-forest cell already does) because the tree
# builder permutes the candidate features at each split, so an unseeded tree
# can differ between runs.
clf_1 = tree.DecisionTreeClassifier(random_state=0)
clf_1.fit(x_train_1, y_train_1)

y_predict_1 = clf_1.predict(x_test_1)

print(classification_report(y_test_1, y_predict_1, labels= [0,1], digits=4))


clf_2 = tree.DecisionTreeClassifier(random_state=0)
clf_2.fit(x_train_2, y_train_2)

y_predict_2 = clf_2.predict(x_test_2)

print(classification_report(y_test_2, y_predict_2, labels= [0,1], digits=4))

              precision    recall  f1-score   support

           0     0.6383    0.6541    0.6461      2339
           1     0.7727    0.7603    0.7664      3617

    accuracy                         0.7186      5956
   macro avg     0.7055    0.7072    0.7063      5956
weighted avg     0.7199    0.7186    0.7192      5956

              precision    recall  f1-score   support

           0     0.6305    0.6346    0.6325      2337
           1     0.7630    0.7598    0.7614      3618

    accuracy                         0.7107      5955
   macro avg     0.6968    0.6972    0.6970      5955
weighted avg     0.7110    0.7107    0.7108      5955



In [48]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on both corpus variants (fit() returns the estimator
# itself, so construction and training are chained).

# Corpus with stop words.
gnb_1 = GaussianNB().fit(x_train_1, y_train_1)
y_predict_1 = gnb_1.predict(x_test_1)
print(classification_report(y_test_1, y_predict_1, labels= [0,1], digits=4))

# Corpus without stop words.
gnb_2 = GaussianNB().fit(x_train_2, y_train_2)
y_predict_2 = gnb_2.predict(x_test_2)
print(classification_report(y_test_2, y_predict_2, labels= [0,1], digits=4))

              precision    recall  f1-score   support

           0     0.6353    0.3814    0.4766      2339
           1     0.6821    0.8584    0.7602      3617

    accuracy                         0.6711      5956
   macro avg     0.6587    0.6199    0.6184      5956
weighted avg     0.6637    0.6711    0.6488      5956

              precision    recall  f1-score   support

           0     0.6331    0.4142    0.5008      2337
           1     0.6907    0.8449    0.7601      3618

    accuracy                         0.6759      5955
   macro avg     0.6619    0.6296    0.6304      5955
weighted avg     0.6681    0.6759    0.6583      5955



In [13]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (unbounded depth, fixed seed) on both corpus variants; fit()
# returns the estimator itself, so construction and training are chained.

# Corpus with stop words.
rfc_1 = RandomForestClassifier(max_depth=None, random_state=0).fit(x_train_1, y_train_1)
y_predict_1 = rfc_1.predict(x_test_1)
print(classification_report(y_test_1, y_predict_1, labels= [0,1], digits=4))

# Corpus without stop words.
rfc_2 = RandomForestClassifier(max_depth=None, random_state=0).fit(x_train_2, y_train_2)
y_predict_2 = rfc_2.predict(x_test_2)
print(classification_report(y_test_2, y_predict_2, labels= [0,1], digits=4))

              precision    recall  f1-score   support

           0     0.8143    0.7631    0.7879      2339
           1     0.8528    0.8875    0.8698      3617

    accuracy                         0.8387      5956
   macro avg     0.8336    0.8253    0.8289      5956
weighted avg     0.8377    0.8387    0.8376      5956

              precision    recall  f1-score   support

           0     0.8112    0.7206    0.7632      2337
           1     0.8317    0.8917    0.8606      3618

    accuracy                         0.8245      5955
   macro avg     0.8214    0.8061    0.8119      5955
weighted avg     0.8236    0.8245    0.8224      5955

