In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [13]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/hatespeech_dataset.csv')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53745 entries, 0 to 53744
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   free_text  53743 non-null  object
 1   label_id   53745 non-null  int64 
 2   dataset    53745 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [15]:
# Collect the rows whose free_text is missing so they can be inspected
# before they are dropped further down.
null = df[df['free_text'].isnull()]

In [16]:
null

Unnamed: 0,free_text,label_id,dataset
10950,,1,vihsd
20880,,0,vihsd


In [17]:
# Drop every row with a missing value in any column and rebind `df`.
# Per the df.info() output above, only free_text has nulls (2 rows).
df = df.dropna()

In [18]:
# Rows with label_id == 0 (kept under the name `clean`).
clean = df[df['label_id']==0]


In [19]:
# Rows with label_id == 1 (kept under the name `hate`).
hate = df[df['label_id']==1]


In [20]:
len(clean)

46237

In [21]:
clean.head()

Unnamed: 0,free_text,label_id,dataset
0,Em được làm fan cứng luôn rồi nè ❤️ reaction q...,0,vihsd
2,Đậu Văn Cường giờ giống thằng sida hơn à,0,vihsd
4,Từ lý thuyết đến thực hành là cả 1 câu chuyện ...,0,vihsd
5,Coronavirus is manmade,0,vihsd
6,Đố chúng m nhận ra ai,0,vihsd


In [22]:
len(hate)

7506

In [23]:
hate.head()

Unnamed: 0,free_text,label_id,dataset
1,Đúng là bọn mắt híp lò xo thụt :))) bên việt n...,1,vihsd
3,CÔN ĐỒ CỤC SÚC VÔ NHÂN TÍNH ĐỀ NGHI VN. NHÀ NƯ...,1,vihsd
7,Lúp lúp như chó .,1,vihsd
9,"Thế mà mình nói mấy thằng bắc kì, bọn đó lại b...",1,vihsd
16,Loại này cho dựa cột thôi chứ độ thế nào,1,vihsd


In [25]:
def lower_text(text:str):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

In [26]:
import re

In [27]:
# Compiled once when the cell runs rather than on every call.
#
# The previous pattern contained the single range U+24C2..U+1F251, which spans
# far more than emoji: CJK ideographs, Hangul, full-width Latin letters, box
# drawing, mathematical alphanumerics, etc. were all being deleted from the
# text. The ranges below are restricted to symbol/pictograph blocks and are a
# strict subset of what the old pattern matched, so nothing that used to be
# kept is removed now.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, flags
    "\U000024C2"             # circled latin capital letter M
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002934\U00002935"   # curved arrows used as emoji
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block symbols used as emoji
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text: str) -> str:
    """Return ``text`` with emoji and pictographic symbols removed.

    Every run of matching characters is replaced by the empty string, so
    whitespace that surrounded an emoji is left in place. Letters of any
    script (including CJK, Hangul and full-width forms) are preserved.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The input with emoji characters stripped out.
    """
    return _EMOJI_PATTERN.sub('', text)

In [28]:
def preprocess(text:str):
    """Lowercase ``text``, remove emoji from it, then trim surrounding whitespace."""
    return remove_emoji(lower_text(text)).strip()

In [29]:
preprocess('m fan ng nè ❤️ reaction')

'm fan ng nè  reaction'

In [30]:
# Normalise every text entry; the function is passed directly instead of
# being wrapped in a lambda.
df['free_text'] = df['free_text'].apply(preprocess)

In [31]:
# Persist the preprocessed frame.
# NOTE(review): unlike the train/test/val exports further down, this call does
# not pass index=False, so the row index is written as an extra leading
# column — confirm that is intended.
df.to_csv('data/clean_hatespeech.csv')

In [32]:
# Shuffle all rows (frac=1 draws the whole frame) with a fixed seed so the
# resulting order is reproducible.
df = df.sample(frac=1, random_state=42)

In [67]:
# Per-label subsets of the shuffled frame: label_id 0 first, then label_id 1.
_clean, _hate = (df[df['label_id'] == label] for label in (0, 1))

In [68]:
len(_clean)

46237

In [69]:
len(_hate)

7506

In [33]:
# Keep 60% of the rows for training and hold out the remaining 40%, which is
# split again into test and validation sets below.
# NOTE(review): no stratify= argument is passed, so the label proportions in
# each split are left to chance — consider stratifying on label_id.
train_df, _test_df = train_test_split(df, test_size=0.4, random_state=42)


In [34]:
len(train_df)

32245

In [35]:
len(_test_df)

21498

In [36]:
# Split the 40% hold-out evenly into a test set and a validation set.
# NOTE(review): as with the first split, no stratify= argument is passed.
test_df, val_df = train_test_split(_test_df, test_size=0.5, random_state=42)


In [37]:
len(test_df)

10749

In [38]:
len(val_df)

10749

In [39]:
# Training rows per label: label_id 0 first, then label_id 1.
train_clean, train_hate = (
    train_df[train_df['label_id'] == label] for label in (0, 1)
)


In [61]:
# Test rows per label: label_id 0 first, then label_id 1.
test_clean, test_hate = (
    test_df[test_df['label_id'] == label] for label in (0, 1)
)

In [62]:
# Validation rows per label: label_id 0 first, then label_id 1.
val_clean, val_hate = (
    val_df[val_df['label_id'] == label] for label in (0, 1)
)

In [40]:
len(train_clean)

27708

In [41]:
len(train_hate)

4537

In [63]:
len(test_clean)

9254

In [64]:
len(test_hate)

1495

In [65]:
len(val_clean)

9275

In [66]:
len(val_hate)

1474

In [42]:
from sklearn.feature_extraction.text import TfidfTransformer


In [43]:
# Instantiate a count vectorizer and a TF-IDF transformer with default settings.
# NOTE(review): neither object is referenced again in the visible cells; a
# separate `count_vect` is created and fitted further down — confirm whether
# these two are still needed.
count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()



In [44]:
train_df

Unnamed: 0,free_text,label_id,dataset
11453,@linda bui lên mở clip cô minh hiếu nấu vàng đ...,0,vihsd
11299,"no no, chỉ có bắc kì thôi nhé, nam k kì",0,vihsd
48561,"""hàng order hộp hộp còn hộp lê_thảo""",0,vlsp
6302,hương him cùng cảnh ngộ phùng khắc hoàng linh,0,vihsd
43423,"""lớp tiếng hàn còn nhận học_sinh không""",0,vlsp
...,...,...,...
1544,tại chị nhung dung dăng dung dẻ mà mình phải đ...,0,vihsd
28563,cảm động rớt nước mắt:v,0,vihsd
53342,"""viết về nha trang nói nhiều về phan_thiết""",0,vlsp
40095,"""thử lần bật ngửa""",0,vlsp


In [45]:
# Text column of the training split; fed to the vectorizer below.
X_train = train_df.free_text

In [46]:
X_train.shape

(32245,)

In [47]:
# Label column of the training split; used as the classifier target.
y_train = train_df.label_id

In [48]:
y_train.values.shape

(32245,)

In [49]:
# Text column of the test split.
X_test = test_df.free_text

In [50]:
# Label column of the test split; the reference for the metrics computed later.
y_test = test_df.label_id

In [51]:
# Learn the vocabulary from the training text only and convert that text into
# a document-term count matrix in the same step.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X_train)

In [52]:
X_counts.shape

(32245, 25303)

In [53]:
# Fit a multinomial Naive Bayes classifier with default hyperparameters on
# the training count matrix.
nb_clf = MultinomialNB()
nb_clf.fit(X_counts, y_train.values)

In [54]:
# Use transform (not fit_transform) so the test matrix reuses the vocabulary
# fitted on the training text and has the same columns.
X_test_counts = count_vect.transform(X_test)

In [55]:
X_test_counts.shape

(10749, 25303)

In [56]:
# Predicted labels for the test split.
y_pred = nb_clf.predict(X_test_counts)

In [57]:
y_pred.shape

(10749,)

In [58]:
y_pred.shape

(10749,)

In [59]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [60]:
# Score the nb_clf predictions against the test labels. The F1 score uses
# average='weighted', i.e. per-class F1 weighted by class support.
poly_accuracy, poly_f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)
# Report both metrics as percentages with two decimals.
print('Mul_Bayes acc): ', "%.2f" % (100 * poly_accuracy))
print('Mul_Bayes f1): ', "%.2f" % (100 * poly_f1))

Mul_Bayes acc):  89.64
Mul_Bayes f1):  89.53


In [201]:
# Persist the training split; index=False leaves the row index out of the file.
train_df.to_csv('train_hatespeech.csv', index=False)

In [202]:
# Persist the test split; index=False leaves the row index out of the file.
test_df.to_csv('test_hatespeech.csv', index=False)

In [203]:
# Persist the validation split; index=False leaves the row index out of the file.
val_df.to_csv('val_hatespeech.csv', index=False)


In [250]:
import joblib

In [251]:
# Serialize the fitted classifier to disk with joblib.
# Only the classifier is written here; the fitted vectorizer it depends on is
# saved by a separate cell further down.
model_file = 'bayes.model'
joblib.dump(nb_clf, model_file)

['bayes.model']

In [252]:
# Reload the classifier from the file written above to check the round trip.
# joblib files are pickle-based: only load files from a trusted source.
loaded_model = joblib.load(model_file)

In [253]:
# Sample input for a quick prediction check with the reloaded model.
text = 'địt cái lồn mẹ mày'

In [254]:
# Run the sample through the same preprocess() step that was applied to the
# training text before vectorizing, so inference features are built the same
# way as the features the classifier was trained on.
text_vectorized = count_vect.transform([preprocess(text)])

In [255]:
# Predict the label of the sample text with the reloaded classifier.
# NOTE(review): the recorded output below is class 2, yet the label_id == 0
# and label_id == 1 subsets above account for every row after dropna
# (46237 + 7506 = 53743). The saved output may come from a different kernel
# state — re-run from a fresh kernel to confirm.
loaded_model.predict(text_vectorized)

array([2])

In [256]:
# Serialize the fitted vectorizer as well; the saved classifier needs the
# same vocabulary to turn new text into features.
vectorizer_filename = "count_vectorizer.joblib"
joblib.dump(count_vect, vectorizer_filename)

['count_vectorizer.joblib']