In [1]:
import numpy as np 
import pandas as pd

In [2]:
from utils import *
from preprocessing import *

In [3]:
# Load the combined raw dataset from disk; the CSV is decoded as UTF-8.

data = pd.read_csv('./datasets/combined_datasets.csv', encoding='utf-8')

In [4]:
# Fill missing num_like_post, num_comment_post and num_share_post with 0.
# The columns are assigned back explicitly: `data.col.fillna(..., inplace=True)`
# mutates an intermediate Series and is not guaranteed to update `data`
# (it is deprecated and has no effect under pandas Copy-on-Write).
count_columns = ['num_like_post', 'num_comment_post', 'num_share_post']
data[count_columns] = data[count_columns].fillna(0)

# Rows without any post text are dropped.
data = data.dropna(subset=['post_message'], how='any').reset_index(drop=True)

# check again: count the NaN values left in each cleaned column
check_like = data.num_like_post.isna().sum()
check_commnet = data.num_comment_post.isna().sum()
check_share = data.num_share_post.isna().sum()
check_post_message = data.post_message.isna().sum()

print(f'num_like_post has nan {check_like != 0}')
print(f'num_comment_post has nan {check_commnet != 0}')
print(f'num_share_post has nan {check_share != 0}')
print(f'post_message has nan {check_post_message != 0}')

num_like_post has nan False
num_comment_post has nan False
num_share_post has nan False
post_message has nan False


In [5]:
# Remove the timestamp_post column and the leftover 'Unnamed: 0' CSV index column.
data = data.drop(columns=['timestamp_post', 'Unnamed: 0']).reset_index(drop=True)

# Encode every user_name as an integer class id.
from sklearn.preprocessing import LabelEncoder
labels = data.user_name.values

le = LabelEncoder()
data['user_name_labelEncoder'] = le.fit_transform(labels)

# Discard posts whose whole message is just a URL placeholder:
# <URL> or [<URL>](<URL>)
char_drop = ['<URL>', '[<URL>](<URL>)']
index = list(data.index[data.post_message.isin(char_drop)])
data = data.drop(index=index).reset_index(drop=True)

# view data
data.head(5)

Unnamed: 0,id,user_name,post_message,num_like_post,num_comment_post,num_share_post,label,user_name_labelEncoder
0,1,389c669730cb6c54314a46be785cea42,"THƒÇNG C·∫§P B·∫¨C H√ÄM ƒê·ªêI V·ªöI 2 C√ÅN B·ªò, CHI·∫æN S·ª∏ H...",19477,378,173.0,0,1032
1,3,b9f3394d2aff86d85974f5040c401f08,T∆Ø V·∫§N M√ôA THI: C√°ch n·ªôp h·ªì s∆° ƒë·ªÉ tr√∫ng tuy·ªÉn ...,48,5,19.0,0,2804
2,4,808e278b22ec6b96f2faf7447d10cd8e,C∆° quan C·∫°nh tranh v√† Th·ªã tr∆∞·ªùng Anh quy·∫øt ƒë·ªãn...,3,0,0.0,0,2044
3,5,f81bdd6d8be4c5f64bb664214e47aced,Th√™m 7 ca t·∫°i Qu·∫£ng Nam li√™n quan ƒë·∫øn h√†nh kh√°...,775,0,54.0,0,3575
4,6,ffc4b6bab27c40cfc48e4dc8b8a41e42,Trong gi·ªù h·ªçc Th·ªÉ d·ª•‚Äåc do th·∫ßy gi√°o Nguy·ªÖn VƒÉn...,2,1,0.0,0,3701


In [6]:
# Number of posts per user_name, attached to every row as user_name_freq.
count = data.user_name.value_counts()
lookup = count.to_dict()

data['user_name_freq'] = data.user_name.map(lookup)

data.head(5)

Unnamed: 0,id,user_name,post_message,num_like_post,num_comment_post,num_share_post,label,user_name_labelEncoder,user_name_freq
0,1,389c669730cb6c54314a46be785cea42,"THƒÇNG C·∫§P B·∫¨C H√ÄM ƒê·ªêI V·ªöI 2 C√ÅN B·ªò, CHI·∫æN S·ª∏ H...",19477,378,173.0,0,1032,58
1,3,b9f3394d2aff86d85974f5040c401f08,T∆Ø V·∫§N M√ôA THI: C√°ch n·ªôp h·ªì s∆° ƒë·ªÉ tr√∫ng tuy·ªÉn ...,48,5,19.0,0,2804,1
2,4,808e278b22ec6b96f2faf7447d10cd8e,C∆° quan C·∫°nh tranh v√† Th·ªã tr∆∞·ªùng Anh quy·∫øt ƒë·ªãn...,3,0,0.0,0,2044,64
3,5,f81bdd6d8be4c5f64bb664214e47aced,Th√™m 7 ca t·∫°i Qu·∫£ng Nam li√™n quan ƒë·∫øn h√†nh kh√°...,775,0,54.0,0,3575,1
4,6,ffc4b6bab27c40cfc48e4dc8b8a41e42,Trong gi·ªù h·ªçc Th·ªÉ d·ª•‚Äåc do th·∫ßy gi√°o Nguy·ªÖn VƒÉn...,2,1,0.0,0,3701,1


In [7]:
# Inspect column dtypes and non-null counts after the user_name features were added.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5111 entries, 0 to 5110
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      5111 non-null   int64 
 1   user_name               5111 non-null   object
 2   post_message            5111 non-null   object
 3   num_like_post           5111 non-null   object
 4   num_comment_post        5111 non-null   object
 5   num_share_post          5111 non-null   object
 6   label                   5111 non-null   int64 
 7   user_name_labelEncoder  5111 non-null   int32 
 8   user_name_freq          5111 non-null   int64 
dtypes: int32(1), int64(3), object(5)
memory usage: 339.5+ KB


In [8]:
# Collect the positions of rows where any engagement count cannot be parsed
# as a number (e.g. 'unknown', '1 comment'), then display those rows.
count_columns = ['num_comment_post', 'num_like_post', 'num_share_post']

index = []
# Positions from enumerate() are used so they agree with the positional
# `iloc` lookup below regardless of the frame's index labels.
for i, counts in enumerate(data[count_columns].itertuples(index=False)):
    try:
        for value in counts:
            int(float(value))
    except (TypeError, ValueError, OverflowError):
        # Only parse failures are caught; any other error should surface
        # instead of being silently treated as a bad row.
        index.append(i)
data_temp = data.iloc[index, :]
data_temp

Unnamed: 0,id,user_name,post_message,num_like_post,num_comment_post,num_share_post,label,user_name_labelEncoder,user_name_freq
34,36,fe8176d5bc081419f9b08e195525c131,Ng∆∞·ªùi m·∫∑t ƒëen mi·ªáng ng·∫≠m 2 qu·∫£ g√¨ ƒë√≥ tay c·∫ßm ƒë...,unknown,unknown,unknown,1,3687,1
76,78,45d80e8d62eac8e3f28a8c822fab002d,M·ªçi ng∆∞·ªùi ra ƒë∆∞·ªùng nh·ªõ ƒëeo kh·∫©u trang nha. ƒê√£ ...,unknown,unknown,unknown,1,1210,1
80,82,f8584733a5e8e99a9b45ebb5400e796e,MU nhu·ªôm ƒë·ªè th√†nh Manchester,Solskjaer h·∫° knock-out Guardiola,Juventus b·ªè xa Inter Milan... l√† nh·ªØng ·∫£nh ch...,1583713929.0,0,3582,1
174,178,0adfdeb881147078b0c50354193d67dd,Ca Ph·∫´u Thu·∫≠t th·∫•t b·∫°i. Ho√†i Linh l√¢m v√†o t√¨nh...,unknown,unknown,unknown,1,340,1
185,190,a2ed99660efc25ebf7dd8947c046fd4c,H√† N·ªôi ƒë√£ ho√£n gi·∫£i ƒëua F1 do d·ªãch Covid-19,29,1 comment,0,0,2518,1
...,...,...,...,...,...,...,...,...,...
4494,183,3498669648858314752,V·ª´a c·∫≠p nh·∫≠t dc 13 h·ªçc sinh l·ªõp 1 tr∆∞·ªùng H∆∞∆°ng...,unknown,unknown,unknown,1,977,1
4496,185,3380197056197186048,S·ª£ th·∫≠t. D·ªãch covid ƒë√£ v·ªÅ ƒë·∫øn H∆∞ng Nguy√™n v√† N...,unknown,unknown,unknown,1,957,1
4497,186,-6852763066036545536,B√ÄI THU·ªêC M·ªñI S√ÅNG KHI·∫æN T√îI KH·ªéE M·∫†NH\n\nKhuy...,10,5,1 share,1,163,1
4509,198,-6478801688689956864,VTV1 C≈®NG ƒê√É N√ìI R·ªíI N√äN M·ªåI NG∆Ø·ªúI C·ª® L√ÄM NHA....,unknown,unknown,unknown,1,152,1


In [9]:
# Handle rows whose engagement counts are not numeric:
#   * label == 0 rows are dropped;
#   * all other rows are kept and their three counts are reset to 0.
count_columns = ['num_comment_post', 'num_like_post', 'num_share_post']


def _is_int_like(value):
    """Return True when ``int(float(value))`` succeeds for ``value``."""
    try:
        int(float(value))
    except (TypeError, ValueError, OverflowError):
        return False
    return True


# True for rows in which at least one of the three counts fails to parse.
unparseable = ~data[count_columns].apply(lambda column: column.map(_is_int_like)).all(axis=1)
is_label_zero = data['label'].map(lambda value: int(float(value)) == 0)

index_drop = list(data.index[unparseable & is_label_zero])

# A single .loc assignment writes into `data` itself; the previous chained
# form `data.col[i] = 0` may write to a temporary copy instead.
# NOTE(review): this zeroes all three counts even when only one of them is
# malformed, as the original loop did -- confirm that is intended.
data.loc[unparseable & ~is_label_zero, count_columns] = 0

data = data.drop(index=index_drop).reset_index(drop=True)
data

Unnamed: 0,id,user_name,post_message,num_like_post,num_comment_post,num_share_post,label,user_name_labelEncoder,user_name_freq
0,1,389c669730cb6c54314a46be785cea42,"THƒÇNG C·∫§P B·∫¨C H√ÄM ƒê·ªêI V·ªöI 2 C√ÅN B·ªò, CHI·∫æN S·ª∏ H...",19477,378,173.0,0,1032,58
1,3,b9f3394d2aff86d85974f5040c401f08,T∆Ø V·∫§N M√ôA THI: C√°ch n·ªôp h·ªì s∆° ƒë·ªÉ tr√∫ng tuy·ªÉn ...,48,5,19.0,0,2804,1
2,4,808e278b22ec6b96f2faf7447d10cd8e,C∆° quan C·∫°nh tranh v√† Th·ªã tr∆∞·ªùng Anh quy·∫øt ƒë·ªãn...,3,0,0.0,0,2044,64
3,5,f81bdd6d8be4c5f64bb664214e47aced,Th√™m 7 ca t·∫°i Qu·∫£ng Nam li√™n quan ƒë·∫øn h√†nh kh√°...,775,0,54.0,0,3575,1
4,6,ffc4b6bab27c40cfc48e4dc8b8a41e42,Trong gi·ªù h·ªçc Th·ªÉ d·ª•‚Äåc do th·∫ßy gi√°o Nguy·ªÖn VƒÉn...,2,1,0,0,3701,1
...,...,...,...,...,...,...,...,...,...
5095,795,2490795936720330752,"ƒê·ªãnh c∆∞ g·∫ßn 60 nƒÉm, trong ƒë√≥ 20 nƒÉm b·ªã ""treo"" ...",0,0,0,0,757,1
5096,796,1846872478075934976,"Lo·∫°n n√£o qu√°\n\nTheo b√°o ƒëi·ªán t·ª≠ Giao th√¥ng, p...",43,24,3,0,583,2
5097,797,-4723502351165939712,https://m.baophapluat.vn/ban-doc/vu-an-tranh-c...,3,0,0,0,94,2
5098,798,7717037201428829184,"C·ª•c B·∫£o v·ªá th·ª±c v·∫≠t cho r·∫±ng, ƒë·ªÅ xu·∫•t t·∫°m th·ªùi...",13,2,0,0,1910,7


In [10]:
# Re-check row count and dtypes after the non-numeric rows were dropped or zeroed.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      5100 non-null   int64 
 1   user_name               5100 non-null   object
 2   post_message            5100 non-null   object
 3   num_like_post           5100 non-null   object
 4   num_comment_post        5100 non-null   object
 5   num_share_post          5100 non-null   object
 6   label                   5100 non-null   int64 
 7   user_name_labelEncoder  5100 non-null   int32 
 8   user_name_freq          5100 non-null   int64 
dtypes: int32(1), int64(3), object(5)
memory usage: 338.8+ KB


In [11]:
# Cast the engagement counts, the label and the user_name features to int64.
# (post_message is text and is left untouched.)
columns = ['num_like_post', 'num_comment_post', 'num_share_post', 'label', 'user_name_labelEncoder', 'user_name_freq']

for column in columns:
    # Parse to a numeric dtype first, then cast to int64.
    data[column] = pd.to_numeric(data[column]).astype('int64')

data

Unnamed: 0,id,user_name,post_message,num_like_post,num_comment_post,num_share_post,label,user_name_labelEncoder,user_name_freq
0,1,389c669730cb6c54314a46be785cea42,"THƒÇNG C·∫§P B·∫¨C H√ÄM ƒê·ªêI V·ªöI 2 C√ÅN B·ªò, CHI·∫æN S·ª∏ H...",19477,378,173,0,1032,58
1,3,b9f3394d2aff86d85974f5040c401f08,T∆Ø V·∫§N M√ôA THI: C√°ch n·ªôp h·ªì s∆° ƒë·ªÉ tr√∫ng tuy·ªÉn ...,48,5,19,0,2804,1
2,4,808e278b22ec6b96f2faf7447d10cd8e,C∆° quan C·∫°nh tranh v√† Th·ªã tr∆∞·ªùng Anh quy·∫øt ƒë·ªãn...,3,0,0,0,2044,64
3,5,f81bdd6d8be4c5f64bb664214e47aced,Th√™m 7 ca t·∫°i Qu·∫£ng Nam li√™n quan ƒë·∫øn h√†nh kh√°...,775,0,54,0,3575,1
4,6,ffc4b6bab27c40cfc48e4dc8b8a41e42,Trong gi·ªù h·ªçc Th·ªÉ d·ª•‚Äåc do th·∫ßy gi√°o Nguy·ªÖn VƒÉn...,2,1,0,0,3701,1
...,...,...,...,...,...,...,...,...,...
5095,795,2490795936720330752,"ƒê·ªãnh c∆∞ g·∫ßn 60 nƒÉm, trong ƒë√≥ 20 nƒÉm b·ªã ""treo"" ...",0,0,0,0,757,1
5096,796,1846872478075934976,"Lo·∫°n n√£o qu√°\n\nTheo b√°o ƒëi·ªán t·ª≠ Giao th√¥ng, p...",43,24,3,0,583,2
5097,797,-4723502351165939712,https://m.baophapluat.vn/ban-doc/vu-an-tranh-c...,3,0,0,0,94,2
5098,798,7717037201428829184,"C·ª•c B·∫£o v·ªá th·ª±c v·∫≠t cho r·∫±ng, ƒë·ªÅ xu·∫•t t·∫°m th·ªùi...",13,2,0,0,1910,7


In [12]:
# Confirm that the converted columns now report an int64 dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      5100 non-null   int64 
 1   user_name               5100 non-null   object
 2   post_message            5100 non-null   object
 3   num_like_post           5100 non-null   int64 
 4   num_comment_post        5100 non-null   int64 
 5   num_share_post          5100 non-null   int64 
 6   label                   5100 non-null   int64 
 7   user_name_labelEncoder  5100 non-null   int64 
 8   user_name_freq          5100 non-null   int64 
dtypes: int64(7), object(2)
memory usage: 358.7+ KB


In [13]:
# Preprocess post_message using the full Vietnamese stopword list and store
# the result in post_message_preproced.
# NOTE(review): get_stop_words / text_preprocessing come from the star
# imports at the top of the notebook; their exact behaviour is not visible here.
from vncorenlp import VnCoreNLP

stopwords_file = "vietnamese-stopwords/vietnamese-stopwords.txt"

stopwords = get_stop_words(stopwords_file)

annotator = VnCoreNLP("VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
try:
    data['post_message_preproced'] = data.post_message.apply(text_preprocessing, stopwords=stopwords, annotator=annotator)
finally:
    # Release the annotator even if preprocessing fails, so its backing
    # server is not left running for the rest of the kernel session.
    annotator.close()

data.head(5)

Unnamed: 0,id,user_name,post_message,num_like_post,num_comment_post,num_share_post,label,user_name_labelEncoder,user_name_freq,post_message_preproced
0,1,389c669730cb6c54314a46be785cea42,"THƒÇNG C·∫§P B·∫¨C H√ÄM ƒê·ªêI V·ªöI 2 C√ÅN B·ªò, CHI·∫æN S·ª∏ H...",19477,378,173,0,1032,58,thƒÉng c·∫•p_b·∫≠c h√†m c√°n_b·ªô chi·∫øn_s·ªπ hy_sinh ƒë√†_n...
1,3,b9f3394d2aff86d85974f5040c401f08,T∆Ø V·∫§N M√ôA THI: C√°ch n·ªôp h·ªì s∆° ƒë·ªÉ tr√∫ng tuy·ªÉn ...,48,5,19,0,2804,1,t∆∞_v·∫•n m√πa thi n·ªôp h·ªì_s∆° tr√∫ng_tuy·ªÉn ch∆∞∆°ng_tr...
2,4,808e278b22ec6b96f2faf7447d10cd8e,C∆° quan C·∫°nh tranh v√† Th·ªã tr∆∞·ªùng Anh quy·∫øt ƒë·ªãn...,3,0,0,0,2044,64,c∆°_quan c·∫°nh_tranh th·ªã_tr∆∞·ªùng quy·∫øt_ƒë·ªãnh ƒëi·ªÅu_...
3,5,f81bdd6d8be4c5f64bb664214e47aced,Th√™m 7 ca t·∫°i Qu·∫£ng Nam li√™n quan ƒë·∫øn h√†nh kh√°...,775,0,54,0,3575,1,ca qu·∫£ng_nam h√†nh_kh√°ch chuy·∫øn bay b·ªánh_nh√¢n n...
4,6,ffc4b6bab27c40cfc48e4dc8b8a41e42,Trong gi·ªù h·ªçc Th·ªÉ d·ª•‚Äåc do th·∫ßy gi√°o Nguy·ªÖn VƒÉn...,2,1,0,0,3701,1,h·ªçc th·ªÉ d·ª• c th·∫ßy_gi√°o nguy·ªÖn_vƒÉn qu√¢n ph·ª• tr√°...


In [14]:
# Preprocess post_message again, this time with the custom stopword list,
# and store the result in post_message_notStopword.
# NOTE(review): the column name suggests stopwords are kept, yet a stopword
# list is still passed -- presumably the custom file is a reduced list; confirm.
from vncorenlp import VnCoreNLP

stopwords_file = "vietnamese-stopwords/custom-vietnamese-stopwords.txt"

stopwords = get_stop_words(stopwords_file)

annotator = VnCoreNLP("VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
try:
    data['post_message_notStopword'] = data.post_message.apply(text_preprocessing, stopwords=stopwords, annotator=annotator)
finally:
    # Release the annotator even if preprocessing fails, so its backing
    # server is not left running for the rest of the kernel session.
    annotator.close()

data.head(5)

Unnamed: 0,id,user_name,post_message,num_like_post,num_comment_post,num_share_post,label,user_name_labelEncoder,user_name_freq,post_message_preproced,post_message_notStopword
0,1,389c669730cb6c54314a46be785cea42,"THƒÇNG C·∫§P B·∫¨C H√ÄM ƒê·ªêI V·ªöI 2 C√ÅN B·ªò, CHI·∫æN S·ª∏ H...",19477,378,173,0,1032,58,thƒÉng c·∫•p_b·∫≠c h√†m c√°n_b·ªô chi·∫øn_s·ªπ hy_sinh ƒë√†_n...,thƒÉng c·∫•p_b·∫≠c h√†m ƒë·ªëi_v·ªõi c√°n_b·ªô chi·∫øn_s·ªπ hy_s...
1,3,b9f3394d2aff86d85974f5040c401f08,T∆Ø V·∫§N M√ôA THI: C√°ch n·ªôp h·ªì s∆° ƒë·ªÉ tr√∫ng tuy·ªÉn ...,48,5,19,0,2804,1,t∆∞_v·∫•n m√πa thi n·ªôp h·ªì_s∆° tr√∫ng_tuy·ªÉn ch∆∞∆°ng_tr...,t∆∞_v·∫•n m√πa thi c√°ch n·ªôp h·ªì_s∆° ƒë·ªÉ tr√∫ng_tuy·ªÉn c...
2,4,808e278b22ec6b96f2faf7447d10cd8e,C∆° quan C·∫°nh tranh v√† Th·ªã tr∆∞·ªùng Anh quy·∫øt ƒë·ªãn...,3,0,0,0,2044,64,c∆°_quan c·∫°nh_tranh th·ªã_tr∆∞·ªùng quy·∫øt_ƒë·ªãnh ƒëi·ªÅu_...,c∆°_quan c·∫°nh_tranh v√† th·ªã_tr∆∞·ªùng anh quy·∫øt_ƒë·ªãn...
3,5,f81bdd6d8be4c5f64bb664214e47aced,Th√™m 7 ca t·∫°i Qu·∫£ng Nam li√™n quan ƒë·∫øn h√†nh kh√°...,775,0,54,0,3575,1,ca qu·∫£ng_nam h√†nh_kh√°ch chuy·∫øn bay b·ªánh_nh√¢n n...,th√™m ca t·∫°i qu·∫£ng_nam li√™n_quan ƒë·∫øn h√†nh_kh√°ch...
4,6,ffc4b6bab27c40cfc48e4dc8b8a41e42,Trong gi·ªù h·ªçc Th·ªÉ d·ª•‚Äåc do th·∫ßy gi√°o Nguy·ªÖn VƒÉn...,2,1,0,0,3701,1,h·ªçc th·ªÉ d·ª• c th·∫ßy_gi√°o nguy·ªÖn_vƒÉn qu√¢n ph·ª• tr√°...,trong gi·ªù h·ªçc th·ªÉ d·ª• c do th·∫ßy_gi√°o nguy·ªÖn_vƒÉn...


In [15]:
# Split the data 80/20 into train and test sets.
# A dedicated seeded generator makes the split identical on every
# Restart & Run All without touching numpy's global random state.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

size = int(0.8*len(data))
ids_train = split_rng.choice(len(data), replace=False, size=size) # ids of train
train = data.iloc[ids_train]

# Boolean mask of the positions that were NOT sampled for training.
# np.isin replaces a per-element `x not in ids_train` scan of the array.
ids_test = ~np.isin(np.arange(len(data)), ids_train)
test = data.iloc[ids_test]

# Every row must land in exactly one of the two partitions.
assert len(train) + len(test) == len(data)

print('Size of train set:', len(train))
print('Size of test set:', len(test))

Size of train set: 4080
Size of test set: 1020


In [16]:
# Write both partitions to disk as UTF-8 CSV files without the index column.
for split_path, split_frame in (('train.csv', train), ('test.csv', test)):
    split_frame.to_csv(split_path, encoding='utf8', index=False)

In [17]:
# Reload the training split from disk. From here on `data` holds only the
# rows of train.csv, not the full dataset.
data = pd.read_csv('train.csv', encoding='utf8')

In [18]:
# Down-sample label 0 to about 1.1x the number of label-1 rows.
# A seeded generator keeps the sampled subset reproducible across runs.
DOWNSAMPLE_SEED = 42
downsample_rng = np.random.default_rng(DOWNSAMPLE_SEED)

# Row positions (not index labels) per class, because `iloc` is used below.
index_0 = list(np.flatnonzero((data.label == 0).to_numpy()))
index_1 = list(np.flatnonzero((data.label == 1).to_numpy()))

# Never request more label-0 rows than exist; sampling without replacement
# would otherwise raise ValueError.
n_keep = min(len(index_0), int(1.1*len(index_1)))
ids = list(downsample_rng.choice(index_0, size=n_keep, replace=False))

# Sampled label-0 positions first, followed by every label-1 position.
index = ids + index_1

data.iloc[index]

Unnamed: 0,id,user_name,post_message,num_like_post,num_comment_post,num_share_post,label,user_name_labelEncoder,user_name_freq,post_message_preproced,post_message_notStopword
261,502,7050444021058475008,"""Ghen C√¥ Vy"" l√† 1 d·ª± √°n s√°ng t·∫°o c·ªßa Vi·ªán S·ª©c ...",22000,1000,5500,0,1804,28,ghen vy d·ª±_√°n s√°ng_t·∫°o vi·ªán s·ª©c_kho·∫ª ngh·ªÅ_nghi...,ghen c√¥ vy l√† d·ª±_√°n s√°ng_t·∫°o c·ªßa vi·ªán s·ª©c_kho·∫ª...
1766,1613,ab603852d291d90dde0b1b1a2fd09a2a,"Hnay, H·ªôi ƒë·ªìng th·∫©m ph√°n s·∫Ω ra ph√°n quy·∫øt ƒë·ªëi ...",18,3,1,0,2638,1,hnay h·ªôi_ƒë·ªìng th·∫©m_ph√°n ph√°n_quy·∫øt t·ª≠_t√π h·ªì h·∫£...,hnay h·ªôi_ƒë·ªìng th·∫©m_ph√°n s·∫Ω ra ph√°n_quy·∫øt ƒë·ªëi_v...
220,913,6f0a57960e94a6489a552226c4bf05f1,"üß™üß™üß™ 23h ƒë√™m 19/4, CDC H√† N·ªôi th√¥ng b√°o:\n\nK·∫øt...",9289,50,184,0,1783,1,emoji ƒë√™m cdc h√†_n·ªôi th√¥ng_b√°o k·∫øt_qu·∫£ tr∆∞·ªùng_...,emoji ƒë√™m cdc h√†_n·ªôi th√¥ng_b√°o k·∫øt_qu·∫£ tr∆∞·ªùng_...
3422,1718,dba9802a1bc2c0c07b64c28dc8e99dba,H√† N·ªôi chi 114 t·ª∑ phun n∆∞·ªõc h·∫° nhi·ªát: Nhi·ªÅu qu...,0,0,0,0,3212,1,h√†_n·ªôi chi t·ª∑ phun h·∫° nhi·ªát qu·∫≠n_huy·ªán h·∫πn m√πa...,h√†_n·ªôi chi t·ª∑ phun n∆∞·ªõc h·∫° nhi·ªát nhi·ªÅu qu·∫≠n_hu...
3798,919,8cec85073924e94f5441a22aeebd68df,"ƒê·ªëi v·ªõi ng∆∞·ªùi d√πng c√° nh√¢n, t√†i kho·∫£n tr√™n C·ªïn...",6,0,3,0,2238,1,t√†i_kho·∫£n c·ªïng d·ªãch_v·ª• c√¥ng_qu·ªëc_gia ƒëƒÉng_k√Ω t...,ƒë·ªëi_v·ªõi ng∆∞·ªùi d√πng c√°_nh√¢n t√†i_kho·∫£n tr√™n c·ªïng...
...,...,...,...,...,...,...,...,...,...,...,...
4057,2906,353409ae5197e7441fe99c567b9f6851,Virus Corona v√† L√≤ng Ki√™u Ng·∫°o Trung C·ªông T·∫°i ...,3100,901,3200,1,985,1,virus corona ki√™u_ng·∫°o trung c·ªông virus corona...,virus corona v√† l√≤ng ki√™u_ng·∫°o trung c·ªông t·∫°i_...
4058,250,bb9d7a3eebe8a9e1d765c6ab01fae5c2,Ai m·ªõi l√† ng∆∞·ªùi ƒë√°ng th∆∞∆°ng? -----------------...,567,262,377,1,2833,1,th∆∞∆°ng ƒë·ªçc m·∫•y b√†i_ch·ªâ tr√≠ch d√¢n m·∫°ng t·ªôi ch·ª≠i...,ai m·ªõi l√† ng∆∞·ªùi ƒë√°ng th∆∞∆°ng ƒë·ªçc m·∫•y b√†i_ch·ªâ tr...
4060,3467,f97a7ba6cbfa436206142955a1a2b621,Coppy.\n C·ª® V·ªÆNG B·ªÄN M√Ä TI·∫æN T·ªöI!\n\n Th√°i Lan...,63,145,2,1,3599,1,copy v·ªØng_b·ªÅn ti·∫øn th√°i_lan s·∫£n_xu·∫•t tri·ªáu xe ...,copy c·ª© v·ªØng_b·ªÅn m√† ti·∫øn t·ªõi th√°i_lan m·ªôt nƒÉm ...
4061,2727,bd612a8c9a208033cd7344cba730800a,"Tin ch·ªù x√°c nh·∫≠n: ""Giao l∆∞u Qu·ªëc ph√≤ng"" bi√™n g...",0,0,0,1,2851,1,ch·ªù x√°c_nh·∫≠n giao_l∆∞u qu·ªëc_ph√≤ng bi√™n_gi·ªõi t√¢y...,tin ch·ªù x√°c_nh·∫≠n giao_l∆∞u qu·ªëc_ph√≤ng bi√™n_gi·ªõi...


In [19]:
# Save the rows selected by `index` (the down-sampled subset) as a UTF-8 CSV
# without the index column.
data.iloc[index].to_csv('data_balance.csv', encoding='utf-8', index=False)