In [None]:
!pip install transformers
!pip install fastBPE
!pip install fairseq
!pip install vncorenlp

In [2]:
# import thư viện cần thiết
import numpy as np
import torch
import os
import re
import torch
import csv
import pandas as pd
import json
import pickle
import argparse
import random
from nltk.tokenize import TweetTokenizer
from vncorenlp import VnCoreNLP
from tqdm import tqdm
tqdm.pandas()
from tqdm import tqdm_notebook
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from transformers import *
from transformers.modeling_utils import * 
from transformers import RobertaConfig, RobertaForSequenceClassification, AdamW
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Preprocessing

In [3]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [4]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
def save_checkpoint(model, tokenizer, checkpoint_path, epoch='best'):
    torch.save(model.state_dict(), os.path.join(checkpoint_path, f'model_{epoch}.bin'))
    # save configurations
    model.config.to_json_file(os.path.join(checkpoint_path, 'config.json'))
    # save exact vocabulary utilized
    tokenizer.save_vocabulary(checkpoint_path)

In [6]:
def isnan(s):
    return s != s

In [7]:
def normalizePost(post, tweet_tokenizer, vncorenlp, use_segment=False, remove_punc_stopword=False):
    tokens = tweet_tokenizer.tokenize(post.replace("’", "'").replace("…", "..."))
    post = " ".join(tokens)
    if use_segment:
        tokens = vncorenlp.tokenize(post.replace("’", "'").replace("…", "..."))
        tokens = [t for ts in tokens for t in ts]
    normPost = " ".join(tokens)

    normPost = re.sub(r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2", normPost)
    normPost = re.sub(r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2", normPost)
    normPost = re.sub(r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2", normPost)
    if use_segment:
        normPost = normPost.replace('< url >', '<url>')
        normPost = re.sub(r"# (\w+)", r'#\1', normPost)

    return normPost

In [8]:
# convert from string to tokens
def convert_lines(sents, vocab, bpe, max_sequence_length):
    output_ids = []
    for sent in sents:
        subwords = '<s> ' + bpe.encode(sent) + ' </s>'
        encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
        output_ids.append(encoded_sent)
    
    output_ids = pad_sequences(output_ids, maxlen=max_sequence_length, dtype="long", value=0, truncating="post", padding="post")

    return output_ids

In [9]:
# tạo một mask gồm các giá trị 0, 1 để làm đầu vào cho thư viện transformers, 
# mask này cho biết các giá trị nào của chuỗi đã được padding
def creat_mask(ids):
  output_masks = []
  for sent in ids:
    mask = [int(token_id > 0) for token_id in sent]
    output_masks.append(mask)
  
  return output_masks

In [10]:
# Load data đưa vào model
def dataloader(ids, labels, masks, batch_size = 16):
  inputs = torch.tensor(ids)
  labels = torch.tensor(labels)
  masks = torch.tensor(masks)

  data = TensorDataset(inputs, masks, labels)
  sampler = SequentialSampler(data)
  dataloader = DataLoader(data, sampler=sampler,batch_size=batch_size)

  return dataloader

In [11]:
train_df = pd.read_csv('/content/drive/MyDrive/NCKH/Fake News Detection/datasets/public_train.csv')

In [12]:
train_df

Unnamed: 0,id,user_name,post_message,timestamp_post,num_like_post,num_comment_post,num_share_post,label
0,1,389c669730cb6c54314a46be785cea42,"THĂNG CẤP BẬC HÀM ĐỐI VỚI 2 CÁN BỘ, CHIẾN SỸ H...",1585945439,19477,378,173.0,0
1,2,775baa6d037b6d359b229a656eaeaf08,<URL>,1588939166.0,11,5,3,0
2,3,b9f3394d2aff86d85974f5040c401f08,TƯ VẤN MÙA THI: Cách nộp hồ sơ để trúng tuyển ...,1591405213,48,5,19.0,0
3,4,808e278b22ec6b96f2faf7447d10cd8e,Cơ quan Cạnh tranh và Thị trường Anh quyết địn...,1592023613,3,0,0.0,0
4,5,f81bdd6d8be4c5f64bb664214e47aced,Thêm 7 ca tại Quảng Nam liên quan đến hành khá...,1583737358,775,0,54.0,0
...,...,...,...,...,...,...,...,...
4367,4368,20933f35ef5d22b4d8193cc269c8ff1e,BÀ MẸ VIỆT NAM ANH HÙNG 95 TUỔI MAY KHẨU TRANG...,1584795126.0,5800,1300,12000,0
4368,4369,a117312f796a22e364b8e241b8cb91eb,Nguồn cung khan kiếm nhưng nhu cầu cao tạo áp ...,1590645643,21,1,,0
4369,4370,547ba1b4f95ec07f2cdada24a6eec693,Lời cảnh tỉnh cho các thanh niên dân TỔ...tốc ...,1589774421.0,3,1,,1
4370,4371,acb4a36d6247a0c89dac880725b2b3a0,Đến bây giờ mới biết chỉ cần học lái xe hạng B...,1589551407.0,144,38,87,1


In [13]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(torch.cuda.get_device_name())
else:
    device = torch.device('cpu')

Tesla P100-PCIE-16GB


In [14]:
parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="/content/drive/MyDrive/NCKH/Fake News Detection/PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("/content/drive/MyDrive/NCKH/Fake News Detection/PhoBERT_base_transformers/dict.txt")

In [15]:
from vncorenlp import VnCoreNLP

# init external tools
vncorenlp = VnCoreNLP('/content/drive/MyDrive/NCKH/Fake News Detection/vncorenlp/VnCoreNLP-1.1.1.jar', annotators='wseg')
tweet_tokenizer = TweetTokenizer()

In [16]:
# process training set
error_label_idx = []
tr_texts = []
for i, post in enumerate(train_df.post_message):
    if not isnan(post):
        tr_texts.append(normalizePost(post, tweet_tokenizer, vncorenlp, use_segment=True,
                                      remove_punc_stopword=True))
    else:
        error_label_idx.append(i)

In [17]:
tr_labels = train_df.iloc[~train_df.index.isin(error_label_idx)].label.to_list()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(tr_texts, tr_labels, test_size=0.1)

In [20]:
MAX_SEQUENCE_LENGTH = 256
X_train_ids = convert_lines(X_train, vocab, bpe, MAX_SEQUENCE_LENGTH)
print('Shape of X_train: {}'.format(X_train_ids.shape))

Shape of X_train: (3933, 256)


In [21]:
X_train_ids[1]

array([    0,   321,   326,  5326,   166,    76,     4,   184,   331,
           8,    41,   203,    16,  1150,  2086,  2591,     6,  1082,
         525, 25877,    20, 50456,    19,   549,     5,  6593,     4,
         567,  2000,  2529,   525,    20,   479,  4932,     6,  4854,
          19,    62,   370,   583,    30,   525,   109,    12,   764,
           2,     2,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## Model

In [22]:
# Load model
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW

config = RobertaConfig.from_pretrained(
    "/content/drive/MyDrive/NCKH/Fake News Detection/PhoBERT_base_transformers/config.json", from_tf=False, num_labels = 2, output_hidden_states=False,
)
BERT_FND = RobertaForSequenceClassification.from_pretrained(
    "/content/drive/My Drive/NCKH/Fake News Detection/PhoBERT_base_transformers/model.bin",
    config=config
)

BERT_FND.cuda()
print('Done')

Some weights of the model checkpoint at /content/drive/My Drive/NCKH/Fake News Detection/PhoBERT_base_transformers/model.bin were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the mo

Done


In [23]:
# Evaluate
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    F1_score = f1_score(pred_flat, labels_flat, average='macro')
    
    return accuracy_score(pred_flat, labels_flat), F1_score

In [24]:
# Creating optimizer and lr schedulers
device = 'cuda'
epochs = 10
batch_size = 16
accumulation_steps = 5

param_optimizer = list(BERT_FND.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)
num_train_optimization_steps = int(epochs*len(X_train_ids)/batch_size/accumulation_steps)

#Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, 
#after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
#Create a schedule with a constant learning rate, using the learning rate set in optimizer
scheduler0 = get_constant_schedule(optimizer)  # PyTorch scheduler

In [27]:
# path to save the weights
ckpt_path = '/content/drive/MyDrive/NCKH/Fake News Detection/weights'
if not os.path.exists(ckpt_path):
    os.mkdir(ckpt_path)

In [28]:
# Use K-fold cross-validation
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(X_train_ids, y_train))

## Training

In [29]:
for fold, (train_idx, val_idx) in enumerate(splits):
    print("Training for fold {}".format(fold))
    best_score = 0
   
    tq = tqdm(range(fold))
    
    for epoch in tq:
        train_ids, train_labels = np.array(X_train_ids)[train_idx], np.array(y_train)[train_idx]
        val_ids, val_labels = np.array(X_train_ids)[val_idx], np.array(y_train)[val_idx]
        train_masks = creat_mask(train_ids)
        val_masks = creat_mask(val_ids)
        train_dataloader = dataloader(train_ids, train_labels, train_masks)
        val_dataloader = dataloader(val_ids, val_labels, val_masks)

        print('Training...')

        total_loss = 0
        BERT_FND.train()
        train_accuracy = 0
        nb_train_steps = 0
        train_f1 = 0
        
        for step, batch in tqdm_notebook(enumerate(train_dataloader)):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            BERT_FND.zero_grad()
            outputs = BERT_FND(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask, 
                              labels=b_labels)
            loss = outputs[0]
            total_loss += loss.item()
            
            logits = outputs[1].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
            train_accuracy += tmp_train_accuracy
            train_f1 += tmp_train_f1
            nb_train_steps += 1
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(BERT_FND.parameters(), 1.0)
            optimizer.step()
            
        avg_train_loss = total_loss / len(train_dataloader)
        print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
        print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
        print(" Average training loss: {0:.4f}".format(avg_train_loss))

        print("Validation...")
        BERT_FND.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        eval_f1 = 0
        for batch in tqdm_notebook(val_dataloader):

            batch = tuple(t.to(device) for t in batch)

            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                outputs = BERT_FND(b_input_ids, 
                                  token_type_ids=None, 
                                  attention_mask=b_input_mask)
                logits = outputs[0]
                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

                eval_accuracy += tmp_eval_accuracy
                eval_f1 += tmp_eval_f1
                nb_eval_steps += 1
        print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
        print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))
        
        score = eval_f1/nb_eval_steps
        if score >= best_score:
            torch.save(BERT_FND.state_dict(),os.path.join(ckpt_path, f"model_{fold}.bin"))
            best_score = score

0it [00:00, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Training for fold 0
Training for fold 1
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.8782
 F1 score: 0.7072
 Average training loss: 0.3041
Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


 Accuracy: 0.8788
 F1 score: 0.6689


100%|██████████| 1/1 [01:33<00:00, 93.09s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

Training for fold 2
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9204
 F1 score: 0.8271
 Average training loss: 0.2233
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


 Accuracy: 0.8988
 F1 score: 0.7407


 50%|█████     | 1/2 [01:32<01:32, 92.79s/it]

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9518
 F1 score: 0.8995
 Average training loss: 0.1593
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


 Accuracy: 0.9437
 F1 score: 0.8903


100%|██████████| 2/2 [03:04<00:00, 92.50s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

Training for fold 3
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9597
 F1 score: 0.9176
 Average training loss: 0.1422
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


 Accuracy: 0.9650
 F1 score: 0.8989


 33%|███▎      | 1/3 [01:32<03:05, 92.98s/it]

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9759
 F1 score: 0.9525
 Average training loss: 0.1011
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


 Accuracy: 0.9762
 F1 score: 0.9390


 67%|██████▋   | 2/3 [03:06<01:33, 93.05s/it]

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9829
 F1 score: 0.9685
 Average training loss: 0.0721
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

100%|██████████| 3/3 [04:35<00:00, 91.86s/it]
  0%|          | 0/4 [00:00<?, ?it/s]


 Accuracy: 0.9625
 F1 score: 0.9003
Training for fold 4
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9854
 F1 score: 0.9596
 Average training loss: 0.0609
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


 Accuracy: 0.9738
 F1 score: 0.9512


 25%|██▌       | 1/4 [01:42<05:07, 102.46s/it]

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9902
 F1 score: 0.9755
 Average training loss: 0.0448
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


 Accuracy: 0.9862
 F1 score: 0.9769


 50%|█████     | 2/4 [03:15<03:19, 99.55s/it] 

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9921
 F1 score: 0.9784
 Average training loss: 0.0351
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

 75%|███████▌  | 3/4 [04:44<01:36, 96.48s/it]


 Accuracy: 0.9688
 F1 score: 0.9455
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9940
 F1 score: 0.9808
 Average training loss: 0.0232
Validation...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

100%|██████████| 4/4 [06:13<00:00, 93.42s/it]


 Accuracy: 0.9825
 F1 score: 0.9673





## Test

In [30]:
test_ids = convert_lines(X_test, vocab, bpe, MAX_SEQUENCE_LENGTH)
test_masks = creat_mask(test_ids)
test_dataloader = dataloader(test_ids, y_test, test_masks)

In [31]:
predictions , true_labels = [], []
eval_accuracy = 0
eval_f1 = 0
nb_eval_steps = 0
# preds = []
for batch in test_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
    outputs = BERT_FND(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
  
  logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

  predictions.append(logits)
  true_labels.append(label_ids)
  eval_accuracy += tmp_eval_accuracy
  eval_f1 += tmp_eval_f1
  nb_eval_steps += 1
print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))

 Accuracy: 0.9115
 F1 score: 0.7991


In [32]:
predict = np.argmax(predictions[1], axis=1).flatten()

In [33]:
predict

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

In [34]:
true_labels[1]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1])