In [2]:
from dataset import dataset
from datasets import Dataset # converting df to transformer dataset
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, ConfusionMatrixDisplay
import numpy as np
import os
import torch
import warnings
# Silence every Python warning raised from here on.
warnings.filterwarnings(action="ignore")
# Configure the cuBLAS workspace through the environment.
# NOTE(review): presumably required for deterministic cuBLAS kernels — confirm it is still needed.
os.environ.update({"CUBLAS_WORKSPACE_CONFIG": ":4096:8"})




In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Integer form of the same choice, handed to pipeline(device=...) below.
device_id = -1 if device.type != 'cuda' else 0

# Dataset roots, relative to the notebook's working directory.
FNC_PATH = "../../milestone 2/baseline/dataset/FNC-1"
LIAR_PATH = "../../milestone 2/baseline/dataset/LIAR/"

ds = dataset(
    FNC_PATH=FNC_PATH,
    LIAR_PATH=LIAR_PATH,
    word2vec=False,
)
# Only the LIAR splits are requested here.
train_df, val_df, test_df = ds(dataset="LIAR", all=True)

FNC
	train      has 39978 rows and shapes (39978, 5).
	validation has  4997 rows and shapes (4997, 5).
	test       has  4997 rows and shapes (4997, 5).
LIAR
	train      has 10240 rows and shapes (10240, 14).
	validation has  1284 rows and shapes (1284, 14).
	test       has  1267 rows and shapes (1267, 14).


In [4]:
# Fill-mask pipeline on bert-base-cased, used for the prompt-based baseline.
bert_model = pipeline(
    task='fill-mask',
    model='bert-base-cased',
    device=device_id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Helpers for the prompt-based (fill-mask) baseline.
def get_prediction(bert_respond)->int:
    """Turn one fill-mask response into a binary label.

    Args:
        bert_respond: iterable of candidate dicts, each with a 'score' and a
            'token_str' entry.

    Returns:
        1 when the highest-scoring candidate's token is exactly "real",
        otherwise 0.
    """
    ranked = list(bert_respond)
    # Ascending, stable sort: the best-scoring candidate ends up last.
    ranked.sort(key=lambda candidate: candidate['score'])
    best = ranked[-1]
    if best['token_str'] == "real":
        return 1
    return 0

In [6]:
# Inputs are the 'statement' column; targets are the 'label' column cast to an int array.
train_X, train_y = train_df['statement'], train_df['label'].astype(int).to_numpy()
val_X, val_y = val_df['statement'], val_df['label'].astype(int).to_numpy()
test_X, test_y = test_df['statement'], test_df['label'].astype(int).to_numpy()

# Tokenizer for the bert-base-cased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [7]:
def filter_long_sample(X_df, y_df, print_head = "", max_length = 512, tokenize = None):
    """Drop samples whose tokenized text is longer than ``max_length`` tokens.

    Args:
        X_df: pandas Series of raw text samples.
        y_df: labels aligned positionally with ``X_df`` (e.g. a NumPy array).
        print_head: prefix for the printed summary line.
        max_length: largest number of input ids a sample may have and still
            be kept. Defaults to 512, the limit that was previously hard-coded.
        tokenize: callable mapping a text to a mapping with an 'input_ids'
            entry. Defaults to the module-level ``tokenizer``.

    Returns:
        Tuple ``(X_df, y_df)`` restricted to the samples that fit.
    """
    if tokenize is None:
        # Fall back to the notebook-level tokenizer, as the original code did.
        tokenize = tokenizer

    # Build the mask as a real boolean array so `~` is a logical NOT even for
    # empty input (an object-dtype Series would not guarantee that).
    too_long = np.fromiter(
        (len(tokenize(text)['input_ids']) > max_length for text in X_df),
        dtype=bool,
        count=len(X_df),
    )
    keep = ~too_long

    print(f"{print_head} filtered {too_long.sum()} sample(s), remain {keep.sum()} samples.")
    return X_df[keep], y_df[keep]

# Remove over-length statements from each split; the helper prints a per-split summary.
train_X, train_y = filter_long_sample(X_df=train_X, y_df=train_y, print_head="Train dataset: ")
val_X, val_y = filter_long_sample(X_df=val_X, y_df=val_y, print_head="Val dataset: ")
test_X, test_y = filter_long_sample(X_df=test_X, y_df=test_y, print_head="Test dataset: ")

Token indices sequence length is longer than the specified maximum sequence length for this model (768 > 512). Running this sequence through the model will result in indexing errors


Train dataset:  filtered 1 sample(s), remain 10239 samples.
Val dataset:  filtered 0 sample(s), remain 1284 samples.
Test dataset:  filtered 1 sample(s), remain 1266 samples.


In [8]:
# Prompt-based baseline without any training: append a cloze sentence to each
# test statement and let the fill-mask pipeline score "real" against "fake".
# NOTE(review): the appended prompt adds tokens on top of the statement length
# that filter_long_sample checked — confirm near-limit statements still fit.
result = test_X.apply(
    lambda statement: bert_model(
        f"{statement} This is a [MASK] news.",
        targets=["real", "fake"],
    )
)
# One 0/1 label per test statement.
prediction = result.apply(get_prediction)

In [9]:
# Report accuracy and F1 of the prompt-based baseline on the test split.
predicted_labels = prediction.values
print(f"Basic Bert Result: ")
print(f"\tAccuracy: {accuracy_score(test_y, predicted_labels):.4f}")
print(f"\tF1 score: {f1_score(test_y, predicted_labels):.4f}")

Basic Bert Result: 
	Accuracy: 0.4115
	F1 score: 0.5178


In [10]:
# Fine-tuning path: bert-base-cased with a two-label sequence-classification head.
classification_head_bert = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# create dataset
def tokenizer_function(row):
    """Tokenize the "text" field of one dataset row.

    The text is truncated and padded to ``tokenizer.model_max_length`` using
    the module-level ``tokenizer``.

    Args:
        row: mapping with a "text" entry.

    Returns:
        The tokenizer's output for that text.
    """
    return tokenizer(
        row["text"],
        max_length=tokenizer.model_max_length,
        truncation=True,
        padding="max_length",
    )

# Wrap the train and validation splits as Dataset objects and tokenize every row.
data = {"text": train_X, "label": train_y}
train_dataset = Dataset.from_dict(data).map(tokenizer_function)

data = {"text": val_X, "label": val_y}
val_dataset = Dataset.from_dict(data).map(tokenizer_function)

Map:   0%|          | 0/10239 [00:00<?, ? examples/s]

Map: 100%|██████████| 10239/10239 [00:11<00:00, 904.15 examples/s]
Map: 100%|██████████| 1284/1284 [00:01<00:00, 914.24 examples/s]


In [12]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a (scores, labels) pair.

    Args:
        eval_pred: pair of per-class scores (one row per sample) and the
            reference labels.

    Returns:
        Dict with keys "accuracy" and "f1 score".
    """
    scores, labels = eval_pred
    # The predicted class is the column with the highest score in each row.
    predicted = np.argmax(scores, axis = 1)
    return {
        "accuracy": np.mean(predicted == labels),
        "f1 score": f1_score(labels, predicted),
    }

In [13]:
# Fine-tuning configuration: evaluation and logging both run once per epoch.
training_args = TrainingArguments(
    # where outputs are written
    output_dir="training",
    # optimisation schedule
    num_train_epochs=3.0,
    learning_rate=5e-05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # reporting cadence
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    full_determinism=False,
)

# Trains on train_dataset, evaluates on val_dataset and scores with compute_metrics.
trainer = Trainer(
    args=training_args,
    model=classification_head_bert,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [14]:

# Run fine-tuning; with evaluation_strategy="epoch" the trainer evaluates on
# val_dataset after every epoch. The call is left as the cell's last expression
# so its return value is displayed.
trainer.train()

  0%|          | 1/3840 [06:48<436:02:08, 408.89s/it]