In [None]:
import os
# Enumerate GPUs in PCI bus order and expose only device "1" to this process.
# Must run before any CUDA-using library initialises.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="1",
)
# Debugging aid, disabled by default:
# os.environ["CUDA_LAUNCH_BLOCKING"]="1"

In [None]:
# dataset of fact-checks including title and article body

In [None]:
import pandas as pd
# Load the fact-check dump and reshape it into (source_text, target_text)
# pairs: article title + body as input, the checked claim as target.
df1 = pd.read_csv("factchecks.csv")
df = (
    # Fixed: the original selected from `df`, which does not exist yet at this
    # point (NameError on a fresh kernel); the loaded frame is `df1`.
    # Building a new frame in one chain also keeps this cell idempotent.
    df1[['title_body', 'claim']]
    .rename(columns={'title_body': 'source_text', 'claim': 'target_text'})
    # Prepend the task prefix to every input text.
    .assign(source_text=lambda frame: "misinformation: " + frame['source_text'])
    # Cast last, as before: missing values end up as the literal string "nan".
    .astype(str)
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the pairs for evaluation; the fixed seed keeps the split
# identical across kernel restarts.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)
(train_df.shape, test_df.shape)

In [None]:
# T5 model training

In [None]:
import pandas as pd
import torch
import transformers
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Record the library version next to the run output.
print(transformers.__version__)

# Base checkpoint to fine-tune; the tokenizer is loaded from the same name.
model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Token budgets: inputs are cut at 512 tokens, target claims at 64.
max_input_length = 512
max_target_length = 64
# Extra string prepended to every input at tokenization time (none here).
prefix = ""

raw_datasets = Dataset.from_pandas(train_df)


def preprocess_function(examples):
    """Tokenize one batch of source/target pairs for seq2seq training.

    Args:
        examples: batch mapping with "source_text" and "target_text" lists.

    Returns:
        The tokenizer output for the sources, with an added "labels" entry
        holding the token ids of the targets.
    """
    prefixed_sources = [prefix + text for text in examples["source_text"]]
    batch = tokenizer(
        prefixed_sources,
        truncation=True,
        max_length=max_input_length,
    )

    # Targets are encoded inside the tokenizer's target-mode context.
    with tokenizer.as_target_tokenizer():
        target_batch = tokenizer(
            examples["target_text"],
            truncation=True,
            max_length=max_target_length,
        )

    batch["labels"] = target_batch["input_ids"]
    return batch


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)


In [None]:
# Hyperparameters for this run.
batch_size = 12 #32
learning_rate = 1e-4
num_train_epochs = 5

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# model = model.to("cuda:1")

# The output folder name encodes base model, learning rate and epoch count.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-lr_{learning_rate}_{num_train_epochs}ep",
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    bf16=True,
    # No evaluation during training; one checkpoint is written per epoch and
    # at most five are kept on disk.
    evaluation_strategy="no",
    save_strategy='epoch',
    save_total_limit=5,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
import gc

import torch

# `del model` on its own frees nothing: the Trainer and the data collator
# created in the training cell still reference the same model object. Drop
# every owner, run a collection pass, and only then ask the CUDA caching
# allocator to release its unused blocks.
del trainer, data_collator, model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# computing BLEU and ROUGE score

In [None]:
import glob
import os
import re


def list_checkpoints(model_dir):
    """Return the ``checkpoint-<step>`` folders inside ``model_dir``, earliest first.

    Defined in this cell so the section can be run on a fresh kernel.

    The step suffix is compared as an integer. Sorting it as text (as this
    cell used to) puts "checkpoint-1000" before "checkpoint-500", so the first
    entry was not necessarily the earliest checkpoint.

    Args:
        model_dir: training output folder to scan.

    Returns:
        List of checkpoint paths ordered by ascending step; entries whose name
        is not exactly ``checkpoint-<digits>`` are ignored. Empty if the
        folder does not exist or holds no checkpoints.
    """
    found = []
    for path in glob.glob(os.path.join(model_dir, "*")):
        match = re.fullmatch(r"checkpoint-(\d+)", os.path.basename(path))
        if match:
            found.append((int(match.group(1)), path))
    return [path for _, path in sorted(found)]


epochs = list_checkpoints('t5-base-finetuned-lr_0.0001_5ep')  # select model folder here
epochs

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer
import transformers
# Seed before loading so the sampling-based generation that follows is repeatable.
transformers.set_seed(0)

# Evaluate the first checkpoint in the list.
filepath = epochs[0]
print(filepath)

tokenizer = AutoTokenizer.from_pretrained(filepath)
# `.to()` returns the module itself, so loading and moving can be chained.
model = AutoModelForSeq2SeqLM.from_pretrained(filepath).to("cuda:0")
print("model loaded")

In [None]:
import transformers
transformers.set_seed(0)

# Rows of (source_text, target_text) from the held-out split.
to_predict = test_df.values
true = []
pred = []

for row in to_predict:
    text_input, reference = row[0], row[1]
    print("Real: ", reference)
    input_ids = tokenizer.encode(
        text_input,
        return_tensors="pt",
        add_special_tokens=True,
        max_length=512,
        # Explicit truncation: passing only `max_length` relies on a
        # backward-compatibility fallback in the tokenizer that logs a warning.
        truncation=True,
    )
    generated_ids = model.generate(
        # Follow the model's device instead of hard-coding "cuda:0".
        input_ids=input_ids.to(model.device),
        # Top-k / nucleus sampling, one sample per input.
        top_k=25,
        top_p=0.95,
        do_sample=True,
        num_beams=1,
        max_length=64,
        num_return_sequences=1,
    )
    p = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print("Predicted: ", p)
    print("----------------------------------------------------------------------------------")
    true.append(reference)
    pred.append(p)

In [None]:
from nltk.translate.bleu_score import sentence_bleu
# Per-example cumulative BLEU-1..4 between each prediction and its reference.
bleu1 = []
bleu2 = []
bleu3 = []
bleu4 = []

for p, t in zip(pred, true):
    # Case-insensitive, whitespace-tokenized comparison against one reference.
    reference = [t.lower().split()]
    candidate = p.lower().split()
    bleu1.append(sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
    bleu2.append(sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
    # Fixed: the weights were (0.33, 0.33, 0.33, 0), which sum to 0.99 and
    # slightly skew the geometric mean; BLEU-3 uses exact thirds.
    bleu3.append(sentence_bleu(reference, candidate, weights=(1 / 3, 1 / 3, 1 / 3, 0)))
    bleu4.append(sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)))


def Average(lst):
    """Return the arithmetic mean of ``lst``.

    Raises:
        ZeroDivisionError: if ``lst`` is empty.
    """
    return sum(lst) / len(lst)


print(Average(bleu1))
print(Average(bleu2))
print(Average(bleu3))
print(Average(bleu4))

In [None]:
from rouge import Rouge 

# Corpus-averaged ROUGE between predictions and references.
rouge = Rouge()
scores = rouge.get_scores(pred, true, avg=True)

# One column per ROUGE variant; keep only the 'f' (F-measure) row.
xx = pd.DataFrame(scores).loc['f'].values
for x in xx:
    print(x)

In [None]:
# computing selfBLEU 

In [None]:
from datasets import load_metric
import numpy as np
import copy
# BLEU scorer used by the self-BLEU computation; its `compute` method is later
# called with tokenized predictions/references and a `max_order`.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
metric = load_metric("bleu")

In [None]:
import glob
import os
import re


def list_checkpoints(model_dir):
    """Return the ``checkpoint-<step>`` folders inside ``model_dir``, earliest first.

    Defined in this cell so the section can be run on a fresh kernel.

    The step suffix is compared as an integer. Sorting it as text (as this
    cell used to) puts "checkpoint-1000" before "checkpoint-500", so the first
    entry was not necessarily the earliest checkpoint.

    Args:
        model_dir: training output folder to scan.

    Returns:
        List of checkpoint paths ordered by ascending step; entries whose name
        is not exactly ``checkpoint-<digits>`` are ignored. Empty if the
        folder does not exist or holds no checkpoints.
    """
    found = []
    for path in glob.glob(os.path.join(model_dir, "*")):
        match = re.fullmatch(r"checkpoint-(\d+)", os.path.basename(path))
        if match:
            found.append((int(match.group(1)), path))
    return [path for _, path in sorted(found)]


epochs = list_checkpoints('t5-base-finetuned-lr_0.0001_5ep')  # select model folder here
epochs

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer
import transformers
# Fixed seed so the sampled generations used for self-BLEU are repeatable.
transformers.set_seed(0)

# Use the first checkpoint in the list.
filepath = epochs[0]
print(filepath)

tokenizer = AutoTokenizer.from_pretrained(filepath)
# `.to()` returns the module itself, so loading and moving can be chained.
model = AutoModelForSeq2SeqLM.from_pretrained(filepath).to("cuda:0")
print("model loaded")

In [None]:
import transformers
transformers.set_seed(0)


def calculate_selfBleu(sentences, order):
    """Mean pairwise BLEU among the generations for a single input.

    Every sentence is scored as the hypothesis against each *other* sentence
    as its single reference, and all of those scores are averaged.

    Previously this function was re-defined on every loop iteration, deep-copied
    the list, and stored the (always ``None``) result of ``list.remove``; it
    is now defined once and excludes the current sentence by position.

    Args:
        sentences: list of generated strings for one input.
        order: maximum n-gram order handed to the BLEU metric.

    Returns:
        ``np.mean`` over all pairwise BLEU scores.
    """
    def get_bleu_score(sentence, remaining_sentences, order):
        # Case-insensitive, whitespace-tokenized comparison.
        preds = [sentence.lower().split()]
        lst = []
        for other in remaining_sentences:
            labels = [[other.lower().split()]]
            bleu = metric.compute(predictions=preds, references=labels, max_order=int(order))
            lst.append(bleu['bleu'])
        return lst

    bleu_scores = []
    for idx, sentence in enumerate(sentences):
        # All generations except the one at position idx.
        others = sentences[:idx] + sentences[idx + 1:]
        bleu_scores.append(get_bleu_score(sentence, others, order))
    return np.mean(bleu_scores)


# Rows of (source_text, target_text) from the held-out split.
to_predict = test_df.values
final_score1 = []
final_score2 = []
final_score3 = []

for i, row in enumerate(to_predict):
    print(i)
    text_input = row[0]
    input_ids = tokenizer.encode(
        text_input,
        return_tensors="pt",
        add_special_tokens=True,
        max_length=512,
        # Explicit truncation: passing only `max_length` relies on a
        # backward-compatibility fallback in the tokenizer that logs a warning.
        truncation=True,
    )
    generated_ids = model.generate(
        # Follow the model's device instead of hard-coding "cuda:0".
        input_ids=input_ids.to(model.device),
        top_k=25,
        top_p=0.95,
        do_sample=True,
        num_beams=1,
        max_length=64,
        # Five samples per input; diversity is measured among them.
        num_return_sequences=5,
    )
    p = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    final_score1.append(calculate_selfBleu(p, 1))
    final_score2.append(calculate_selfBleu(p, 2))
    final_score3.append(calculate_selfBleu(p, 3))

In [None]:
# Average self-BLEU over all test inputs, for n-gram orders 1, 2 and 3.
for order_scores in (final_score1, final_score2, final_score3):
    print(sum(order_scores) / len(order_scores))

In [None]:
# generating training data

In [None]:
import glob
import os
import re


def list_checkpoints(model_dir):
    """Return the ``checkpoint-<step>`` folders inside ``model_dir``, earliest first.

    Defined in this cell so the section can be run on a fresh kernel.

    The step suffix is compared as an integer. Sorting it as text (as this
    cell used to) puts "checkpoint-1000" before "checkpoint-500", so the first
    entry was not necessarily the earliest checkpoint.

    Args:
        model_dir: training output folder to scan.

    Returns:
        List of checkpoint paths ordered by ascending step; entries whose name
        is not exactly ``checkpoint-<digits>`` are ignored. Empty if the
        folder does not exist or holds no checkpoints.
    """
    found = []
    for path in glob.glob(os.path.join(model_dir, "*")):
        match = re.fullmatch(r"checkpoint-(\d+)", os.path.basename(path))
        if match:
            found.append((int(match.group(1)), path))
    return [path for _, path in sorted(found)]


epochs = list_checkpoints('t5-base-finetuned-lr_0.0001_5ep')  # select model folder here
epochs

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer
import transformers
# Fixed seed so the sampled training data can be regenerated.
transformers.set_seed(0)

# Generate with the first checkpoint in the list.
filepath = epochs[0]
print(filepath)

tokenizer = AutoTokenizer.from_pretrained(filepath)
# `.to()` returns the module itself, so loading and moving can be chained.
model = AutoModelForSeq2SeqLM.from_pretrained(filepath).to("cuda:0")
print("model loaded")

In [None]:
import pandas as pd

# Samples requested per input, and how many sampling attempts are made to get
# that many distinct outputs. The previous `while` loop broke out after its
# first failed attempt, so MAX_TRIES = 1 reproduces its behaviour exactly;
# raise it to actually retry. Must be >= 1.
NUM_SEQUENCES = 30
MAX_TRIES = 1

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
df_json = pd.concat([train_df, test_df]).to_dict("records")

for i, record in enumerate(df_json):
    print(i)
    t5input = record['source_text']

    # Tokenize once per input: the encoding does not change between retries.
    input_ids = tokenizer.encode(
        t5input,
        return_tensors="pt",
        add_special_tokens=True,
        max_length=512,
        # Explicit truncation: passing only `max_length` relies on a
        # backward-compatibility fallback in the tokenizer that logs a warning.
        truncation=True,
    ).to(model.device)

    for _attempt in range(MAX_TRIES):
        generated_ids = model.generate(
            input_ids=input_ids,
            top_k=25,
            top_p=0.95,
            do_sample=True,
            num_beams=1,
            max_length=64,
            num_return_sequences=NUM_SEQUENCES,
        )
        t5output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Accept the batch only if all samples differ, ignoring case.
        if len({t.lower() for t in t5output}) == NUM_SEQUENCES:
            break
        print("duplicate t5 output")
    else:
        # Every attempt contained duplicates; the last batch is kept anyway.
        print("Failed to generate unique")

    record['generatedT5Misinfo'] = t5output

In [None]:
import json

# Persist the augmented records as a single JSON array.
with open('T5Misinfo.json', 'w') as out_file:
    out_file.write(json.dumps(df_json))