In [31]:
import torch
from transformers import BertTokenizer, BertForPreTraining
from nltk.tokenize import sent_tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

In [32]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-80GB


In [67]:
#reference for fine tuning BERT: https://towardsdatascience.com/how-to-train-bert-aaad00533168
import json
file_paths = ['data/chq/train.jsonl', 'data/opi/train.jsonl', 'data/d2n/train.jsonl']

inputs_list = []
targets_list = []
for file_path in file_paths:
    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line)
            if file_path == "data/d2n/train.jsonl":
                preprocessed = data['inputs'].replace('[', '.')
                preprocessed = preprocessed.replace(']', ':')
                inputs_list.append(preprocessed)
            else:
                inputs_list.append(data['inputs'])
            targets_list.append(data['target'])
        
bag = [item for sentence in inputs_list for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [17]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in inputs_list:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        if random.random() >= 0.5:
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)
            

In [18]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')
inputs['next_sentence_label'] = torch.LongTensor([label]).T
inputs['labels'] = inputs.input_ids.detach().clone()

In [19]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [20]:
class BERTDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)
    
dataset = BERTDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=True)

In [21]:
from tqdm import tqdm

optim = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 10
model.cuda()
for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.69] 
Epoch 1: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.432]
Epoch 2: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.277]
Epoch 3: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.224]
Epoch 4: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.216]
Epoch 5: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.113] 
Epoch 6: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.043] 
Epoch 7: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.0629]
Epoch 8: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.0381]
Epoch 9: 100%|██████████| 37/37 [01:25<00:00,  2.30s/it, loss=0.0256]


In [22]:
torch.save(model, "model.pt")

In [33]:
model1 = torch.load("model.pt")

In [70]:
#reference for using BERT for summarization: https://blog.devgenius.io/bert-for-text-summarization-in-python-4c527179cd98
import json
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
model1.cuda()

file_paths = ['data/chq/test.jsonl', 'data/opi/test.jsonl', 'data/d2n/test.jsonl']

inputs_list = []
targets_list = []
for file_path in file_paths:
    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line)
            if file_path == "data/d2n/train.jsonl":
                preprocessed = data['inputs'].replace('[', '.')
                preprocessed = preprocessed.replace(']', ':')
                inputs_list.append(preprocessed)
            else:
                inputs_list.append(data['inputs'])
            targets_list.append(data['target'])

num = 0
summaries = []
for input in inputs_list:
    sentences = sent_tokenize(input)
    tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=512) for sent in sentences]
    max_len = 0
    for i in tokenized_sentences:
        if len(i) > max_len:
            max_len = len(i)

    padded_sentences = []
    for i in tokenized_sentences:
        while len(i) < max_len:
            i.append(0)
        padded_sentences.append(i)

    input_ids = torch.tensor(padded_sentences).to('cuda')
    with torch.no_grad():
        last_hidden_states = model1(input_ids)[0]

    sentence_embeddings = []
    for i in range(len(sentences)):
        sentence_embeddings.append(torch.mean(last_hidden_states[i], dim=0).cpu().numpy())
    similarity_matrix = cosine_similarity(sentence_embeddings)

    num_sentences = 2
    summary_sentences = []
    for i in range(min(num_sentences, len(sentences))):
        sentence_scores = list(enumerate(similarity_matrix[i]))
        sentence_scores = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
        if len(sentences) == 1:
            summary_sentences.append(sentences[0])
        else:
            summary_sentences.append(sentences[sentence_scores[1][0]])

    summary = ' '.join(summary_sentences)
    summaries.append(summary)
    num = num+1

[nltk_data] Downloading package punkt to
[nltk_data]     /home/jordan.sabo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [71]:
from bert_score import BERTScorer
scorer = BERTScorer(model_type='bert-base-uncased')
P, R, F1 = scorer.score(inputs_list, summaries)
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

BERTScore Precision: 0.6953, Recall: 0.8946, F1: 0.7751


In [74]:
from nltk.translate.bleu_score import sentence_bleu

all_bleu_scores = []
for references, candidates in zip(inputs_list, summaries):
    bleu_score = sentence_bleu([references.split()], candidates.split())
    all_bleu_scores.append(bleu_score)

average_bleu_score = sum(all_bleu_scores) / len(all_bleu_scores)
print(f'Average BLEU Score: {average_bleu_score:.4f}')


Average BLEU Score: 0.3599


In [75]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

all_rouge_scores = []
for references, candidates in zip(inputs_list, summaries):
    scores = scorer.score(references, candidates)
    all_rouge_scores.append(scores["rouge1"])

total_recall = sum(score.recall for score in all_rouge_scores) / len(all_rouge_scores)
total_precision = sum(score.precision for score in all_rouge_scores) / len(all_rouge_scores)
total_fmeasure = sum(score.fmeasure for score in all_rouge_scores) / len(all_rouge_scores)

print(f"Average Recall: {total_recall}")
print(f"Average Precision: {total_precision}")
print(f"Average F4: {total_fmeasure}")

Average Recall: 0.4883396562691781
Average Precision: 0.9600702228898178
Average F4: 0.591478932954273
