In [27]:
# libraries
import re
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt
from datasets import ClassLabel, Sequence
import random
from IPython.display import display, HTML
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import EncoderDecoderModel
from fastbm25 import fastbm25
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset

In [2]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# load data (context, question, answer)
- train_data
- valid_data
- test_data

In [3]:
TRAIN_PATH = 'data/train.txt'
DEV_PATH = 'data/val.txt'
TEST_PATH = 'data/test.txt'
TEST_ANSWER_PATH = 'data/Assignment2_test_answer.txt'

In [4]:
def read_data_from_txt(path):
    QandA = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            #print(line)
            if line != "\n":
                splitted = line.split("|||")
                sentences = splitted[0]
                question  = r" ".join(splitted[1].split())
                answer    = re.sub("\n","",splitted[2])
                answer = r" ".join(answer.split())
                QandA.append((sentences, question, answer))
    return QandA

def read_answer_data(path):
    QandA = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            #print(line)
            if line != "\n":
                splitted = line.split("|||")
                question = r" ".join(splitted[0].split())
                answer  = re.sub("\n","",splitted[-1])
                answer = r" ".join(answer.split())
                QandA.append((question, answer))
    return QandA

def correct_test_answer(data, answer):
    QandA = []
    for origin, correct in zip(data, answer):
        sentence = origin[0]
        question = origin[1]
        answer   = correct[1]
        QandA.append((sentence, question, answer))
    return  QandA   

In [5]:
train_data  = read_data_from_txt(TRAIN_PATH)
#del train_data[51641] #51641報錯
#valid_data  = read_data_from_txt(DEV_PATH)
# test_data
test_data   = read_data_from_txt(TEST_PATH)
test_answer = read_answer_data(TEST_ANSWER_PATH)
test_data   = correct_test_answer(test_data, test_answer)

99820it [00:01, 64350.52it/s]
27248it [00:00, 62188.18it/s]
27248it [00:00, 233364.23it/s]


In [6]:
test_data = test_data[:100]
len(test_data)

100

# BM25 retrival
- retrieve top 5

In [7]:
# test example

for sentences, question, answer in train_data[121:122]:
    contained_answer_sentence = []
    sentences = re.findall(r'<s>(.*?)</s>', sentences)
    tokenized_corpus = [doc.lower().split(" ") for doc in sentences]
    model = fastbm25(tokenized_corpus)
    answer1 = question+" "+answer
    tokenized_answer = answer1.lower().split(" ")
    doc = model.top_k_sentence(tokenized_answer,k=3)
    final_doc = []
    for list_doc, _,score in doc:
        final_doc.append(" ".join(list_doc))
    context = ". ".join(final_doc)

    print(context)
    print(len(doc[0]))
    for i in final_doc:
        print(bcolors.HEADER+i+bcolors.ENDC)
        print()
    print(bcolors.BOLD+question+bcolors.BOLD)
    print(bcolors.OKBLUE+answer+bcolors.UNDERLINE)
    print()


 lee , robert e ca 18061870 encyclopedia virginia robert e lee confederate general american civil war 1861 1865 led army northern virginia june 1862 repulsed george b mcclellan 's army confederate capital next day , lee determined attack northern forces , despite misgivings .  peninsula campaign encyclopedia virginia george b mcclellan , joseph e johnston , robert e lee general chief george b mcclellan capture confederate capital richmond , seven days' campaign , driving peninsula saving richmond june 25 , 1862 union general george b mcclellan 's forces advance oak grove .  timeline robert e lee american experience wgbh pbs tarnished reputation great father haunt robert e lee rest confederate forces attack fort sumter south carolina ignite civil war mcclellan library congress mcclellan 's officers early june 1862 lee 's tactical decisions antietam save army thousands men , even 
3
[95m lee , robert e ca 18061870 encyclopedia virginia robert e lee confederate general american civil war

In [8]:
def get_topk_bm25_sentence(data, k):
    contexts = []
    for sentences, question, answer in tqdm(data):
        contained_answer_sentence = []
        sentences = re.findall(r'<s>(.*?)</s>', sentences)
        tokenized_corpus = [doc.lower().split(" ") for doc in sentences]
        model = fastbm25(tokenized_corpus)
        answer1 = question+" "+answer
        tokenized_answer = answer1.lower().split(" ")
        doc = model.top_k_sentence(tokenized_answer,k=k)
        final_doc = []
        for list_doc, _,score in doc:
            final_doc.append(" ".join(list_doc))
        context = ". ".join(final_doc)
        contexts.append(context)
    return contexts

#train_contexts = get_topk_bm25_sentence(train_data, 5)
#valid_contexts = get_topk_bm25_sentence(valid_data, 5)
test_contexts  = get_topk_bm25_sentence(test_data,  5)

100%|████████████████████████████████████████| 100/100 [00:00<00:00, 310.18it/s]


# convert to dataframe and to dataset

In [9]:
def convert_to_dataframe(data, contexts):
    ques = []
    ans= []
    for sentences, question, answer in data:
        ques.append(question)
        ans.append(answer)
    return pd.DataFrame({'context':contexts,'question':ques,'answer':ans})
#train_data = convert_to_dataframe(train_data, train_contexts)
#valid_data = convert_to_dataframe(valid_data, valid_contexts)
test_data =  convert_to_dataframe(test_data,  test_contexts)

In [10]:
#train_dataset = Dataset(pa.Table.from_pandas(train_data)).select(range(49000))
#valid_dataset = Dataset(pa.Table.from_pandas(valid_data)).select(range(7000))
test_dataset  = Dataset(pa.Table.from_pandas(test_data))

In [11]:
def show_random_elements(dataset, num_examples=1):
    df = dataset.sample(n = num_examples)
    display(HTML(df.to_html()))
    
show_random_elements(test_data)

Unnamed: 0,context,question,answer
18,"chp 6 1 atoms atom 's mass 99 95 contained tiny dot called nucleus situated center atom volume nucleus 1 part . student 4 page 1 pdf , 74kb nzqa nucleus small holds 99 95 atomic mass fission? fission nucleus atom broken smaller parts neutron . atom nuclear core 100 , 000x smaller , 99 95 mass protons atom poster compliments u national science foundation , 99 95 mass protons neutrons nucleus deuteron 1 proton 1 neutron . week 2 chapter 4 atoms , molecules ions type atom different properties , one mass mass nucleus dense , 99 95 mass atom substances discuss next section masses molecules . unit 2 carleton university atomic mass unit amu exactly 1 12 mass common kind resulting heavy iron core produces magnetic field silicates major part neutron , virtually entire mass 99 95 atom lies nucleus",99 95 mass atom part,nucleus


# Tokenized

In [12]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [13]:
encoder_max_length=512
decoder_max_length=128

def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    inputs = tokenizer(batch["question"], batch["context"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["answer"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()

    # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
    # We have to make sure that the PAD token is ignored
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

In [14]:
# batch_size = 16
batch_size = 1
'''
train_dataset = train_dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size
)

valid_dataset = valid_dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size
)
'''
test_dataset = test_dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size,
    remove_columns=["context", "question", "answer"]
)

  0%|          | 0/100 [00:00<?, ?ba/s]

In [15]:
#train_dataset.set_format(type="torch")
#valid_dataset.set_format(type="torch")
test_dataset.set_format(type="torch", columns=[ "input_ids","attention_mask", "labels"])

In [16]:
#train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model

In [28]:
bert2bert = EncoderDecoderModel.from_pretrained("bert2bert")

In [18]:
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.eos_token_id = tokenizer.sep_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id
bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size

In [19]:
bert2bert.config.max_length = 10
bert2bert.config.min_length = 1
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

In [20]:
# Set GPU / CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Put model on device
optim = torch.optim.AdamW(bert2bert.parameters(), lr=1e-4)
training_epoch = 5
loss_fct = CrossEntropyLoss()

In [29]:
bert2bert.load_state_dict(torch.load('model/QA_model_generate'))
bert2bert = bert2bert.to(device)

In [30]:
param_size = 0
for param in bert2bert.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in bert2bert.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 943.632MB


In [22]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mon Jan  2 23:18:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   32C    P2    51W / 198W |   1575MiB /  8119MiB |     85%      Default |
|                               |            

# Fine-tuining

In [23]:
'''
def evaluate(valid_loader):
    bert2bert.eval()
    running_loss = 0.0

    with torch.no_grad():
        loop = tqdm(valid_loader, leave=True)
        for batch_id, batch in enumerate(loop):
            input_ids = batch['input_ids'].to(device)
            labels    = batch['labels'].to(device)
            # model output
            output = bert2bert(input_ids=input_ids,labels = labels)
            loss   = output.loss
            running_loss += loss.item()
            
        print('Validation Loss {:.4f}'.format(running_loss / len(valid_loader)))
'''

"\ndef evaluate(valid_loader):\n    bert2bert.eval()\n    running_loss = 0.0\n\n    with torch.no_grad():\n        loop = tqdm(valid_loader, leave=True)\n        for batch_id, batch in enumerate(loop):\n            input_ids = batch['input_ids'].to(device)\n            labels    = batch['labels'].to(device)\n            # model output\n            output = bert2bert(input_ids=input_ids,labels = labels)\n            loss   = output.loss\n            running_loss += loss.item()\n            \n        print('Validation Loss {:.4f}'.format(running_loss / len(valid_loader)))\n"

In [24]:
'''
import warnings
warnings.filterwarnings("ignore")

for epoch in range(training_epoch):
    bert2bert.train()
    running_loss = 0.0

    loop = tqdm(train_loader, leave=True)
    for batch_id, batch in enumerate(loop):
        # reset
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels    = batch['labels'].to(device)
        # model output
        output = bert2bert(input_ids=input_ids,labels = labels)
        loss = output.loss
        # calculate loss
        loss.backward()
        # update parameters
        optim.step()
        running_loss += loss.item()
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    print('Training Loss {:.4f}'.format(running_loss / len(train_loader)))
    evaluate(valid_loader)
    
torch.save(bert2bert.state_dict(),"model/" + 'QA_model_generate')
'''



# Evaluate

In [25]:
#test_dataset["preddict"] = "none"
#inputs = tokenizer(test_dataset["question"], test_dataset["context"], padding="max_length", truncation=True, max_length=encoder_max_length)
#outputs = bert2bert.generate(inputs.input_ids, attention_mask=inputs.attention_mask)
#output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
#test_dataset["preddict"] = output_str

In [26]:
outputs = bert2bert.generate(test_dataset["input_ids"].to(device), attention_mask=test_dataset["attention_mask"].to(device))
output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
test_dataset["preddict"] = output_str
test_data['predict'] = test_dataset["preddict"]



RuntimeError: CUDA out of memory. Tried to allocate 600.00 MiB (GPU 0; 7.93 GiB total capacity; 6.24 GiB already allocated; 446.38 MiB free; 6.88 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
!nvidia-smi

In [None]:
import nltk
import re
import collections
import string
nltk.download('punkt')

def lcs(X, Y):
    X_, Y_ = [], []
    
    X_ = nltk_token_string(X)
    Y_ = nltk_token_string(Y)

    m = len(X_)
    n = len(Y_)
 
    # declaring the array for storing the dp values
    L = [[None]*(n + 1) for i in range(m + 1)]
 
    """Following steps build L[m + 1][n + 1] in bottom up fashion
    Note: L[i][j] contains length of LCS of X[0..i-1]
    and Y[0..j-1]"""
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0 :
                L[i][j] = 0
            elif X_[i-1] == Y_[j-1]:
                L[i][j] = L[i-1][j-1]+1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
 
    # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1]
    return L[m][n]


def acc(full, sub):
    common = lcs(full, sub)
    union = len(full) + len(sub) - common
    accuracy = float(common/union)

    return accuracy

def LCS(data):
    acc_sum = 0
    for i in range(data.shape[0]):
        accuracy = acc(data["answer"].iloc[i], data['predict'].iloc[i])
        acc_sum += accuracy
    print("LCS accuracy: ", acc_sum/data.shape[0])
    

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
  
    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)
  
    def white_space_fix(text):
        return " ".join(text.split())
  
    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)
  
    def lower(text):
        return text.lower()
  
    return white_space_fix(remove_articles(remove_punc(lower(s))))
    
def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()
   
def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
  
def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def f1_metric(data):
    f1_score = 0.0
    for answer, pred in zip(data["answer"], data['predict']):
        f1_score+=compute_f1(answer, pred)
    print("F1 score: {}".format(f1_score/len(data)))
    
def EM_score(data):
    total = len(data)
    calculate = 0
    for i in range(len(data)):
        if data['answer'].iloc[i] == data['predict'].iloc[i]:
            calculate+=1
    print("EM score: {}".format(calculate/total))

In [None]:
LCS(test_data)
f1_metric(test_data)
EM_score(test_data)