# libraries

In [1]:
import re
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt
from datasets import ClassLabel, Sequence
import random
from IPython.display import display, HTML
from tqdm import tqdm
from fastbm25 import fastbm25

2023-01-02 22:07:26.300820: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-02 22:07:26.492380: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-02 22:07:27.084675: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib64:
2023-01-02 22:07:27.084746: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: l

In [2]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [3]:
def evaluate_type_BM25(T, k):
    if T:
        K = k
        MODEL_PATH = 'model/QA_modelBM25_k_'+str(K)
    else:
        K = k
        MODEL_PATH = 'model/QA_model_TFIDF_k_'+str(K)
    print(MODEL_PATH)
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    print(device)
    return MODEL_PATH, K, device

MODEL_PATH, K, device = evaluate_type_BM25(True, 5)    

model/QA_modelBM25_k_5
cuda:0


# load data

In [4]:
TRAIN_PATH = 'data/train.txt'
DEV_PATH = 'data/val.txt'
TEST_PATH = 'data/test.txt'
TEST_ANSWER_PATH = 'data/Assignment2_test_answer.txt'

In [5]:
def read_data_from_txt(path):
    QandA = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            #print(line)
            if line != "\n":
                splitted = line.split("|||")
                sentences = splitted[0]
                question  = r" ".join(splitted[1].split())
                answer    = re.sub("\n","",splitted[2])
                answer = r" ".join(answer.split())
                QandA.append((sentences, question, answer))
    return QandA

def read_answer_data(path):
    QandA = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            #print(line)
            if line != "\n":
                splitted = line.split("|||")
                question = r" ".join(splitted[0].split())
                answer  = re.sub("\n","",splitted[-1])
                answer = r" ".join(answer.split())
                QandA.append((question, answer))
    return QandA

def correct_test_answer(data, answer):
    QandA = []
    for origin, correct in zip(data, answer):
        sentence = origin[0]
        question = origin[1]
        answer   = correct[1]
        QandA.append((sentence, question, answer))
    return  QandA   

In [6]:
#train_data  = read_data_from_txt(TRAIN_PATH)
#del train_data[51641] #51641報錯
#valid_data  = read_data_from_txt(DEV_PATH)
# test_data
test_data   = read_data_from_txt(TEST_PATH)
test_answer = read_answer_data(TEST_ANSWER_PATH)
test_data   = correct_test_answer(test_data, test_answer)

27248it [00:00, 61761.81it/s]
27248it [00:00, 232305.95it/s]


In [7]:
# Classify data

In [8]:
def classify(read_data_train):
    Result=[]
    count=0
    for i in range(len(read_data_train)):
        sentences = re.findall(r'<s>(.*?)</s>', read_data_train[i][0])
        #print(sentences[0])
        count=0
        #print(len(read_data_train))
        vocabulary = sentences[0].split(" ")
        #print(len(vocabulary))
        #print(vocabulary[5])
        #print(len(read_data_train))
        #print(read_data_train[2][1])

        for j in range(len(vocabulary)):
            if vocabulary[j] != "":
                #print(vocabulary[j], end=" ")
                if vocabulary[j] == read_data_train[i][2]:
                    #print(vocabulary[j] , "  YYY  ", read_data_train[i][2],end=" ")
                    count += 1
        if count >= 1:      # QA問答文本中，含有答案超過XX次以上的list
            Result.append([read_data_train[i][0], read_data_train[i][1], read_data_train[i][2]])

    print(count)
    print(Result)           # 文本中，含有答案超過XX次以上的list
    
classify(test_data)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



# 用 TF-IDF or BM25 選取和問題最相近的句子，包含答案

In [9]:
def get_top_k_articles(query, docs, k=1):

    # Initialize a vectorizer that removes English stop words
    vectorizer = TfidfVectorizer(analyzer="word", stop_words='english')

    # Create a corpus of query and documents and convert to TFIDF vectors
    query_and_docs = [query] + docs
    matrix = vectorizer.fit_transform(query_and_docs)

    # Holds our cosine similarity scores
    scores = []

    # The first vector is our query text, so compute the similarity of our query against all document vectors
    for i in range(1, len(query_and_docs)):
        scores.append(cosine_similarity(matrix[0], matrix[i])[0][0])

    # Sort list of scores and return the top k highest scoring documents
    sorted_list = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    top_doc_indices = [x[0] for x in sorted_list[:k]]
    top_docs = [docs[x] for x in top_doc_indices]
  
    return top_docs

def get_topk_tfidf_sentence(data):
    contexts = []
    for sentences, question, answer in tqdm(data):
        contained_answer_sentence = []
        sentences = re.findall(r'<s>(.*?)</s>', sentences)
        for sentence in sentences:
            if answer in sentence:
                contained_answer_sentence.append(sentence)
        doc = get_top_k_articles(question, contained_answer_sentence)
        context = ". ".join(doc)
        contexts.append(context)
    return contexts

In [10]:
def get_topk_bm25_sentence(data, k):
    contexts = []
    for sentences, question, answer in tqdm(data):
        contained_answer_sentence = []
        sentences = re.findall(r'<s>(.*?)</s>', sentences)
        for sentence in sentences:
            if answer in sentence:
                contained_answer_sentence.append(sentence)
        tokenized_corpus = [doc.lower().split(" ") for doc in contained_answer_sentence]
        FASTBM25 = fastbm25(tokenized_corpus)
        tokenized_answer = question.lower().split(" ")
        doc = FASTBM25.top_k_sentence(tokenized_answer, k=k)
        final_doc = []
        for list_doc, _,score in doc:
            final_doc.append(" ".join(list_doc))
        context = ". ".join(final_doc)
        contexts.append(context)
    return contexts

In [11]:
if MODEL_PATH[:-1] == "model/QA_modelBM25_k_":
    print('Method Type: BM25')
    #train_contexts = get_topk_bm25_sentence(train_data, K)
    #valid_contexts = get_topk_bm25_sentence(valid_data, K)
    test_contexts  = get_topk_bm25_sentence(test_data,  K)
elif MODEL_PATH[:-1] == "model/QA_model_TFIDF_k_":
    print('Method Type: TF-IDF')
    #train_contexts = get_topk_tfidf_sentence(train_data)
    #val_contexts =   get_topk_tfidf_sentence(valid_data)
    test_contexts  = get_topk_tfidf_sentence(test_data)

Method Type: BM25


100%|████████████████████████████████████| 27248/27248 [00:32<00:00, 846.26it/s]


# 轉換成Dataframe

In [12]:
def convert_to_dataframe(data, contexts):
    ques = []
    ans= []
    for sentences, question, answer in data:
        ques.append(question)
        ans.append(answer)
    return pd.DataFrame({'sentences':contexts,'question':ques,'answer':ans})

def clean_answer_not_in_context(data):
    index_ids=[]
    for i in range(len(data)):
        if data['answer'].iloc[i] in data['sentences'].iloc[i]:
            continue
        else:
            index_ids.append(i)
    data = data.drop(index_ids, axis=0)
    print(len(data))
    return data

#train_data = convert_to_dataframe(train_data, train_contexts)
#valid_data = convert_to_dataframe(valid_data, valid_contexts)
test_data =  convert_to_dataframe(test_data, test_contexts)

#train_data = clean_answer_not_in_context(train_data)
#valid_data = clean_answer_not_in_context(valid_data)
test_data  = clean_answer_not_in_context(test_data)

27217


# tokenized data

In [13]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def get_tokennize_list(data):
    data_question = data['question'].tolist()
    data_context = data['sentences'].tolist()
    return data_question, data_context

#train_question, train_context = get_tokennize_list(train_data)
#valid_question, valid_context = get_tokennize_list(valid_data)
test_question,  test_context  = get_tokennize_list(test_data)

#train_encodings = tokenizer(train_question, train_context, truncation=True, padding=True)
#val_encodings   = tokenizer(valid_question, valid_context, truncation=True, padding=True)
test_encodings  = tokenizer(test_question, test_context, truncation=True, padding=True)

# 找答案的start end index

In [14]:
def get_Start_End_index(data):
    data['start'] = [y.index(x) for x,y in zip(data["answer"],data["sentences"])]
    data['end']   = [x+len(str(y)) for x,y in zip(data["start"],data["answer"])]
#get_Start_End_index(train_data)
#get_Start_End_index(valid_data)
get_Start_End_index(test_data)

# add token positions

In [15]:
#train_answer = train_data[['start', 'end']].to_dict('records')
#valid_answer = valid_data[['start', 'end']].to_dict('records')
test_answer  = test_data[['start', 'end']].to_dict('records')

def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['start'],1))
        end_positions.append(encodings.char_to_token(i, answers[i]['end']-1,1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
            
        shift = 1
        while end_positions[-1] is None:
            if answers[i]['end'] - shift>=0:
                end_positions[-1] = encodings.char_to_token(i, answers[i]['end'] - shift, 1)
                shift += 1 
            else:
                break
            
    encodings.update({'start': start_positions, 'end': end_positions})

#add_token_positions(train_encodings, train_answer)
#add_token_positions(val_encodings, valid_answer)
add_token_positions(test_encodings, test_answer)

test_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start', 'end'])

# convert to dataset

In [16]:
class QADataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        try:
            return {key: torch.tensor(value[idx]) for key, value in self.encodings.items()}
        except:
            print(idx)
            
    def __len__(self):
        return len(self.encodings.input_ids)
    
#train_dataset = QADataset(train_encodings)
#val_dataset   = QADataset(val_encodings)
test_dataset  = QADataset(test_encodings)    

In [17]:
# Pack data into dataloader by batch
batch_size   = 4
#train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#valid_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model

In [18]:
from transformers import BertModel

class QAModel(torch.nn.Module):

    def __init__(self):

        super(QAModel, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.fc = torch.nn.Linear(768, 2)
        

    def forward(self, input_ids, attention_mask, token_type_ids):

        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=True)
        logits = output[0]
        out = self.fc(logits)

        return out

In [19]:
# Put model on device
model = QAModel().to(device)
model.load_state_dict(torch.load(MODEL_PATH))
model.eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


QAModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

# evaluate

In [20]:
def predict(test_loader):
    
    model.eval()
    
    predict_pos = []
    sub_output = []

    loop = tqdm(test_loader, leave=True)
    for batch_id, batch in enumerate(loop):
        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        # model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        start_logits, end_logits = torch.split(outputs, 1, 2)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits   = end_logits.squeeze(-1).contiguous()

        start_prdict = torch.argmax(start_logits, 1).cpu().numpy()
        end_prdict   = torch.argmax(end_logits, 1).cpu().numpy()

        for i in range(len(input_ids)):
            predict_pos.append((start_prdict[i].item(), end_prdict[i].item()))
            sub = tokenizer.decode(input_ids[i][start_prdict[i]:end_prdict[i]+1])
            sub_output.append(sub)
    
    return sub_output, predict_pos

def nltk_token_string(sentence):
    # print(sentence)
    tokens = nltk.word_tokenize(sentence)
    for i in range(len(tokens)):
        if len(tokens[i]) == 1:
            tokens[i] = re.sub(r"[!\"#$%&\'()*\+, -.\/:;<=>?@\[\\\]^_`{|}~]", '', tokens[i])
    while '' in tokens:
        tokens.remove('')
    tokens = ' '.join(tokens)
    return tokens

def get_output_post_fn(test, sub_output):
    sub = []
    for i in range(len(test)):

        sub_pred = sub_output[i].split()

        temp = sub_pred.copy()
        if sub_pred is None:
            sub_pred = []
        else:
            for j in range(len(temp)):
                if temp[j] == '[SEP]':
                    sub_pred.remove('[SEP]')
                if temp[j] == '[PAD]':
                    sub_pred.remove('[PAD]')

        sub.append(' '.join(sub_pred))
        
    return sub

In [21]:
sub_output, predict_pos = predict(test_loader)
print(sub_output[0])
sub = get_output_post_fn(test_data, sub_output)
test_data['predict'] = sub
test_data.head(20)

100%|███████████████████████████████████████| 6805/6805 [11:33<00:00,  9.81it/s]

england





Unnamed: 0,sentences,question,answer,start,end,predict
0,"difference united kingdom , great britain , e...",'s largest kingdom united kingdom,england,45,52,england
1,"back rockville review miley cyrus oct 18 , 20...",party u singer also plays young lady named hannah,miley cyrus,23,34,miley cyrus
2,"peach tree fruit britannica com mar 4 , 2015 ...","part peach downy fuzzy , fruit 's called peach...",skin,157,161,peaches
3,cincinnati 5fl14 3 4 x 12'hydraulic shear 168...,4 x 12,48,176,178,cincinnati
4,"grammarphobia blog dribbling , court bib apr ...",verb bouncing basketball sounds like 're slobb...,dribbling,20,29,dribbling
5,chfpatients com heart failure faq ejection fr...,blood pumper,heart,17,22,arteries
6,"download new catalog ati courses space , sate...",sound navigation ranging full name device boun...,sonar,253,258,sonar
7,dart definition dart free dictionary slender ...,"small , slender missile thrown board game",dart,1,5,dart
8,gemstone jewelry glossary terms jewelry telev...,5 letter word hard interior peach,stone,4,9,
9,old fashioned ball games free software sharew...,"kid 's game , bounce small rubber ball picking...",jacks,119,124,jacks


In [25]:
test_data['predict'] = sub
test_data.pop('end')
test_data.head(20)

KeyError: 'start'

In [22]:
import nltk
import re
import collections
import string
nltk.download('punkt')

def lcs(X, Y):
    X_, Y_ = [], []
    
    X_ = nltk_token_string(X)
    Y_ = nltk_token_string(Y)

    m = len(X_)
    n = len(Y_)
 
    # declaring the array for storing the dp values
    L = [[None]*(n + 1) for i in range(m + 1)]
 
    """Following steps build L[m + 1][n + 1] in bottom up fashion
    Note: L[i][j] contains length of LCS of X[0..i-1]
    and Y[0..j-1]"""
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0 :
                L[i][j] = 0
            elif X_[i-1] == Y_[j-1]:
                L[i][j] = L[i-1][j-1]+1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
 
    # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1]
    return L[m][n]


def acc(full, sub):
    common = lcs(full, sub)
    union = len(full) + len(sub) - common
    accuracy = float(common/union)

    return accuracy

def LCS(data):
    acc_sum = 0
    for i in range(data.shape[0]):
        accuracy = acc(data.iloc[i]["answer"], data.iloc[i]['predict'])
        acc_sum += accuracy
    print("LCS accuracy: ", acc_sum/data.shape[0])
    

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
  
    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)
  
    def white_space_fix(text):
        return " ".join(text.split())
  
    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)
  
    def lower(text):
        return text.lower()
  
    return white_space_fix(remove_articles(remove_punc(lower(s))))
    
def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()
   
def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
  
def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def f1_metric(data):
    f1_score = 0.0
    for answer, pred in zip(data["answer"], data['predict']):
        f1_score+=compute_f1(answer, pred)
    print("F1 score: {}".format(f1_score/len(data)))
    
def EM_score(data):
    total = len(data)
    calculate = 0
    for i in range(len(data)):
        if data['answer'].iloc[i] == data['predict'].iloc[i]:
            calculate+=1
    print("EM score: {}".format(calculate/total))

[nltk_data] Downloading package punkt to /home/joeyliang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
LCS(test_data)
f1_metric(test_data)
EM_score(test_data)

LCS accuracy:  0.8048942179111295
F1 score: 0.7796173606178918
EM score: 0.6975052356982768
