# libraries

In [1]:
import re
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt
from datasets import ClassLabel, Sequence
import random
from IPython.display import display, HTML
from tqdm import tqdm

In [2]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# load data

In [3]:
TRAIN_PATH = 'data/train.txt'
DEV_PATH = 'data/val.txt'
TEST_PATH = 'data/test.txt'

In [4]:
def read_data_from_txt(path):
    QandA = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            #print(line)
            if line != "\n":
                splitted = line.split("|||")
                sentences = splitted[0]
                question  = splitted[1]
                answer    = re.sub("\n","",splitted[2])
                answer = r" ".join(answer.split())
                QandA.append((sentences, question, answer))
    return QandA

In [5]:
train_data = read_data_from_txt(TRAIN_PATH)
del train_data[51641] #51641報錯
valid_data = read_data_from_txt(DEV_PATH)
test_data  = read_data_from_txt(TEST_PATH)

99820it [00:01, 54500.25it/s]
13893it [00:00, 50494.85it/s]
27248it [00:00, 57604.32it/s]


# check answer 是否在文章內

In [6]:
def check_answer(data):
    index = []
    for sentences, question, answer in tqdm(data):
        ans = "FALSE"
        sentences = re.findall(r'<s>(.*?)</s>', sentences)
        for sentence in sentences:
            try:
                if answer in sentence:
                    ans = "TRUE" 
            except:
                continue
        if ans == "FALSE":
                index.append(i)
    if len(index) == 0:
        print("Every answer is in context.")
check_answer(train_data)
check_answer(valid_data)

100%|███████████████████████████████████| 99819/99819 [00:17<00:00, 5568.59it/s]


Every answer is in context.


100%|███████████████████████████████████| 13893/13893 [00:02<00:00, 5866.79it/s]

Every answer is in context.





# 用tf-idf 選取和問題最相近的句子，包含答案

In [7]:
def get_top_k_articles(query, docs, k=2):

    # Initialize a vectorizer that removes English stop words
    vectorizer = TfidfVectorizer(analyzer="word", stop_words='english')

    # Create a corpus of query and documents and convert to TFIDF vectors
    query_and_docs = [query] + docs
    matrix = vectorizer.fit_transform(query_and_docs)

    # Holds our cosine similarity scores
    scores = []

    # The first vector is our query text, so compute the similarity of our query against all document vectors
    for i in range(1, len(query_and_docs)):
        scores.append(cosine_similarity(matrix[0], matrix[i])[0][0])

    # Sort list of scores and return the top k highest scoring documents
    sorted_list = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    top_doc_indices = [x[0] for x in sorted_list[:k]]
    top_docs = [docs[x] for x in top_doc_indices]
  
    return top_docs

In [8]:
# test example
for sentences, question, answer in train_data[0:1]:
    contained_answer_sentence = []
    sentences = re.findall(r'<s>(.*?)</s>', sentences)
    for sentence in sentences:
        if answer in sentence:
            contained_answer_sentence.append(sentence)
    doc = get_top_k_articles(question, contained_answer_sentence)
    for i in doc:
        print(bcolors.HEADER+i+bcolors.ENDC)
    print(bcolors.BOLD+question+bcolors.BOLD)
    print(bcolors.OKBLUE+answer+bcolors.UNDERLINE)
    print()

[95m lebowski sweater replica jun 17 , 2013 history last 8 years life , galileo house arrest espousing man 's theory copernicus espn 's top 10 time [0m
[95m images knickerless women jun 17 , 2013 history last 8 years life , galileo house arrest espousing man 's theory copernicus espn 's top 10 time [0m
[1m last 8 years life , galileo house arrest espousing man 's theory [1m
[94mcopernicus[4m



In [9]:
def get_topk_tfidf_sentence(data):
    contexts = []
    for sentences, question, answer in tqdm(data):
        contained_answer_sentence = []
        sentences = re.findall(r'<s>(.*?)</s>', sentences)
        for sentence in sentences:
            if answer in sentence:
                contained_answer_sentence.append(sentence)
        doc = get_top_k_articles(question, contained_answer_sentence)
        context = ". ".join(doc)
        contexts.append(context)
    return contexts
train_contexts = get_topk_tfidf_sentence(train_data)
val_contexts =   get_topk_tfidf_sentence(valid_data)
train_contexts[0]

100%|█████████████████████████████████████| 99819/99819 [18:43<00:00, 88.84it/s]
100%|████████████████████████████████████| 13893/13893 [02:11<00:00, 105.50it/s]


" lebowski sweater replica jun 17 , 2013 history last 8 years life , galileo house arrest espousing man 's theory copernicus espn 's top 10 time .  images knickerless women jun 17 , 2013 history last 8 years life , galileo house arrest espousing man 's theory copernicus espn 's top 10 time "

In [10]:
def get_topk_tfidf_sentence_test(data):
    contexts = []
    for sentences, question, answer in tqdm(data):
        contained_answer_sentence = []
        sentences = re.findall(r'<s>(.*?)</s>', sentences)
        for sentence in sentences:
            contained_answer_sentence.append(sentence)
        doc = get_top_k_articles(question, contained_answer_sentence)
        context = ". ".join(doc)
        contexts.append(context)
    return contexts
test_contexts = get_topk_tfidf_sentence_test(test_data)

100%|█████████████████████████████████████| 27248/27248 [11:18<00:00, 40.13it/s]


In [11]:
print(len(train_contexts))
print(len(train_data))
print(len(val_contexts))
print(len(valid_data))
print(len(test_contexts))
print(len(test_data))

99819
99819
13893
13893
27248
27248


# 轉換成Dataframe

In [12]:
def convert_to_dataframe(data, contexts):
    ques = []
    ans= []
    for sentences, question, answer in data:
        ques.append(question)
        ans.append(answer)
    return pd.DataFrame({'sentences':contexts,'question':ques,'answer':ans})
train_data = convert_to_dataframe(train_data, train_contexts)
valid_data = convert_to_dataframe(valid_data, val_contexts)
test_data =  convert_to_dataframe(test_data, test_contexts)

In [13]:
def show_random_elements(dataset, num_examples=1):
    df = dataset.sample(n = num_examples)
    display(HTML(df.to_html()))
    
show_random_elements(train_data)

Unnamed: 0,sentences,question,answer
43366,"nanjing built tomb emperor hongwu , first emperor ming dynasty , 1383 nanjing museum famous biggest collection china . ming tombs wikipedia ming tombs collection mausoleums built emperors ming dynasty china first ming emperor 's tomb located near capital nanjing xiaoling tomb first ming emperor , hongwu emperor , located near capital nanjing second emperor , jianwen emperor ,","tomb hongwu , first emperor famous dynasty , nanjing , china",ming


# tokenized data

In [14]:
from transformers import AutoTokenizer
#tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L6-H384-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

2022-11-14 17:15:21.717978: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-14 17:15:21.913237: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-14 17:15:22.451791: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib64:
2022-11-14 17:15:22.451889: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: l

In [15]:
def get_tokennize_list(data):
    data_question = data['question'].tolist()
    data_context = data['sentences'].tolist()
    return data_question, data_context

In [16]:
train_question, train_context = get_tokennize_list(train_data)
valid_question, valid_context = get_tokennize_list(valid_data)
test_question,  test_context  = get_tokennize_list(test_data)

In [17]:
train_encodings = tokenizer(train_question, train_context, truncation=True, padding=True)
val_encodings   = tokenizer(valid_question, valid_context, truncation=True, padding=True)
test_encodings  = tokenizer(test_question, test_context, truncation=True, padding=True)

In [18]:
tokenizer.decode(train_encodings['input_ids'][0])

"[CLS] last 8 years life, galileo house arrest espousing man's theory [SEP] lebowski sweater replica jun 17, 2013 history last 8 years life, galileo house arrest espousing man's theory copernicus espn's top 10 time. images knickerless women jun 17, 2013 history last 8 years life, galileo house arrest espousing man's theory copernicus espn's top 10 time [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

# 找答案的start end index

In [19]:
def get_Start_End_index(data):
    data['start'] = [y.index(x) for x,y in zip(data["answer"],data["sentences"])]
    data['end']   = [x+len(str(y)) for x,y in zip(data["start"],data["answer"])]
get_Start_End_index(train_data)
get_Start_End_index(valid_data)

In [20]:
def set_test_index(data):
    data['start'] = 0
    data['end']   = 0
set_test_index(test_data)

In [21]:
train_data.head()

Unnamed: 0,sentences,question,answer,start,end
0,"lebowski sweater replica jun 17 , 2013 histor...","last 8 years life , galileo house arrest espo...",copernicus,113,123
1,"jim thorpe bio , stats , results olympics spo...",2 1912 olympian football star carlisle indian...,jim thorpe,1,11
2,"yuma arizona oddities part 3 jul 21 , 2011 yu...","city yuma state record average 4 , 055 hours ...",arizona,6,13
3,gravestone famous ancestor roger sherman sign...,"signer dec indep , framer constitution mass ,...",john adams,176,186
4,"2014 new york international fringe festival ,...","title aesop fable , insect shared billing gra...",ant,257,260


In [22]:
train_data['sentences'].iloc[0][113:123]

'copernicus'

# add token positions

In [23]:
train_answer = train_data[['start', 'end']].to_dict('records')
valid_answer = valid_data[['start', 'end']].to_dict('records')
test_answer  = test_data[['start', 'end']].to_dict('records')

In [24]:
i = 0
print(train_data['answer'].iloc[i])
a = train_encodings.char_to_token(i, train_answer[i]['start'], 1)
print(a)
b = train_encodings.char_to_token(i, train_answer[i]['end']-1, 1)
print(b)
print(tokenizer.decode(train_encodings['input_ids'][i][47:b+1]))

copernicus
47
49
copernicus


In [25]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['start'],1))
        end_positions.append(encodings.char_to_token(i, answers[i]['end']-1,1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
            
        shift = 1
        while end_positions[-1] is None:
            if answers[i]['end'] - shift>=0:
                end_positions[-1] = encodings.char_to_token(i, answers[i]['end'] - shift, 1)
                shift += 1 
            else:
                break
            
    encodings.update({'start': start_positions, 'end': end_positions})

add_token_positions(train_encodings, train_answer)
add_token_positions(val_encodings, valid_answer)

In [26]:
# Convert char_based_id to token_based_id
# Find the corossponding token id after input being tokenized
add_token_positions(train_encodings, train_answer)
add_token_positions(val_encodings, valid_answer)
#add_token_positions(test_encodings, test_answer, test_data)

In [27]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start', 'end'])

In [28]:
print(train_encodings['start'][0])
print(train_encodings['end'][0])
print(val_encodings['start'][0])
print(val_encodings['end'][0])

47
49
36
38


In [29]:
i=23
print(train_data['sentences'].iloc[i][train_answer[i]['start']:train_answer[i]['end']])
print(tokenizer.decode(train_encodings['input_ids'][i][train_encodings['start'][i]:train_encodings['end'][i]+1]))

oratory
oratory


# convert to dataset

In [30]:
class QADataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        try:
            return {key: torch.tensor(value[idx]) for key, value in self.encodings.items()}
        except:
            print(idx)
            
    def __len__(self):
        return len(self.encodings.input_ids)

In [31]:
train_dataset = QADataset(train_encodings)
val_dataset   = QADataset(val_encodings)
test_dataset  = QADataset(test_encodings)

In [32]:
next(iter(val_dataset))

{'input_ids': tensor([  101, 22620,  1394,  3003,   117,  1724,  4249,  2247,   117,  1642,
          6048, 12200, 22217,  1116,   102,  1714,  6746, 18464,  5752,  2025,
          8419,  2158, 22620,  1394,  3003,   117,  1724,  4249,  2247,   117,
          1642,  6048, 12200, 22217,  1116,   117, 23123,  1158,  2787,  6394,
           119,  1185,  2707, 10615,  1489,   117,  1369,  1270,  7761,  1821,
         26237,  1389,  5752, 22620,  1394,  3003,   117,  1724,  4249,  2247,
           117,  1642,  6048, 12200, 22217,  1116, 23123,  1158,  2787,  6394,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

# Model

In [33]:
from transformers import BertModel

class QAModel(torch.nn.Module):

    def __init__(self):

        super(QAModel, self).__init__()

        self.bert = BertModel.from_pretrained("nreimers/MiniLM-L6-H384-uncased")
        self.fc = torch.nn.Linear(384, 2)
        

    def forward(self, input_ids, attention_mask, token_type_ids):

        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=True)
        logits = output[0]
        out = self.fc(logits)

        return out

In [36]:
from transformers import AdamW
from tqdm import tqdm

# Set GPU / CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Put model on device
model = QAModel().to(device)

optim = torch.optim.AdamW(model.parameters(), lr=1e-4)

# training

In [37]:
# Pack data into dataloader by batch
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [38]:
print(len(train_loader))
print(len(valid_loader))
print(len(test_loader))

6239
869
1703


In [39]:
training_epoch = 10
loss_fct = CrossEntropyLoss()

In [40]:
def evaluate(valid_loader):
    model.eval()
    running_loss = 0.0

    with torch.no_grad():
        loop = tqdm(valid_loader, leave=True)
        for batch_id, batch in enumerate(loop):
            input_ids      = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start          = batch['start'].to(device)
            end            = batch['end'].to(device)

            # model output
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            start_logits, end_logits = torch.split(outputs, 1, 2)

            start_logits = start_logits.squeeze(-1).contiguous()
            end_logits   = end_logits.squeeze(-1).contiguous()

            start_loss = loss_fct(start_logits, start)
            end_loss   = loss_fct(end_logits, end)

            loss = start_loss + end_loss
            running_loss += loss.item()
            
        print('Validation Loss {:.4f}'.format(running_loss / len(valid_loader)))

In [None]:
for epoch in range(training_epoch):
    model.train()
    running_loss = 0.0

    loop = tqdm(train_loader, leave=True)
    for batch_id, batch in enumerate(loop):
        # reset
        optim.zero_grad()

        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        start          = batch['start'].to(device)
        end            = batch['end'].to(device)

        # model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        start_logits, end_logits = torch.split(outputs, 1, 2)

        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits   = end_logits.squeeze(-1).contiguous()

        start_loss = loss_fct(start_logits, start)
        end_loss = loss_fct(end_logits, end)
        loss = start_loss + end_loss
        # calculate loss
        loss.backward()
        # update parameters
        optim.step()
        running_loss += loss.item()
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    print('Training Loss {:.4f}'.format(running_loss / len(train_loader)))
    evaluate(valid_loader)
    
torch.save(model.state_dict(),"model/" + 'QA_model_v1')

Epoch 0:  95%|██████████████████ | 5934/6239 [13:23<00:42,  7.14it/s, loss=1.49]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 1:  91%|████████████████▍ | 5691/6239 [13:14<01:16,  7.12it/s, loss=0.972]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 2:  72%|█████████████▌     | 4463/6239 [10:14<04:10,  7.09it/s, loss=2.25]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashin

# evalution

In [46]:
def predict(test_loader):
    
    model.eval()
    
    predict_pos = []
    sub_output = []

    loop = tqdm(test_loader, leave=True)
    for batch_id, batch in enumerate(loop):
        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        # model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        start_logits, end_logits = torch.split(outputs, 1, 2)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits   = end_logits.squeeze(-1).contiguous()

        start_prdict = torch.argmax(start_logits, 1).cpu().numpy()
        end_prdict   = torch.argmax(end_logits, 1).cpu().numpy()

        for i in range(len(input_ids)):
            predict_pos.append((start_prdict[i].item(), end_prdict[i].item()))
            sub = tokenizer.decode(input_ids[i][start_prdict[i]:end_prdict[i]+1])
            sub_output.append(sub)
    
    return sub_output, predict_pos

In [48]:
sub_output, predict_pos = predict(valid_loader)

100%|█████████████████████████████████████████| 869/869 [00:33<00:00, 25.56it/s]


In [58]:
sub_output[0]

'hemingway'

In [54]:
def nltk_token_string(sentence):
    # print(sentence)
    tokens = nltk.word_tokenize(sentence)
    for i in range(len(tokens)):
        if len(tokens[i]) == 1:
            tokens[i] = re.sub(r"[!\"#$%&\'()*\+, -.\/:;<=>?@\[\\\]^_`{|}~]", '', tokens[i])
    while '' in tokens:
        tokens.remove('')
    tokens = ' '.join(tokens)
    return tokens

In [55]:
def get_output_post_fn(test, sub_output):
    sub = []
    for i in range(len(test)):

        sub_pred = sub_output[i].split()

        temp = sub_pred.copy()
        if sub_pred is None:
            sub_pred = []
        else:
            for j in range(len(temp)):
                if temp[j] == '[SEP]':
                    sub_pred.remove('[SEP]')
                if temp[j] == '[PAD]':
                    sub_pred.remove('[PAD]')

        sub.append(' '.join(sub_pred))
        
    return sub

In [63]:
sub = get_output_post_fn(valid_data, sub_output)
valid_data['predict'] = sub
valid_data.pop('sub')
valid_data.head(20)

Unnamed: 0,sentences,question,answer,start,end,predict
0,free flashcards authors studystack spain 1959...,"spain 1959 , wrote dangerous summer , story r...",hemingway,101,110,hemingway
1,"california facts , map state symbols enchante...",valley 282 feet sea level state lowest point ...,california,1,11,california
2,price convenience? atm surcharge debate jul 1...,"like banks , many grocery stores dispensing c...",atms,127,131,atms
3,steamboat willy classic cartoons pinterest st...,voice mickey mouse steamboat willie,walt disney,135,146,walt disney
4,eastern europe see section 2 2 6 nation state...,eastern european capital city 2 2 million,bucharest,231,240,bucharest
5,us state longest shoreline?? guide humans ala...,"6 , 640 miles coast , state longest shoreline",alaska,43,49,alaska
6,north dakota pumps 1 million barrels oil day ...,"pumps one million barrels oil day , state",texas,167,172,north dakota
7,"day day npr hear day day program march 20 , 2...",day day things considered among programs goin...,npr,9,12,npr
8,daffy known voice mel blanc 1937 1989 daffy d...,voice daffy duck first 50 years,mel blanc,19,28,mel blanc
9,region 4 russ nelson 's home page glossary ba...,braced framework carrying railroad chasm,trestle,93,100,russ nelson


In [59]:
def lcs(X, Y):
    X_, Y_ = [], []
    
    X_ = nltk_token_string(X)
    Y_ = nltk_token_string(Y)

    m = len(X_)
    n = len(Y_)
 
    # declaring the array for storing the dp values
    L = [[None]*(n + 1) for i in range(m + 1)]
 
    """Following steps build L[m + 1][n + 1] in bottom up fashion
    Note: L[i][j] contains length of LCS of X[0..i-1]
    and Y[0..j-1]"""
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0 :
                L[i][j] = 0
            elif X_[i-1] == Y_[j-1]:
                L[i][j] = L[i-1][j-1]+1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
 
    # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1]
    return L[m][n]


def acc(full, sub):
    common = lcs(full, sub)
    union = len(full) + len(sub) - common
    accuracy = float(common/union)

    return accuracy

In [60]:
import nltk
nltk.download('punkt')
acc_sum = 0
for i in range(valid_data.shape[0]):
    accuracy = acc(valid_data.iloc[i]["answer"], valid_data.iloc[i]['sub'])
    acc_sum += accuracy

print("accuracy: ", acc_sum/valid_data.shape[0])

[nltk_data] Downloading package punkt to /home/joeyliang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


accuracy:  0.7966314514917239


# output for submition

In [64]:
test_sub_output, test_predict_pos = predict(test_loader)

100%|███████████████████████████████████████| 1703/1703 [01:16<00:00, 22.35it/s]


In [70]:
test_sub = get_output_post_fn(test_data, test_sub_output)
test_data['predict'] = test_sub
test_data.tail(20)

Unnamed: 0,sentences,question,answer,start,end,predict
27228,"imperilment ! feb 25 , 2015 1000 guinness say...","guinness says number users language , devised...",answer,0,0,james
27229,shortz ktep relieve duties letter make believ...,letter added amiable gets word means thing,answer,0,0,believe
27230,visual codes secrecy photography death resear...,tongan word something limits also national ge...,answer,0,0,unusual
27231,coincaesarioncoin windows 2016813 ptolemy xv ...,egyptian ruler ptolemy xv aka caesarion,answer,0,0,
27232,"kosher search id com pack , 's certified l'ch...",wo n't find orthodox union trademark lobsters...,answer,0,0,kosher
27233,midnight cowboy 1969 best picture fikkle fame...,1969 buddy pic first x rated film win best pi...,answer,0,0,horses
27234,"bees , wasps , hornets today found apr 14 , 2...",big difference bees close relatives bees feed...,answer,0,0,bumblebee
27235,song 44 sappho revisited 'oral' text song aug...,trojan hero hector share syllable,answer,0,0,greek
27236,"day 3 panmunjom , say americans weecheng com ...",army bases area known 3 letter term panmunjom...,answer,0,0,north korean
27237,kramer vs kramer 1979 best picture fikkle fam...,gore vidal removed name salacious 1979 film m...,answer,0,0,original screenplay


In [71]:
len(test_sub)

27248

In [72]:
test_submit_data  = read_data_from_txt(TEST_PATH)

27248it [00:00, 65252.59it/s]


In [75]:
test_submit_data[0][1]

" 's largest kingdom united kingdom "

In [76]:
text = []
question = []
for i,j,k in test_submit_data:
    text.append(i)
    question.append(j)
print(len(text))
print(len(question))

27248
27248


In [77]:
path = 'data/final-submit.txt'
f = open(path, 'w')
for question, answer in zip(question, test_sub):
    f.write(question+"|||"+answer+"\n")
f.close()