# Ensembling several question-answering models for prediction

In [1]:
from transformers import AutoModelForQuestionAnswering,AutoTokenizer
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor
import torch
import torch.nn.functional as F
import time
from tqdm import tqdm
import numpy as np

In [2]:
# Directory that holds the SQuAD v2.0 files (dev-v2.0.json and train-v2.0.json).
squad_data_dir='./SQUAD_V2'
processor=SquadV2Processor()
eval_examples=processor.get_dev_examples(squad_data_dir,filename='dev-v2.0.json')
train_examples=processor.get_train_examples(squad_data_dir,filename='train-v2.0.json')
# Preview the first three training examples: id + question, answer, then context.
for example in train_examples[:3]:
    print(example.qas_id,example.question_text)
    print(example.answer_text)
    print(example.context_text)

100%|██████████| 35/35 [00:13<00:00,  2.58it/s]
100%|██████████| 442/442 [03:02<00:00,  2.43it/s]


56be85543aeaaa14008c9063 When did Beyonce start becoming popular?
in the late 1990s
Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
56be85543aeaaa14008c9065 What areas did Beyonce compete in when she was growing up?
singing and dancing
Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter,

In [3]:
# Checkpoint directories of the question-answering models that make up the ensemble.
model_dirs=[
    './mrm8488/bert-medium-finetuned-squadv2',
    './mrm8488/bert-small-finetuned-squadv2',
    './mrm8488/bert-mini-5-finetuned-squadv2',
]
# NOTE(review): not referenced by any visible cell, and it is an absolute,
# machine-specific path -- confirm whether it is still needed.
pretrained_model_path1='/home/stu_18701958249/hub_dir/2020-ai-for-nlp-summer/All_tasks_solved_by_transformers/Extractive_Question_Answering/pretrained_model'
# One model and one tokenizer per checkpoint, in the same order as model_dirs.
models=[
    AutoModelForQuestionAnswering.from_pretrained(checkpoint_dir,return_dict=True)
    for checkpoint_dir in model_dirs
]
tokenizers=[
    AutoTokenizer.from_pretrained(checkpoint_dir)
    for checkpoint_dir in model_dirs
]

In [4]:
def get_ensembled_result(many_start_probs,many_end_probs,mode='equal'):
    """Combine per-model start/end probabilities into one answer span.

    Args:
        many_start_probs: sequence with one 1-D array per model holding the
            start-position probabilities over the token positions.
        many_end_probs: sequence with one 1-D array per model holding the
            end-position probabilities; must have as many entries as
            ``many_start_probs``.
        mode: weighting scheme. ``'equal'`` gives every model the weight
            ``1/len(models)``; ``'ranks'`` weights the k-th model (1-based)
            proportionally to ``k``, so later models count more.

    Returns:
        ``(model_index, answer_start, answer_end)`` where ``model_index`` is
        the model whose own best valid span has the highest weighted joint
        probability, and ``answer_start``/``answer_end`` (inclusive) are the
        best valid span under the weighted-average probabilities.

    Raises:
        ValueError: if no probabilities are given, if the two sequences have
            different lengths, or if ``mode`` is not a known scheme.
    """
    model_nums=len(many_start_probs)
    if model_nums==0:
        raise ValueError('at least one model is required for ensembling')
    if len(many_end_probs)!=model_nums:
        raise ValueError('many_start_probs and many_end_probs must have the same length')
    if mode=='equal':
        weights=[1.0/model_nums]*model_nums
    elif mode=='ranks':
        # Weights 1..n normalised by their sum n*(n+1)/2.
        weight_total=model_nums*(model_nums+1)*0.5
        weights=[rank/weight_total for rank in range(1,model_nums+1)]
    else:
        raise ValueError("mode must be 'equal' or 'ranks', got {!r}".format(mode))

    avg_start_probs,avg_end_probs=0,0
    # Start below any attainable score so model_index is always bound.
    model_index,best_single_prob=0,-np.inf
    for i in range(model_nums):
        start_probs=np.asarray(many_start_probs[i])
        end_probs=np.asarray(many_end_probs[i])
        avg_start_probs=avg_start_probs+start_probs*weights[i]
        avg_end_probs=avg_end_probs+end_probs*weights[i]

        # Joint probability of every (start, end) pair for this model; the
        # upper triangle keeps only valid spans with start <= end.
        single_best=np.max(np.triu(np.outer(start_probs,end_probs)))*weights[i]**2
        if single_best>best_single_prob:
            model_index,best_single_prob=i,single_best

    # Best valid span under the weighted-average start/end probabilities.
    span_scores=np.triu(np.outer(avg_start_probs,avg_end_probs))
    answer_start,answer_end=np.unravel_index(np.argmax(span_scores),span_scores.shape)
    return model_index,int(answer_start),int(answer_end)

In [5]:
# Quick sanity check: tokenize a toy sentence with the first tokenizer to inspect its output.
tokenizers[0].tokenize('I love you')

['i', 'love', 'you']

In [6]:
def get_ensembled_answer(models,tokenizers,question,context,max_sequence_len=384,verbose=False,device='cpu',mode='equal'):
    """Answer ``question`` from ``context`` with an ensemble of QA models.

    Each model is run on the input encoded by the tokenizer at the same index;
    the per-model start/end probabilities are merged by
    ``get_ensembled_result`` and the selected span is decoded to text.

    NOTE: start/end positions are averaged across models and decoded with a
    single tokenizer, so all tokenizers are expected to split the input into
    the same token positions.

    Args:
        models: question-answering models, already placed on ``device``.
        tokenizers: one tokenizer per model, in the same order as ``models``.
        question: the question text.
        context: the passage to extract the answer from.
        max_sequence_len: inputs are truncated/padded to this many tokens.
        verbose: if true, print the question, context and answer.
        device: torch device the encoded inputs are moved to.
        mode: weighting scheme forwarded to ``get_ensembled_result``.

    Returns:
        The decoded answer string (special tokens removed).

    Raises:
        ValueError: if ``models`` and ``tokenizers`` differ in length.
    """
    if len(models)!=len(tokenizers):
        raise ValueError('models and tokenizers must have the same length')
    all_inputs=[tokenizer.encode_plus(question, context, max_length=max_sequence_len, truncation=True, padding='max_length', return_tensors='pt') for tokenizer in tokenizers]
    all_start_probs,all_end_probs=[],[]
    with torch.no_grad():
        for index,model in enumerate(models):
            # Keep the returned encoding instead of relying on in-place mutation.
            all_inputs[index]=all_inputs[index].to(device)
            result=model(**all_inputs[index])
            # Batch size is 1: take row 0 of the softmax over token positions.
            all_start_probs.append(torch.softmax(result.start_logits,dim=1)[0].cpu().numpy())
            all_end_probs.append(torch.softmax(result.end_logits,dim=1)[0].cpu().numpy())
    model_index,answer_start,answer_end=get_ensembled_result(all_start_probs,all_end_probs,mode=mode)
    # Decode from plain Python ids so this works no matter which device holds the tensor.
    answer_ids=all_inputs[model_index]['input_ids'][0,answer_start:answer_end+1].tolist()
    answer=tokenizers[model_index].decode(answer_ids,skip_special_tokens=True)
    if verbose:
        print('question:',question)
        print('context:',context)
        print('answer:',answer)
    return answer

In [7]:
# Score the ensemble on the first NUM_EXAMPLES training examples with a lenient
# match (exact, or one answer containing the other) and report timings.
NUM_EXAMPLES=10
predictions={}
trues={}
start_time=time.time()
total_examples=0
correct_examples=0
device='cuda:0' if torch.cuda.is_available() else 'cpu'
for model in models:
    model.to(device)
    model.eval()
for example in tqdm(train_examples[:NUM_EXAMPLES]):
    one_example_start_time=time.time()
    answer=get_ensembled_answer(models,tokenizers,example.question_text,example.context_text,max_sequence_len=384,verbose=False,device=device)
    # answer_text can be None (e.g. unanswerable SQuAD v2 questions); treat it as the empty answer.
    true_answer=(example.answer_text or '').lower()
    predictions[example.qas_id]=answer
    trues[example.qas_id]=true_answer
    # Containment only counts when both strings are non-empty: '' is a
    # substring of everything and would otherwise always score as correct.
    is_correct=answer==true_answer or (bool(answer) and bool(true_answer) and (answer in true_answer or true_answer in answer))
    correct_examples+=int(is_correct)
    total_examples+=1
    print('it takes {} to predict one example.'.format(time.time()-one_example_start_time))
print('it takes {} to predict {} examples.'.format(time.time()-start_time,total_examples))
print(predictions)
# Guard against an empty slice so the summary never divides by zero.
print('correct rate:',correct_examples/total_examples if total_examples else 0.0)
print(trues)

 10%|█         | 1/10 [00:17<02:41, 17.96s/it]

it takes 17.958550453186035 to predict one example.


 20%|██        | 2/10 [00:35<02:22, 17.83s/it]

it takes 17.537368774414062 to predict one example.


 30%|███       | 3/10 [00:54<02:07, 18.19s/it]

it takes 19.011814832687378 to predict one example.


 40%|████      | 4/10 [01:14<01:52, 18.70s/it]

it takes 19.892311334609985 to predict one example.


 50%|█████     | 5/10 [01:36<01:38, 19.75s/it]

it takes 22.203704357147217 to predict one example.


 60%|██████    | 6/10 [01:55<01:17, 19.45s/it]

it takes 18.738569974899292 to predict one example.


 70%|███████   | 7/10 [02:16<01:00, 20.01s/it]

it takes 21.329211711883545 to predict one example.


 80%|████████  | 8/10 [02:37<00:40, 20.15s/it]

it takes 20.48137331008911 to predict one example.


 90%|█████████ | 9/10 [02:58<00:20, 20.51s/it]

it takes 21.33862590789795 to predict one example.


100%|██████████| 10/10 [03:28<00:00, 20.82s/it]

it takes 29.713159799575806 to predict one example.
it takes 208.26357555389404 to predict 10 examples.
{'56be85543aeaaa14008c9063': 'late 1990s', '56be85543aeaaa14008c9065': 'singing and dancing', '56be85543aeaaa14008c9066': '2003', '56bf6b0f3aeaaa14008c9601': 'houston, texas', '56bf6b0f3aeaaa14008c9602': '1990s', '56bf6b0f3aeaaa14008c9603': "destiny's child", '56bf6b0f3aeaaa14008c9604': 'dangerously in love', '56bf6b0f3aeaaa14008c9605': 'mathew knowles', '56d43c5f2ccc5a1400d830a9': 'late 1990s', '56d43c5f2ccc5a1400d830aa': 'lead singer'}
correct rate: 1.0
{'56be85543aeaaa14008c9063': 'in the late 1990s', '56be85543aeaaa14008c9065': 'singing and dancing', '56be85543aeaaa14008c9066': '2003', '56bf6b0f3aeaaa14008c9601': 'houston, texas', '56bf6b0f3aeaaa14008c9602': 'late 1990s', '56bf6b0f3aeaaa14008c9603': "destiny's child", '56bf6b0f3aeaaa14008c9604': 'dangerously in love', '56bf6b0f3aeaaa14008c9605': 'mathew knowles', '56d43c5f2ccc5a1400d830a9': 'late 1990s', '56d43c5f2ccc5a1400d830aa


