## Sentence-level online prompt mining: MLQA

In [36]:
import copy
import glob
import json
import os
import re
import sys
from collections import Counter, defaultdict

import jsonlines
import numpy as np
import pandas as pd

from exploring_sentence_level import (
    load_model,
    mine_prompt_gt,
    segment_sentence,
    run_online_prompt_mining
)

### 0. Download dataset

```bash
cd ../scripts
bash ./download_mlqa.sh
```

### 1. Process dataset

In [37]:
MLQA_BASE_DIR = '../data/mlqa/MLQA_V1/'


def _load_mlqa_articles(split, question_lang):
    """Return the ``data`` list of one MLQA file (English context, questions in ``question_lang``).

    ``split`` is the MLQA folder / file prefix, i.e. ``'dev'`` or ``'test'``.
    """
    path = os.path.join(MLQA_BASE_DIR, split, f'{split}-context-en-question-{question_lang}.json')
    # Context manager closes the handle right away instead of leaving it to the garbage collector.
    with open(path, 'r') as f:
        return json.load(f)['data']


mlqa_xx = {}
MLQA_LANGS = ['en', 'ar', 'de', 'es', 'hi', 'vi', 'zh']
for lang in MLQA_LANGS:
    # NOTE: every value is kept as a 1-tuple ``(articles,)`` because the cells
    # below read the article list through ``mlqa_xx[key][0]``.
    mlqa_xx[f'{lang}_val'] = (_load_mlqa_articles('dev', lang),)
    mlqa_xx[f'{lang}_test'] = (_load_mlqa_articles('test', lang),)
MLQA_LANGS

['en', 'ar', 'de', 'es', 'hi', 'vi', 'zh']

In [38]:
# Each mlqa_xx value is a 1-tuple wrapping the article list, so [0] selects the list itself.
len(mlqa_xx['ar_test'][0])

2389

In [39]:
def get_squad_answer_str(context, qas):
    """Pair ``context`` with every question in ``qas``.

    Args:
        context: paragraph text shared by all questions in ``qas``.
        qas: SQuAD-style question entries; each has a ``'question'`` and a
            non-empty ``'answers'`` list whose items carry ``'text'`` and
            ``'answer_start'``.

    Returns:
        A list of ``(context, question, answer, answer_start)`` tuples, one per
        entry of ``qas``. Only the first answer of each question is used.
    """
    return [
        (
            context,
            entry['question'],
            entry['answers'][0]['text'],
            entry['answers'][0]['answer_start'],
        )
        for entry in qas
    ]

In [40]:
# Per question language: QA items grouped by split name ('val' / 'test').
mlqa_xx_dataset = defaultdict(lambda: {'val': [], 'test': []})

# Per question language: one record per segmented context sentence.
mlqa_sentences = defaultdict(list)


for lang in MLQA_LANGS:
    # Ids restart for every language but keep counting across the val and test
    # splits, so they are unique within one language's sentence list.
    global_paragraph_id = 0
    global_sentence_id = 0
    for split_name in ['val', 'test']:
        # [0]: each mlqa_xx value wraps its article list in a 1-tuple.
        for item in mlqa_xx[f'{lang}_{split_name}'][0]:

            title = item['title']

            for paragraph in item['paragraphs']:

                context = paragraph['context']
                context_qa_pairs = get_squad_answer_str(context=context, qas=paragraph['qas'])
                # Segment once per paragraph; the result is reused for the
                # sentence records and for every QA item of this paragraph.
                segmented_context = segment_sentence(context)
                segmented_context_ids = []

                for sentence in segmented_context:
                    mlqa_sentences[lang].append((title, global_paragraph_id, global_sentence_id, sentence, split_name))
                    segmented_context_ids.append(global_sentence_id)
                    global_sentence_id += 1

                for context_qa_pair in context_qa_pairs:
                    _, question, answer, answer_start = context_qa_pair
                    # The returned index is local to this paragraph's sentences;
                    # it is mapped to the per-language global sentence id below.
                    gt_sentence, gt_sentence_idx = mine_prompt_gt(context_qa_pair)
                    gt_sentence_global_idx = segmented_context_ids[gt_sentence_idx]

                    qa_item = {
                        'question': question,
                        'context': context,
                        # Shallow copy so every item still owns its own list.
                        'segmented_context': list(segmented_context),
                        'segmented_context_ids': segmented_context_ids,
                        'answer': answer,
                        'answer_start': answer_start,
                        'split_name': split_name,
                        'gt_sentence': gt_sentence,
                        # Global (per-language) sentence id, not the paragraph-local index.
                        'gt_sentence_idx': gt_sentence_global_idx,
                    }
                    mlqa_xx_dataset[lang][split_name].append(qa_item)
                global_paragraph_id += 1

In [41]:
# Number of QA items in the val and test splits for Arabic questions.
len(mlqa_xx_dataset['ar']['val']), \
len(mlqa_xx_dataset['ar']['test'])

(517, 5335)

In [42]:
# Inspect the first processed QA item of the English-question test split.
mlqa_xx_dataset['en']['test'][0]

{'question': 'Who analyzed the biopsies?',
 'context': 'In 1994, five unnamed civilian contractors and the widows of contractors Walter Kasza and Robert Frost sued the USAF and the United States Environmental Protection Agency. Their suit, in which they were represented by George Washington University law professor Jonathan Turley, alleged they had been present when large quantities of unknown chemicals had been burned in open pits and trenches at Groom. Biopsies taken from the complainants were analyzed by Rutgers University biochemists, who found high levels of dioxin, dibenzofuran, and trichloroethylene in their body fat. The complainants alleged they had sustained skin, liver, and respiratory injuries due to their work at Groom, and that this had contributed to the deaths of Frost and Kasza. The suit sought compensation for the injuries they had sustained, claiming the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conser

#### Write segmented sentences (all languages) into separate CSV files:


In [43]:
# Languages collected, how many there are, and the number of sentence records for Arabic questions.
list(mlqa_sentences.keys()), len(mlqa_sentences), len(mlqa_sentences['ar'])

(['en', 'ar', 'de', 'es', 'hi', 'vi', 'zh'], 7, 34917)

In [44]:
# Write one CSV of segmented context sentences per question language.
_sentences_out_dir = './question-sentences-pairs/mlqa'
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(_sentences_out_dir, exist_ok=True)

for lang, sentence_rows in mlqa_sentences.items():
    if not sentence_rows:
        # Nothing collected for this language: skip instead of writing an empty file.
        continue
    mlqa_sentences_df = pd.DataFrame(
        sentence_rows,
        columns=['doc_title', 'paragraph_id', 'sentence_id', 'sentence', 'split_name'],
    )
    mlqa_sentences_df.to_csv(f'{_sentences_out_dir}/mlqa_sentence-en_for-question-{lang}.csv')

#### Write question-sentence pairs (all languages) into separate CSV files:



In [45]:
# Question languages available, and the split names stored for one of them.
mlqa_xx_dataset.keys(), mlqa_xx_dataset['zh'].keys()

(dict_keys(['en', 'ar', 'de', 'es', 'hi', 'vi', 'zh']),
 dict_keys(['val', 'test']))

In [46]:
# Inspect the first processed QA item of the Chinese-question val split.
mlqa_xx_dataset['zh']['val'][0]

{'question': '这项工作最初的计划名称是什么？',
 'context': 'Poe planned for years to produce his own journal The Penn (later renamed The Stylus), but he died before it could be produced. He died in Baltimore on October 7, 1849, at age 40; the cause of his death is unknown and has been variously attributed to alcohol, "brain congestion", cholera, drugs, heart disease, rabies, suicide, tuberculosis, and other causes.Poe and his works influenced literature around the world, as well as specialized fields such as cosmology and cryptography. He and his work appear throughout popular culture in literature, music, films, and television. A number of his homes are dedicated museums today. The Mystery Writers of America present an annual award known as the Edgar Award for distinguished work in the mystery genre.',
 'segmented_context': ['Poe planned for years to produce his own journal The Penn (later renamed The Stylus), but he died before it could be produced.',
  'He died in Baltimore on October 7, 1849, at 

In [47]:
# Write one CSV of (question, ground-truth sentence id) pairs per question language and split.
_pairs_out_dir = './question-sentences-pairs/mlqa'
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(_pairs_out_dir, exist_ok=True)

for question_lang, splits in mlqa_xx_dataset.items():

    for split_name, qa_items in splits.items():

        if not qa_items:
            # Empty split: skip instead of writing an empty file.
            continue

        mlqa_question_sentence_pairs_df = pd.DataFrame(
            [(qa_item['question'], qa_item['gt_sentence_idx']) for qa_item in qa_items],
            columns=['question', 'gt_sentence_idx'],
        )
        mlqa_question_sentence_pairs_df.to_csv(f'{_pairs_out_dir}/mlqa-{split_name}_question-{question_lang}_sentence-en.csv')

### 2. Compute question-sentence similarity


#### 2.1 Load models

##### a) Load mUSE_small (v3) model (as a baseline)

In [8]:
# Baseline: multilingual Universal Sentence Encoder (version 3), loaded from its TF Hub URL.
muse_small_v3_model = load_model('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3')


##### b) Load teacher models

In [9]:
# Teacher checkpoints all live under <store>/<DATASET>/teacher_model/.
_TEACHER_DIR_TEMPLATE = '../../../../CL-ReLKT_store/models/{dataset}/teacher_model/'

XQUAD_TEACHER_DIR = _TEACHER_DIR_TEMPLATE.format(dataset='XQUAD')
MLQA_TEACHER_DIR = _TEACHER_DIR_TEMPLATE.format(dataset='MLQA')

In [10]:
# Load the teacher models from the XQUAD and MLQA teacher checkpoint directories.
xquad_teacher_model = load_model(XQUAD_TEACHER_DIR)
mlqa_teacher_model = load_model(MLQA_TEACHER_DIR)

##### c) Load student models

In [11]:
# Student checkpoints live under <store>/<DATASET>/student_best_<supported|unsupported>_languages/.
_STUDENT_DIR_TEMPLATE = '../../../../CL-ReLKT_store/models/{dataset}/student_best_{support}_languages/'

XQUAD_STUDENT_SUPPORTED_LANGS_DIR = _STUDENT_DIR_TEMPLATE.format(dataset='XQUAD', support='supported')
XQUAD_STUDENT_UNSUPPORTED_LANGS_DIR = _STUDENT_DIR_TEMPLATE.format(dataset='XQUAD', support='unsupported')

XORQA_STUDENT_SUPPORTED_LANGS_DIR = _STUDENT_DIR_TEMPLATE.format(dataset='XORQA', support='supported')
XORQA_STUDENT_UNSUPPORTED_LANGS_DIR = _STUDENT_DIR_TEMPLATE.format(dataset='XORQA', support='unsupported')

MLQA_STUDENT_SUPPORTED_LANGS_DIR = _STUDENT_DIR_TEMPLATE.format(dataset='MLQA', support='supported')
MLQA_STUDENT_UNSUPPORTED_LANGS_DIR = _STUDENT_DIR_TEMPLATE.format(dataset='MLQA', support='unsupported')

In [None]:
# Student models loaded from the "supported languages" checkpoint directories (XQUAD, XORQA, MLQA).
xquad_student_supported_langs_model = load_model(XQUAD_STUDENT_SUPPORTED_LANGS_DIR)
xorqa_student_supported_langs_model = load_model(XORQA_STUDENT_SUPPORTED_LANGS_DIR)
mlqa_student_supported_langs_model = load_model(MLQA_STUDENT_SUPPORTED_LANGS_DIR)

# Student models loaded from the "unsupported languages" checkpoint directories (XQUAD, XORQA, MLQA).
xquad_student_unsupported_langs_model = load_model(XQUAD_STUDENT_UNSUPPORTED_LANGS_DIR)
xorqa_student_unsupported_langs_model = load_model(XORQA_STUDENT_UNSUPPORTED_LANGS_DIR)
mlqa_student_unsupported_langs_model = load_model(MLQA_STUDENT_UNSUPPORTED_LANGS_DIR)

In [None]:
# (short name, loaded model) pairs; the order here is the order in which the
# models are evaluated when MODEL_MAPPING is iterated.
_named_models = [
    # mUSE_small
    ('muse_small_v3', muse_small_v3_model),
    # teacher
    ('xquad_teacher', xquad_teacher_model),
    ('mlqa_teacher', mlqa_teacher_model),
    # student
    ('xquad_student_supported_langs', xquad_student_supported_langs_model),
    ('xorqa_student_supported_langs', xorqa_student_supported_langs_model),
    ('mlqa_student_supported_langs', mlqa_student_supported_langs_model),
    ('xquad_student_unsupported_langs', xquad_student_unsupported_langs_model),
    ('xorqa_student_unsupported_langs', xorqa_student_unsupported_langs_model),
    ('mlqa_student_unsupported_langs', mlqa_student_unsupported_langs_model),
]

# Every key carries the 'model-' prefix, e.g. 'model-mlqa_teacher'.
MODEL_MAPPING = {f'model-{short_name}': model for short_name, model in _named_models}



In [None]:
# Map 'dataset-mlqa_<lang>_<split>' to the list of QA items of that language and split.
DATASET_MAPPING = {}
for lang in MLQA_LANGS:
    lang_key = lang.strip()
    for split in ('val', 'test'):
        DATASET_MAPPING[f'dataset-mlqa_{lang_key}_{split}'] = mlqa_xx_dataset[lang][split]

print(DATASET_MAPPING.keys())

#### 2.2 Run inference and evaluate

The following function `run_online_prompt_mining` iterates over question-answer-passage triplets $(q_i, a_i, p_i)$ and computes 
the cosine similarity scores between the question $q_i$ and the segmented sentences $s^i_j \textrm{ where } p_i = ( s^i_0, \ldots , s^i_{|p_i| - 1} )$, and ranks each question-sentence pair by similarity score. Then, it evaluates the sentence-level precision@k.  Note: There is only 1 ground-truth sentence (i.e. the sentence that contains the answer span). 


In [None]:
# results[dataset_prefix][model_prefix] holds whatever run_online_prompt_mining
# returned for that dataset/model combination.
results = defaultdict(dict)

for dataset_prefix, dataset in DATASET_MAPPING.items():
    print(f'\n\ndataset_prefix: {dataset_prefix}')
    for model_prefix, model in MODEL_MAPPING.items():

        print(f'\n - model_prefix: {model_prefix}')
        # The run prefix joins dataset and model names with an underscore.
        _result = run_online_prompt_mining(dataset,
                             prefix=f'{dataset_prefix}_{model_prefix}',
                             model=model)

        results[dataset_prefix][model_prefix] = _result
        print('--'*50)
    print('\n')
    print('=='*50)
    print('\n')




dataset_prefix: dataset-mlqa_en_val

 - model_prefix: model-muse_small_v3


100%|██████████| 1148/1148 [02:24<00:00,  7.97it/s]



	Evaluation result:
	 - Accuracy: 0.7125
	 - precision_at_k:
{1: 0.7125435540069687,
 2: 0.8614982578397212,
 3: 0.9242160278745645,
 4: 0.9512195121951219,
 5: 0.9634146341463414,
 6: 0.9747386759581882,
 7: 0.9825783972125436,
 8: 0.9878048780487805,
 9: 0.9895470383275261,
 10: 0.9912891986062717}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xquad_teacher


100%|██████████| 1148/1148 [02:05<00:00,  9.18it/s]



	Evaluation result:
	 - Accuracy: 0.7247
	 - precision_at_k:
{1: 0.7247386759581882,
 2: 0.8641114982578397,
 3: 0.9259581881533101,
 4: 0.9494773519163763,
 5: 0.9651567944250871,
 6: 0.975609756097561,
 7: 0.985191637630662,
 8: 0.9878048780487805,
 9: 0.9895470383275261,
 10: 0.9921602787456446}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|██████████| 1148/1148 [02:21<00:00,  8.11it/s]



	Evaluation result:
	 - Accuracy: 0.7265
	 - precision_at_k:
{1: 0.7264808362369338,
 2: 0.8693379790940766,
 3: 0.9233449477351916,
 4: 0.9494773519163763,
 5: 0.9651567944250871,
 6: 0.9747386759581882,
 7: 0.9817073170731707,
 8: 0.990418118466899,
 9: 0.9912891986062717,
 10: 0.9930313588850174}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xquad_student_supported_langs


100%|██████████| 1148/1148 [02:22<00:00,  8.03it/s]



	Evaluation result:
	 - Accuracy: 0.5166
	 - precision_at_k:
{1: 0.5165505226480837,
 2: 0.7125435540069687,
 3: 0.823170731707317,
 4: 0.8797909407665505,
 5: 0.9172473867595818,
 6: 0.9416376306620209,
 7: 0.9581881533101045,
 8: 0.9686411149825784,
 9: 0.9747386759581882,
 10: 0.980836236933798}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xorqa_student_supported_langs


100%|██████████| 1148/1148 [02:19<00:00,  8.24it/s]



	Evaluation result:
	 - Accuracy: 0.6916
	 - precision_at_k:
{1: 0.6916376306620209,
 2: 0.8336236933797909,
 3: 0.9033101045296167,
 4: 0.9346689895470384,
 5: 0.9512195121951219,
 6: 0.9686411149825784,
 7: 0.9799651567944251,
 8: 0.9843205574912892,
 9: 0.9895470383275261,
 10: 0.9921602787456446}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_student_supported_langs


100%|██████████| 1148/1148 [02:22<00:00,  8.06it/s]



	Evaluation result:
	 - Accuracy: 0.7265
	 - precision_at_k:
{1: 0.7264808362369338,
 2: 0.8667247386759582,
 3: 0.921602787456446,
 4: 0.9477351916376306,
 5: 0.9634146341463414,
 6: 0.9738675958188153,
 7: 0.9825783972125436,
 8: 0.990418118466899,
 9: 0.990418118466899,
 10: 0.9930313588850174}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xquad_student_unsupported_langs


100%|██████████| 1148/1148 [02:26<00:00,  7.84it/s]



	Evaluation result:
	 - Accuracy: 0.4007
	 - precision_at_k:
{1: 0.40069686411149824,
 2: 0.5984320557491289,
 3: 0.7412891986062717,
 4: 0.8310104529616724,
 5: 0.8806620209059234,
 6: 0.9111498257839721,
 7: 0.936411149825784,
 8: 0.9503484320557491,
 9: 0.9625435540069687,
 10: 0.9686411149825784}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xorqa_student_unsupported_langs


100%|██████████| 1148/1148 [02:23<00:00,  8.02it/s]



	Evaluation result:
	 - Accuracy: 0.6838
	 - precision_at_k:
{1: 0.6837979094076655,
 2: 0.8153310104529616,
 3: 0.89198606271777,
 4: 0.936411149825784,
 5: 0.9529616724738676,
 6: 0.9660278745644599,
 7: 0.980836236933798,
 8: 0.9843205574912892,
 9: 0.9886759581881533,
 10: 0.990418118466899}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_student_unsupported_langs


100%|██████████| 1148/1148 [02:23<00:00,  8.02it/s]



	Evaluation result:
	 - Accuracy: 0.7073
	 - precision_at_k:
{1: 0.7073170731707317,
 2: 0.8466898954703833,
 3: 0.912020905923345,
 4: 0.9442508710801394,
 5: 0.9590592334494773,
 6: 0.9703832752613241,
 7: 0.980836236933798,
 8: 0.9869337979094077,
 9: 0.9886759581881533,
 10: 0.9921602787456446}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_en_test

 - model_prefix: model-muse_small_v3


100%|██████████| 11590/11590 [23:08<00:00,  8.35it/s]



	Evaluation result:
	 - Accuracy: 0.6963
	 - precision_at_k:
{1: 0.6962899050905953,
 2: 0.844521138912856,
 3: 0.911130284728214,
 4: 0.9438308886971527,
 5: 0.9614322691975842,
 6: 0.9720448662640208,
 7: 0.9811044003451251,
 8: 0.9855910267471959,
 9: 0.9883520276100086,
 10: 0.9907679033649698}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xquad_teacher


100%|██████████| 11590/11590 [23:19<00:00,  8.28it/s]



	Evaluation result:
	 - Accuracy: 0.7035
	 - precision_at_k:
{1: 0.7034512510785159,
 2: 0.8440897325280414,
 3: 0.9097497842968076,
 4: 0.9440034512510785,
 5: 0.962381363244176,
 6: 0.9715271786022434,
 7: 0.9805004314063848,
 8: 0.9858498705780846,
 9: 0.9885245901639345,
 10: 0.991458153580673}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|██████████| 11590/11590 [23:30<00:00,  8.21it/s]



	Evaluation result:
	 - Accuracy: 0.7112
	 - precision_at_k:
{1: 0.7112165660051769,
 2: 0.8484037963761863,
 3: 0.9101811906816221,
 4: 0.9446937014667817,
 5: 0.9602243313201035,
 6: 0.9715271786022434,
 7: 0.980327868852459,
 8: 0.9855910267471959,
 9: 0.9891285591026747,
 10: 0.9904227782571182}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xquad_student_supported_langs


  6%|▌         | 710/11590 [01:26<23:32,  7.70it/s]

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes NumPy arrays and NumPy scalars.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert NumPy values to plain Python objects.

        Args:
            obj: an object the standard encoder could not serialize.

        Returns:
            A nested list for ``np.ndarray`` or a Python scalar for NumPy
            scalar types.

        Raises:
            TypeError: for any other unsupported type (raised by the base class).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # NumPy scalars such as np.float32 / np.int64 are not JSON
            # serializable by default; .item() yields the Python equivalent.
            return obj.item()
        return super().default(obj)


#### 2.3 Write result as JSON file

In [None]:
# Persist the evaluation results. NumpyEncoder turns NumPy arrays in the results
# into plain lists; the context manager flushes and closes the file.
with open('./eval_results.dataset_name-mlqa.json', 'w') as f:
    json.dump(results, f, ensure_ascii=False, indent=2, cls=NumpyEncoder)

### 3. Convert evaluation results to a pandas.DataFrame

In [None]:
# Reload the saved evaluation results; the context manager closes the file after reading.
with open('./eval_results.dataset_name-mlqa.json', 'r') as f:
    results = json.load(f)

In [None]:
# Dataset names present in the reloaded results, and how many there are.
list(results.keys()), len(list(results.keys()))

In [None]:
# Flatten results[dataset][model] into one row per (dataset, model) pair.
result_objs = []
for dataset_name, result_model_group in results.items():
    for model_name, (metric, _raw_result) in result_model_group.items():
        top1, precision_at_k = metric

        row = {
            'dataset_name': dataset_name,
            'model_name': model_name,
            'precision_at_1': top1,
        }
        # The results were reloaded from JSON, so the k keys are strings.
        # Each column reads its own k (precision_at_3 previously read k=6).
        for k in (2, 3, 4, 5, 10):
            row[f'precision_at_{k}'] = precision_at_k[str(k)]
        result_objs.append(row)

df = pd.DataFrame.from_dict(result_objs)
df.to_csv('./eval_results.dataset_name-mlqa.csv')

In [None]:
# Display the per-dataset, per-model precision table.
df