## Sentence-level online prompty mining: XQUAD

In [78]:
import copy
import re
import os, sys
import json
import glob
from collections import Counter, defaultdict
import pandas as pd
import jsonlines

from exploring_sentence_level import (
    load_model,
    mine_prompt_gt,  
    segment_sentence,
    run_online_prompt_mining
)

### 0. Download dataset

```bash
cd ../scripts
bash ./download_xquad_v1.1.sh
```

### 1. Process dataset

In [79]:
XQUAD_BASE_DIR = '../data/xquad/xx/'
xquad_en = json.load(open(os.path.join(XQUAD_BASE_DIR, 'xquad.en.json'), 'r'))
xquad_en.keys(), \
xquad_en['version']


xquad_xx = {}
XQUAD_LANGS = ['ar', 'de', 'el', 'en', 'es', 'hi', 'ro', 'ru', 'th', 'tr', 'vi', 'zh']
for lang in XQUAD_LANGS:
    xquad_xx[f'{lang}'] = json.load(open(os.path.join(XQUAD_BASE_DIR, f'xquad.{lang}.json'), 'r'))['data']


In [80]:
def get_squad_answer_str(context, qas):
    context_qa_pairs = []
    for qa in qas:
        question = qa['question']
        answer = qa['answers'][0]['text']
        answer_start = qa['answers'][0]['answer_start']
        context_qa_pairs.append((context, question, answer, answer_start))
    return context_qa_pairs

In [81]:
item = xquad_en['data'][3]['paragraphs'][0]
context_qa_pairs = get_squad_answer_str(context=item['context'], qas=item['qas'])
context_qa_pairs[0]

('Tesla was renowned for his achievements and showmanship, eventually earning him a reputation in popular culture as an archetypal "mad scientist". His patents earned him a considerable amount of money, much of which was used to finance his own projects with varying degrees of success.:121,154 He lived most of his life in a series of New York hotels, through his retirement. Tesla died on 7 January 1943. His work fell into relative obscurity after his death, but in 1960 the General Conference on Weights and Measures named the SI unit of magnetic flux density the tesla in his honor. There has been a resurgence in popular interest in Tesla since the 1990s.',
 'What year did Tesla die? ',
 '1943',
 399)


```python
class xquad_dataset_item:
    question: str
    context: str
    segmented_context: str
    answer: str
    answer_start: int
    gt_sentence: str
```

In [82]:
xquad_question_counter = Counter()
n_paragraph = len(xquad_en['data'])
xquad_dataset=[]

xquad_sentences = []
global_paragraph_id = 0
global_sentence_id = 0

qid2sent = {}

for i, item in enumerate(xquad_en['data']):

    title = item['title']
    paragraphs = item['paragraphs']
    print('.' ,end='')
    for j, paragraph in enumerate(paragraphs):
        xquad_question_counter[f'd-{i}_p-{j}'] = len(paragraph['qas'])
        
        context = paragraph['context']
        context_qa_pairs = get_squad_answer_str(context=context, qas=paragraph['qas'])
        segmented_context = segment_sentence(context)
        segmented_context_ids = []
        for sentence_id in range(len(segmented_context)):
            xquad_sentences.append((title, global_paragraph_id, global_sentence_id, segmented_context[sentence_id]))
            segmented_context_ids.append(global_sentence_id)
            global_sentence_id += 1

        for k, context_qa_pair in enumerate(context_qa_pairs):
            context, question, answer, answer_start = context_qa_pair
            qas_item = paragraph['qas'][k]
            question_id = qas_item['id']
            gt_sentence, gt_sentence_idx = mine_prompt_gt(context_qa_pair)
            gt_sentence_global_idx = segmented_context_ids[gt_sentence_idx]
            qid2sent[question_id] = {
                'gt_sentence': gt_sentence,
                'gt_sentence_idx': gt_sentence_global_idx
            }
            qa_item = {
                 'question': question,
                 'question_id': question_id,
                 'context': context,
                 'segmented_context': segmented_context,
                 'segmented_context_ids': segmented_context_ids,
                 'answer': answer,
                 'answer_start': answer_start,
                 'gt_sentence': gt_sentence,
                 'gt_sentence_idx': gt_sentence_global_idx,

            }
            xquad_dataset.append(qa_item)
        global_paragraph_id += 1

................................................

In [83]:
# qid2sent

In [85]:
xquad_xx_dataset = defaultdict(lambda: [])

for lang in XQUAD_LANGS:

    for i, item in enumerate(xquad_xx[lang]):

        title = item['title']
        paragraphs = item['paragraphs']
        for j, paragraph in enumerate(paragraphs):
            
            qas_items = paragraph['qas']

            for qas_item in qas_items:
                question = qas_item['question']
                question_id = qas_item['id']
                answer = qas_item['answers'][0]['text']
                answer_start = qas_item['answers'][0]['answer_start']

                gt_sentence_item = qid2sent[question_id]
                gt_sentence, gt_sentence_id = gt_sentence_item['gt_sentence'], gt_sentence_item['gt_sentence_idx']


                qa_item = {
                    'question': question,
                    'question_id': question_id,
                    'answer': answer,
                    'answer_start': answer_start,
                    'gt_sentence': gt_sentence,
                    'gt_sentence_idx': gt_sentence_id,

                }
                xquad_xx_dataset[lang].append(qa_item)
            global_paragraph_id += 1


In [86]:
int(sum(xquad_question_counter.values()))

1190

#### Write (English) segmented sentences into separated csv file:

In [87]:
len(xquad_sentences), xquad_sentences[0]

(1202,
 ('Super_Bowl_50',
  0,
  0,
  'The Panthers defense gave up just 308 points, ranking sixth in the league, while also leading the NFL in interceptions with 24 and boasting four Pro Bowl selections.'))

In [88]:
xquad_en_sentences_df = pd.DataFrame.from_dict(xquad_sentences)
xquad_en_sentences_df.columns=['doc_title', 'paragraph_id', 'sentence_id', 'sentence']

In [89]:
xquad_en_sentences_df.head(15)

Unnamed: 0,doc_title,paragraph_id,sentence_id,sentence
0,Super_Bowl_50,0,0,"The Panthers defense gave up just 308 points, ..."
1,Super_Bowl_50,0,1,Pro Bowl defensive tackle Kawann Short led the...
2,Super_Bowl_50,0,2,Fellow lineman Mario Addison added 6½ sacks.
3,Super_Bowl_50,0,3,The Panthers line also featured veteran defens...
4,Super_Bowl_50,0,4,"Behind them, two of the Panthers three startin..."
5,Super_Bowl_50,0,5,"Davis compiled 5½ sacks, four forced fumbles, ..."
6,Super_Bowl_50,0,6,Carolina's secondary featured Pro Bowl safety ...
7,Super_Bowl_50,1,7,The Broncos defeated the Pittsburgh Steelers i...
8,Super_Bowl_50,1,8,They then beat the defending Super Bowl XLIX c...
9,Super_Bowl_50,1,9,Despite Manning's problems with interceptions ...


In [90]:
xquad_en_sentences_df.to_csv('./question-sentences-pairs/xquad/xquad_sentence-en.csv')

#### Write (All languages) question-sentence pairs into separated csv file:



In [91]:
list(xquad_xx_dataset.keys()), len(xquad_xx_dataset['zh'])

(['ar', 'de', 'el', 'en', 'es', 'hi', 'ro', 'ru', 'th', 'tr', 'vi', 'zh'],
 1190)

In [92]:
for question_lang in list(xquad_xx_dataset.keys()):
    

    if len(xquad_xx_dataset[question_lang]) > 0:
        xquad_question_sentence_pairs_df = pd.DataFrame.from_dict(list(map(lambda x: (x['question'], x['gt_sentence_idx']), xquad_xx_dataset[question_lang])))
        xquad_question_sentence_pairs_df.columns = ['question' , 'gt_sentence_idx']

        xquad_question_sentence_pairs_df.to_csv(f'./question-sentences-pairs/xquad/xquad_question-{question_lang}_sentence-en.csv')
        
        

### 2. Compute question-sentence similarity


#### 2.1 Load models

##### a) Load mUSE_small (v3) model (as a baseline)

In [8]:
muse_small_v3_model = load_model('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3')


##### b) Load teacher models

In [9]:
XQUAD_TEACHER_DIR = '../../../../CL-ReLKT_store/models/XQUAD/teacher_model/'
MLQA_TEACHER_DIR = '../../../../CL-ReLKT_store/models/MLQA/teacher_model/'

In [10]:
xquad_teacher_model = load_model(XQUAD_TEACHER_DIR)
mlqa_teacher_model = load_model(MLQA_TEACHER_DIR)

##### c) Load student models

In [11]:
XQUAD_STUDENT_SUPPORTED_LANGS_DIR = '../../../../CL-ReLKT_store/models/XQUAD/student_best_supported_languages/'
XQUAD_STUDENT_UNSUPPORTED_LANGS_DIR = '../../../../CL-ReLKT_store/models/XQUAD/student_best_unsupported_languages/'

XORQA_STUDENT_SUPPORTED_LANGS_DIR = '../../../../CL-ReLKT_store/models/XORQA/student_best_supported_languages/'
XORQA_STUDENT_UNSUPPORTED_LANGS_DIR = '../../../../CL-ReLKT_store/models/XORQA/student_best_unsupported_languages/'

MLQA_STUDENT_SUPPORTED_LANGS_DIR = '../../../../CL-ReLKT_store/models/MLQA/student_best_supported_languages/'
MLQA_STUDENT_UNSUPPORTED_LANGS_DIR = '../../../../CL-ReLKT_store/models/MLQA/student_best_unsupported_languages/'

In [12]:
xquad_student_supported_langs_model = load_model(XQUAD_STUDENT_SUPPORTED_LANGS_DIR)
xorqa_student_supported_langs_model = load_model(XORQA_STUDENT_SUPPORTED_LANGS_DIR)
mlqa_student_supported_langs_model = load_model(MLQA_STUDENT_SUPPORTED_LANGS_DIR)

xquad_student_unsupported_langs_model = load_model(XQUAD_STUDENT_UNSUPPORTED_LANGS_DIR)
xorqa_student_unsupported_langs_model = load_model(XORQA_STUDENT_UNSUPPORTED_LANGS_DIR)
mlqa_student_unsupported_langs_model = load_model(MLQA_STUDENT_UNSUPPORTED_LANGS_DIR)

In [13]:
MODEL_MAPPING = {
  # mUSE_small
  'model-muse_small_v3': muse_small_v3_model,
  # teacher    
  'model-xquad_teacher': xquad_teacher_model,
  'model-mlqa_teacher': mlqa_teacher_model,
  # student
  'model-xquad_student_supported_langs': xquad_student_supported_langs_model,
  'model-xorqa_student_supported_langs': xorqa_student_supported_langs_model,
  'model-mlqa_student_supported_langs': mlqa_student_supported_langs_model,
  'model-xquad_student_unsupported_langs': xquad_student_unsupported_langs_model,
  'model-xorqa_student_unsupported_langs': xorqa_student_unsupported_langs_model,
  'model-mlqa_student_unsupported_langs': mlqa_student_unsupported_langs_model,
}


In [14]:

DATASET_MAPPING = {
    'dataset-xquad_en_train': xquad_dataset,
}
DATASET_MAPPING.keys()

dict_keys(['dataset-xquad_en_train'])

#### 2.2 Run inference and evaluate

The following function `run_online_prompt_mining` iterates over question-answer-passage triplets $(q_i, a_i, p_i)$ and compute 
the cosine similarity scores between question $q_i$ and segmented setences $s^i_j \textrm{ where } p_i = ( s^i_0, \ldots , s^i_{|p_i| - 1} )$ , and rank each quesiton-sentence pair by similairy score. Then, it evaluate the sentence-level precision@k.  Note: There is only 1 groundtruth sentence (i.e. the sentence where the answer span is a part of). 


In [None]:
results = defaultdict(lambda : defaultdict())

for dataset_prefix, dataset in DATASET_MAPPING.items():
    print(f'\n\ndataset_prefix: {dataset_prefix}')
    for model_prefix, model in MODEL_MAPPING.items():
        
        print(f'\n - model_prefix: {model_prefix}')
        prefix = f'{dataset_prefix}+{model_prefix}'
        _result = run_online_prompt_mining(dataset,
                             prefix=f'{dataset_prefix}_{model_prefix}',
                             model=model)


        results[dataset_prefix][model_prefix] = _result
        print('--'*50)
    print('\n')    
    print('=='*50)
    print('\n')    
    



dataset_prefix: dataset-xquad_en_train

 - model_prefix: model-muse_small_v3


100%|██████████| 1190/1190 [02:31<00:00,  7.84it/s]



	Evaluation result:
	 - Accuracy: 0.7118
	 - precision_at_k:
{1: 0.711764705882353,
 2: 0.8739495798319328,
 3: 0.9436974789915966,
 4: 0.9722689075630252,
 5: 0.9865546218487395,
 6: 0.9907563025210084,
 7: 0.9957983193277311,
 8: 0.9974789915966387,
 9: 0.9974789915966387,
 10: 0.9974789915966387}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xquad_teacher


100%|██████████| 1190/1190 [02:28<00:00,  8.02it/s]



	Evaluation result:
	 - Accuracy: 0.7160
	 - precision_at_k:
{1: 0.7159663865546219,
 2: 0.8789915966386554,
 3: 0.9352941176470588,
 4: 0.9714285714285714,
 5: 0.9882352941176471,
 6: 0.9932773109243698,
 7: 0.9957983193277311,
 8: 0.9966386554621849,
 9: 0.9974789915966387,
 10: 0.9974789915966387}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|██████████| 1190/1190 [02:33<00:00,  7.76it/s]



	Evaluation result:
	 - Accuracy: 0.7319
	 - precision_at_k:
{1: 0.7319327731092437,
 2: 0.8798319327731092,
 3: 0.9411764705882353,
 4: 0.9773109243697479,
 5: 0.9907563025210084,
 6: 0.9932773109243698,
 7: 0.9957983193277311,
 8: 0.9966386554621849,
 9: 0.9983193277310924,
 10: 0.9983193277310924}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-xquad_student_supported_langs


 86%|████████▌ | 1024/1190 [02:10<00:20,  8.27it/s]

In [19]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


#### 2.3 Write result as JSON file

In [20]:
json.dump(results, open('./eval_results.dataset_name-xquad.json', 'w'), ensure_ascii=False, indent=2)

### 3. Convert evaluation results to a pandas.DataFrame

In [21]:
results = json.load(open('./eval_results.dataset_name-xquad.json', 'r'))

In [22]:

list(results.keys()), len(list(results.keys()))

(['dataset-xquad_en_train'], 1)

In [27]:
result_objs = []
for dataset_name, result_model_group in results.items():
    for model_name, (metric, raw_result) in result_model_group.items():
        top1, precision_at_k = metric
        
        result_objs.append({
            'dataset_name': dataset_name,
            'model_name': model_name,
            'precision_at_1': top1,
            'precision_at_2': precision_at_k['2'],
            'precision_at_3': precision_at_k['6'],
            'precision_at_4': precision_at_k['4'],
            'precision_at_5': precision_at_k['5'],
            'precision_at_10': precision_at_k['10'],
        })
    
df = pd.DataFrame.from_dict(result_objs)
df.to_csv('./eval_results.dataset_name-xquad.csv')

In [28]:
df

Unnamed: 0,dataset_name,model_name,precision_at_1,precision_at_2,precision_at_3,precision_at_4,precision_at_5,precision_at_10
0,dataset-xquad_en_train,model-muse_small_v3,0.711765,0.87395,0.990756,0.972269,0.986555,0.997479
1,dataset-xquad_en_train,model-xquad_teacher,0.715966,0.878992,0.993277,0.971429,0.988235,0.997479
2,dataset-xquad_en_train,model-mlqa_teacher,0.731933,0.879832,0.993277,0.977311,0.990756,0.998319
3,dataset-xquad_en_train,model-xquad_student_supported_langs,0.468067,0.668908,0.981513,0.89916,0.945378,0.998319
4,dataset-xquad_en_train,model-xorqa_student_supported_langs,0.712605,0.851261,0.990756,0.968067,0.986555,0.997479
5,dataset-xquad_en_train,model-mlqa_student_supported_langs,0.737815,0.884034,0.993277,0.978992,0.989076,0.997479
6,dataset-xquad_en_train,model-xquad_student_unsupported_langs,0.401681,0.620168,0.968908,0.883193,0.941176,0.993277
7,dataset-xquad_en_train,model-xorqa_student_unsupported_langs,0.689076,0.840336,0.993277,0.962185,0.982353,0.997479
8,dataset-xquad_en_train,model-mlqa_student_unsupported_langs,0.717647,0.871429,0.993277,0.97563,0.988235,0.998319
