## 1. Import libraries and modules

In [1]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import time
import random
import nltk
from tqdm import tqdm
import torch
import os
import sys
import random
import json
import numpy as np
# Fetch the NLTK 'punkt' and 'stopwords' packages; the call only reports
# "already up-to-date" when they are present locally.
# NOTE(review): presumably required by the helpers in `functions` - confirm.
nltk.download('punkt')
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/phaxssi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/phaxssi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Put the sibling `../funcs` directory on the import path so that the
# project's helper module can be imported as `fn`. The path is resolved
# relative to the current working directory of the kernel.
sys.path.append(os.path.abspath('../funcs'))
import functions as fn

## 2. Import and process the RGB dataset

In [3]:
# Load the RGB English benchmark and keep a small, reproducible sample of it.
url = "https://raw.githubusercontent.com/chen700564/RGB/master/data/en.json"
sample_size = 5     # number of benchmark items used in the experiments
sample_seed = 2024  # same base seed as the per-experiment seeds used later on

data = fn.process_json(url)

# Draw the sample from a dedicated, seeded generator: the same items are
# selected on every "Restart & Run All", and the global `random` state is
# left untouched. The size is capped so a short dataset does not raise.
data = random.Random(sample_seed).sample(data, min(sample_size, len(data)))

queries = [item["query"] for item in data]
# Only the first entry of each item's "answer" field is used as the reference.
answers = [item["answer"][0] for item in data]

## 3. Run Extractive Open Source Models

We set the configuration variables used by the experiments

In [4]:
# --- Noise levels -----------------------------------------------------------
stride = 20
noise_thresholds = fn.get_noise_levels(stride)

# --- Device index handed to the transformers pipeline -----------------------
# 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# --- Extractive QA checkpoints and the passage separator --------------------
ext_model_1 = "deepset/xlm-roberta-large-squad2"
ext_model_2 = "mrm8488/bert-multi-cased-finedtuned-xquad-tydiqa-goldp"
ext_model_3 = "distilbert/distilbert-base-cased-distilled-squad"
ext_model_4 = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"
models = [
    ext_model_1,
    ext_model_2,
    ext_model_3,
    ext_model_4,
]
separator = " <|> "

# --- Mapping read from the JSON config file ---------------------------------
with open('../config/models_mapping.json', 'r') as f:
    model_mapping = json.loads(f.read())

# --- Run parameters ---------------------------------------------------------
max_tokens = 1500       # token budget passed to the context builder
num_experiments = 5     # number of repetitions per model

# --- Accumulators for the timing records ------------------------------------
execution_times, overall_times = [], []

# --- Result / metric directories (created if missing) -----------------------
input_paths = '../results/ext'
output_paths = '../metrics/ext'
os.makedirs(input_paths, exist_ok=True)
os.makedirs(output_paths, exist_ok=True)

Now we run the models

In [5]:
# Run every extractive QA model `num_experiments` times over the sampled
# queries, store the per-experiment predictions as JSON-lines files in
# `input_paths`, and write the timing statistics to `exec_time.xlsx`.

# Reset the accumulators here so that re-running this cell does not append
# to records left over from a previous (possibly interrupted) run.
execution_times = []
overall_times = []
noise_timings = []  # raw timings: one record per model / noise level / query / experiment

# The contexts are identical for every model and experiment: build them once.
positive_contexts = [item["positive"] for item in data]
negative_contexts = [item["negative"] for item in data]

for model in models:

    model_start_time = time.time()
    tokenizer = AutoTokenizer.from_pretrained(model)
    loaded_model = AutoModelForQuestionAnswering.from_pretrained(model)
    qa_pipeline = pipeline("question-answering", model=loaded_model, tokenizer=tokenizer, device=device)

    model_times = []

    for exp_num in range(1, num_experiments + 1):

        # A distinct but reproducible seed for each experiment.
        random.seed(2024 + exp_num)

        results = []
        exp_start_time = time.time()
        samples = zip(queries, positive_contexts, negative_contexts, answers)
        for query, positive_context, negative_context, answer in tqdm(samples, total=len(queries)):
            result = {
                'Query': query,
                'Correct Answer': answer,
            }
            for noise_level, value in noise_thresholds.items():
                noise_start_time = time.time()

                mixed_context = fn.create_mixed_context(positive_context, negative_context, value, max_tokens, separator)
                context_result = fn.process_context(mixed_context, qa_pipeline, query, separator)
                # These two entries are not kept in the stored results.
                context_result.pop('Appended Context', None)
                context_result.pop('Context Interval', None)
                # Prefix the remaining keys with the noise level so that all
                # noise levels of one query fit in a single flat record.
                result.update({noise_level + ' ' + k: v for k, v in context_result.items()})
                result[f'Jaccard {noise_level}'] = fn.apply_jaccard(result, f'{noise_level} Predicted Answer', 'Correct Answer')
                result[f'Cosine {noise_level}'] = fn.apply_cosine(result, f'{noise_level} Predicted Answer', 'Correct Answer')
                result[f'EM {noise_level}'] = fn.apply_exact_match(result, f'{noise_level} Predicted Answer', 'Correct Answer')
                result[f'EM - 2V {noise_level}'] = fn.apply_exact_match_2v(result, f'{noise_level} Predicted Answer', 'Correct Answer')

                # Keep the raw duration; mean and standard deviation are
                # computed once all measurements have been collected.
                noise_timings.append({
                    'Model': model,
                    'Noise Level': noise_level,
                    'Time': time.time() - noise_start_time,
                })

            results.append(result)

        results_df = pd.DataFrame(results)
        # Use the last path component so ids without an "org/" prefix also work.
        filename_results = os.path.join(input_paths, f"exp_{exp_num}_{model.split('/')[-1]}.json")
        results_df.to_json(filename_results, orient='records', lines=True)
        exp_end_time = time.time()
        model_times.append(exp_end_time - exp_start_time)

    model_end_time = time.time()

    avg_time = np.mean(model_times)
    std_time = np.std(model_times)

    print(f"Execution time for model {model}: {(model_end_time - model_start_time) / 60:.2f} minutes.")

    overall_times.append({
        'Model': model,
        'Average Time': avg_time,
        'Standard Deviation': std_time
    })

# Aggregate the raw timings into one row per (model, noise level). The
# population standard deviation (ddof=0) matches `np.std` used for the
# per-model times above.
noise_timings_df = pd.DataFrame(noise_timings, columns=['Model', 'Noise Level', 'Time'])
grouped_times = noise_timings_df.groupby(['Model', 'Noise Level'], sort=False)['Time']
execution_times_df = pd.DataFrame({
    'Average Time': grouped_times.mean(),
    'Standard Deviation': grouped_times.std(ddof=0),
}).reset_index()
# Keep the list form available under its original name.
execution_times = execution_times_df.to_dict('records')

overall_times_df = pd.DataFrame(overall_times)
filename_exec_time = os.path.join(input_paths, "exec_time.xlsx")
with pd.ExcelWriter(filename_exec_time, engine='xlsxwriter') as writer:
    execution_times_df.to_excel(writer, sheet_name='Noise Level Times', index=False)
    overall_times_df.to_excel(writer, sheet_name='Overall Model Times', index=False)

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/5 [00:01<?, ?it/s]


KeyboardInterrupt: 

## 4. Compute Metrics

Compute all metrics

In [28]:
# Compute the metrics of every stored experiment file and collect them in one
# workbook, with one sheet per model and one block of rows per experiment.
output_file = os.path.join(output_paths, 'all_metrics.xlsx')

# Select the result files named "exp_<number>_<model>.json". Anything else in
# the directory (e.g. the timing workbook) is ignored instead of crashing the
# filename parsing below.
files = os.listdir(input_paths)
experiment_files = []
for file in files:
    stem, extension = os.path.splitext(file)
    # maxsplit=2 keeps model names that themselves contain underscores intact.
    parts = stem.split('_', 2)
    if extension != '.json' or len(parts) != 3 or parts[0] != 'exp' or not parts[1].isdigit():
        continue
    experiment_files.append((int(parts[1]), parts[2], file))

# `os.listdir` returns entries in arbitrary order; sorting by experiment
# number and model name makes the workbook layout deterministic.
experiment_files.sort()

frames_by_sheet = {}
for experiment_num, model_name, file in tqdm(experiment_files, desc="Processing files"):
    # Fall back to the raw model name when it has no entry in the mapping.
    # NOTE(review): Excel limits sheet names to 31 characters - confirm that
    # every model has a sufficiently short entry in `model_mapping`.
    sheet_name = model_mapping.get(model_name, model_name)
    input_path = os.path.join(input_paths, file)
    result_df = fn.compute_metrics(input_path, stride)
    result_df.insert(0, 'Experiment Number', experiment_num)
    frames_by_sheet.setdefault(sheet_name, []).append(result_df)

if not frames_by_sheet:
    # Fail early with a clear message instead of an opaque error from the
    # Excel writer when it is asked to save a workbook without sheets.
    raise ValueError(f"No experiment result files (exp_<n>_<model>.json) found in {input_paths}")

# Concatenate once per sheet rather than growing a frame inside the loop.
all_results = {
    sheet_name: pd.concat(frames, ignore_index=True)
    for sheet_name, frames in frames_by_sheet.items()
}

with pd.ExcelWriter(output_file) as writer:
    for sheet_name, result_df in all_results.items():
        result_df.to_excel(writer, sheet_name=sheet_name, index=False)

Processing files: 100%|██████████| 21/21 [01:17<00:00,  3.70s/it]


Compute mean metrics

In [30]:
# Summarise the per-experiment metrics: for every sheet of the all-metrics
# workbook, compute the mean and standard deviation of each metric at each
# noise level and write the summaries to a second workbook.
input_file = '../metrics/ext/all_metrics.xlsx'
output_file = '../metrics/ext/final_metrics.xlsx'
final_results = {}

# sheet_name=None loads every sheet into a dict keyed by sheet name.
excel_data = pd.read_excel(input_file, sheet_name=None)

for sheet_name, df in excel_data.items():
    if 'Experiment Number' not in df.columns:
        raise ValueError(f"'Experiment Number' column not found in sheet {sheet_name}")

    metrics = df['Metric'].unique()
    noise_levels = list(noise_thresholds.keys())

    # Rows belonging to each metric, in the same order as `metrics`.
    rows_per_metric = [df[df['Metric'] == metric] for metric in metrics]

    result_data = {'Metric': metrics}
    for noise_level in noise_levels:
        values_per_metric = [rows[noise_level] for rows in rows_per_metric]
        result_data[f'{noise_level}_Mean'] = [values.mean() for values in values_per_metric]
        result_data[f'{noise_level}_Std'] = [values.std() for values in values_per_metric]

    result_df = pd.DataFrame(result_data)
    final_results[sheet_name] = result_df

with pd.ExcelWriter(output_file) as writer:
    for sheet_name, result_df in final_results.items():
        result_df.to_excel(writer, sheet_name=sheet_name, index=False)

## 5. Plots

Build data

In [7]:
# Load the values of the selected metric from the final-metrics workbooks of
# the extractive ("ext") and generative ("gen") model families.
metric_name = 'EM - 2V'
noise_thresholds = fn.get_noise_levels(stride)
ext_file_path = '../metrics/ext/final_metrics.xlsx'
gen_file_path = '../metrics/gen/final_metrics.xlsx'

ext_data = fn.extract_metrics_from_excel(ext_file_path, metric_name, model_mapping, noise_thresholds)
# Read the generative results from their own workbook; this previously
# re-read the extractive file, so both data sets were identical.
gen_data = fn.extract_metrics_from_excel(gen_file_path, metric_name, model_mapping, noise_thresholds)