In [1]:
import os
import json
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, get_scheduler
from datasets import load_dataset
from accelerate import Accelerator, notebook_launcher
import evaluate
import collections
from tqdm.auto import tqdm


In [2]:
# Reformat a SQuAD-style JSON file into flat records and save the result to a new file
def reformat_and_save_json(json_file):
    """Flatten a SQuAD-style JSON file into one record per question.

    The nested ``data -> paragraphs -> qas`` layout is turned into a flat list
    of ``{'id', 'title', 'context', 'question', 'answers'}`` records, wrapped
    as ``{'data': [...]}`` and written next to the input file as
    ``formatted_<original file name>``.

    Titles, contexts and questions are whitespace-stripped. Because
    ``answer_start`` values are character offsets into the *unstripped*
    context, they are shifted left by the number of leading whitespace
    characters removed, so they keep pointing at the answer text.

    Args:
        json_file: Path of the JSON file to convert.

    Returns:
        Path of the written file, or ``None`` when the input could not be
        opened or decoded, or the output could not be written (a message is
        printed in each of those cases).
    """
    try:
        # Explicit UTF-8 so the result does not depend on the platform default encoding.
        with open(json_file, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except IOError as e:
        print(f"Error opening {json_file}: {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {json_file}: {e}")
        return None

    examples = []
    for elem in json_data['data']:
        title = elem['title'].strip()
        for paragraph in elem['paragraphs']:
            raw_context = paragraph['context']
            context = raw_context.strip()
            # Characters removed from the front of the context; answer offsets
            # must move by the same amount to stay aligned with the text.
            leading_removed = len(raw_context) - len(raw_context.lstrip())
            for qa in paragraph['qas']:
                example = {
                    'id': qa['id'],
                    'title': title,
                    'context': context,
                    'question': qa['question'].strip(),
                    'answers': {
                        'answer_start': [
                            max(answer["answer_start"] - leading_removed, 0)
                            for answer in qa['answers']
                        ],
                        'text': [answer["text"] for answer in qa['answers']],
                    },
                }
                examples.append(example)

    out_dict = {'data': examples}
    output_json_file = os.path.join(os.path.dirname(json_file), 'formatted_' + os.path.basename(json_file))

    try:
        with open(output_json_file, 'w', encoding='utf-8') as f:
            json.dump(out_dict, f)
    except IOError as e:
        print(f"Error writing to {output_json_file}: {e}")
        return None

    return output_json_file

# Directory holding the original Spoken-SQuAD JSON files; change this one value to relocate the data.
DATA_DIR = '/home/qiaoyiy/8430/models'

# Split name -> path of the original (nested, SQuAD-style) JSON file.
data_paths = {
    'train': f'{DATA_DIR}/spoken_train-v1.1.json',
    'validation': f'{DATA_DIR}/spoken_test-v1.1.json',
    'test_WER44': f'{DATA_DIR}/spoken_test-v1.1_WER44.json',
    'test_WER54': f'{DATA_DIR}/spoken_test-v1.1_WER54.json'
}

# Convert every source file exactly once and remember where the flattened copy was written.
# reformat_and_save_json returns None on failure (after printing the reason); such splits
# are left out of the mapping, so only successfully converted files are loaded below.
formatted_data_paths = {}
for split_name, source_path in data_paths.items():
    formatted_path = reformat_and_save_json(source_path)
    if formatted_path:
        formatted_data_paths[split_name] = formatted_path

# formatted_data_paths now maps each split name to its flattened JSON file;
# the records to load live under the top-level 'data' key of each file.
spoken_squad_dataset = load_dataset('json', data_files=formatted_data_paths, field='data')
print("Loading SpokenSQuAD data completed")


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test_WER44 split: 0 examples [00:00, ? examples/s]

Generating test_WER54 split: 0 examples [00:00, ? examples/s]

Loading SpokenSQuAD data completed


In [3]:
# Model and tokenizer setup: both are created from the same pre-trained checkpoint
model_checkpoint = "bert-base-uncased"
print(f"Loading the BERT model and tokenizer from checkpoint '{model_checkpoint}'...")

# Question-answering model built from the checkpoint weights.
# (The loader reports the 'qa_outputs' weights as newly initialized, so the
# model has to be fine-tuned before its predictions are meaningful.)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
print(f"Model '{model_checkpoint}' successfully loaded for Question Answering tasks.")

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(f"Tokenizer for model '{model_checkpoint}' successfully loaded.")


Loading the BERT model and tokenizer from checkpoint 'bert-base-uncased'...


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 'bert-base-uncased' successfully loaded for Question Answering tasks.
Tokenizer for model 'bert-base-uncased' successfully loaded.


In [4]:
# Data preprocessing settings, read by both preprocessing functions below
max_length = 384  # Passed to the tokenizer as max_length; with padding='max_length' every feature is padded/truncated to this many tokens
stride = 64  # Passed to the tokenizer as stride: token overlap between consecutive chunks when a long context is split

def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and label each feature with answer token positions.

    Questions are stripped and tokenized together with their contexts. Only
    the context is truncated; contexts that do not fit produce several
    overlapping features (``return_overflowing_tokens`` + ``stride``).

    For every feature the first gold answer is located in token space:
    ``start_positions`` / ``end_positions`` hold the indices of the first and
    last answer token. A feature is labelled ``(0, 0)`` when the answer is not
    fully contained in its context window, or when the example has no gold
    answer at all.

    Args:
        examples: Batch dict with the keys 'question', 'context' and 'answers'
            (each answers entry holding 'answer_start' and 'text' lists).

    Returns:
        The tokenizer output (without 'offset_mapping' and
        'overflow_to_sample_mapping') extended with 'start_positions' and
        'end_positions'.
    """
    questions = [question.strip() for question in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    offset_mapping = inputs.pop('offset_mapping')
    sample_map = inputs.pop('overflow_to_sample_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # One example can yield several features; map the feature back to its example.
        answer = answers[sample_map[i]]

        # No gold answer for this example: use the (0, 0) label instead of
        # failing on the [0] lookups below.
        if len(answer['answer_start']) == 0 or len(answer['text']) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token of the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Bounded scan: stays valid even if the context tokens run to the very end.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside this context window, the label is (0, 0).
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise walk inwards from both ends of the context: the start
            # label is the last token beginning at or before start_char, the
            # end label is the first token ending at or after end_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs


def process_validation_examples(examples):
    """Tokenize a batch of QA examples for evaluation.

    Tokenization mirrors the training preprocessing (stripped questions, only
    the context truncated, long contexts split into overlapping features), but
    no answer labels are produced. Instead each feature keeps what is needed
    to turn logits back into text:

    * 'example_id'      - id of the example the feature was cut from
    * 'offset_mapping'  - character offsets for context tokens, ``None`` for
                          every token that is not part of the context

    Args:
        examples: Batch dict with the keys 'id', 'question' and 'context'.

    Returns:
        The tokenizer output (without 'overflow_to_sample_mapping') with the
        masked 'offset_mapping' and the added 'example_id' list.
    """
    stripped_questions = [q.strip() for q in examples['question']]
    encoded = tokenizer(
        stripped_questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    feature_to_example = encoded.pop('overflow_to_sample_mapping')
    source_ids = examples["id"]
    offsets_per_feature = encoded['offset_mapping']
    owner_ids = []

    for feature_idx in range(len(encoded['input_ids'])):
        # Remember which original example this feature belongs to.
        owner_ids.append(source_ids[feature_to_example[feature_idx]])

        # Keep offsets only for context tokens (sequence id 1); everything
        # else becomes None so it can be rejected when decoding answers.
        segment_ids = encoded.sequence_ids(feature_idx)
        offsets_per_feature[feature_idx] = [
            span if segment_ids[position] == 1 else None
            for position, span in enumerate(offsets_per_feature[feature_idx])
        ]

    encoded['example_id'] = owner_ids
    return encoded


def _tokenize_split(split_name, preprocess_fn):
    """Apply ``preprocess_fn`` to one split in batches, dropping the split's original columns."""
    raw_split = spoken_squad_dataset[split_name]
    return raw_split.map(
        preprocess_fn,
        batched=True,
        remove_columns=raw_split.column_names,
    )


# Training split: tokenized and labelled with answer start/end token positions.
print("Starting preprocessing of training data with tokenization and extraction of answer positions...")
train_dataset = _tokenize_split('train', preprocess_training_examples)

# Evaluation splits: tokenized with example ids and context-only offsets kept.
print("Tokenization and preprocessing of validation dataset (clean data, 22.73% WER) underway...")
validation_dataset = _tokenize_split('validation', process_validation_examples)

print("Preprocessing test dataset with moderate noise level (44.22% WER) for evaluation...")
test_WER44_dataset = _tokenize_split('test_WER44', process_validation_examples)

print("Preprocessing test dataset with high noise level (54.82% WER) for robustness assessment...")
test_WER54_dataset = _tokenize_split('test_WER54', process_validation_examples)


Starting preprocessing of training data with tokenization and extraction of answer positions...


Map:   0%|          | 0/37111 [00:00<?, ? examples/s]

Tokenization and preprocessing of validation dataset (clean data, 22.73% WER) underway...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Preprocessing test dataset with moderate noise level (44.22% WER) for evaluation...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Preprocessing test dataset with high noise level (54.82% WER) for robustness assessment...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

In [5]:
# Turn the tokenized datasets into PyTorch-ready datasets and wrap them in DataLoaders.

def _as_model_inputs(tokenized_split):
    """Return a torch-formatted copy of ``tokenized_split`` without 'example_id' and 'offset_mapping'."""
    model_inputs = tokenized_split.remove_columns(["example_id", "offset_mapping"])
    model_inputs.set_format("torch")
    return model_inputs


def _evaluation_loader(model_inputs):
    """Build an unshuffled DataLoader (batch size 8, default collator) over ``model_inputs``."""
    return DataLoader(model_inputs, collate_fn=default_data_collator, batch_size=8)


train_dataset.set_format("torch")
print("Converted the training dataset to PyTorch tensor format.")

# The *_dataset objects keep 'example_id' / 'offset_mapping' for answer decoding;
# the *_set copies below contain only what is fed to the model.
validation_set = _as_model_inputs(validation_dataset)
print("Prepared the validation dataset by removing 'example_id' and 'offset_mapping' columns and converting to PyTorch tensor format.")

test_WER44_set = _as_model_inputs(test_WER44_dataset)
print("Prepared the Test WER44 dataset (simulating 44% Word Error Rate) by removing unnecessary columns and converting to PyTorch tensor format.")

test_WER54_set = _as_model_inputs(test_WER54_dataset)
print("Prepared the Test WER54 dataset (simulating 54% Word Error Rate) by removing unnecessary columns and converting to PyTorch tensor format.")

print("Initializing the DataLoader for the training dataset with shuffling and a batch size of 8 to ensure varied mini-batch combinations during training.")
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=default_data_collator,
)

print("Initializing the DataLoader for the validation dataset with a batch size of 8 for model performance evaluation on unseen clean data.")
eval_dataloader = _evaluation_loader(validation_set)

print("Initializing the DataLoader for the Test WER44 dataset with a batch size of 8 to evaluate model robustness under moderate noise conditions.")
test_WER44_dataloader = _evaluation_loader(test_WER44_set)

print("Initializing the DataLoader for the Test WER54 dataset with a batch size of 8 to evaluate model robustness under high noise conditions.")
test_WER54_dataloader = _evaluation_loader(test_WER54_set)


Converted the training dataset to PyTorch tensor format.
Prepared the validation dataset by removing 'example_id' and 'offset_mapping' columns and converting to PyTorch tensor format.
Prepared the Test WER44 dataset (simulating 44% Word Error Rate) by removing unnecessary columns and converting to PyTorch tensor format.
Prepared the Test WER54 dataset (simulating 54% Word Error Rate) by removing unnecessary columns and converting to PyTorch tensor format.
Initializing the DataLoader for the training dataset with shuffling and a batch size of 8 to ensure varied mini-batch combinations during training.
Initializing the DataLoader for the validation dataset with a batch size of 8 for model performance evaluation on unseen clean data.
Initializing the DataLoader for the Test WER44 dataset with a batch size of 8 to evaluate model robustness under moderate noise conditions.
Initializing the DataLoader for the Test WER54 dataset with a batch size of 8 to evaluate model robustness under high n

In [6]:
# Evaluation metric and answer-decoding settings used by compute_metrics
metric = evaluate.load("squad")  # SQuAD metric; its compute() result is what compute_metrics returns

n_best = 20  # How many of the highest start logits and highest end logits are paired up per feature
max_answer_length = 30  # Longest allowed answer span, counted in tokens (end_index - start_index + 1)

def compute_metrics(start_logits, end_logits, features, examples):
    """Decode the best answer span per example and score it with the SQuAD metric.

    For each example, every feature cut from it contributes candidate spans
    built from its ``n_best`` highest start logits and ``n_best`` highest end
    logits. A candidate is kept only if both tokens lie in the context
    (offset is not ``None``), the end is not before the start, and the span
    is at most ``max_answer_length`` tokens long. The candidate with the
    highest ``start logit + end logit`` wins; an example without any valid
    candidate gets the empty string as prediction.

    Args:
        start_logits: Per-feature start logits, indexable by feature row.
        end_logits: Per-feature end logits, indexable by feature row.
        features: Tokenized features providing 'example_id' and 'offset_mapping'.
        examples: Original examples providing 'id', 'context' and 'answers'.

    Returns:
        Whatever ``metric.compute`` returns for the predictions and references.
    """
    # Group feature rows by the example they were produced from.
    rows_by_example = collections.defaultdict(list)
    for row, feature in enumerate(features):
        rows_by_example[feature["example_id"]].append(row)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]

        # Running best candidate; stays ("", None) if nothing valid is found.
        best_text = ""
        best_score = None

        for row in rows_by_example[example_id]:
            row_start_logits = start_logits[row]
            row_end_logits = end_logits[row]
            offsets = features[row]["offset_mapping"]

            # Token indices of the n_best largest logits, best first.
            top_starts = np.argsort(row_start_logits)[::-1][:n_best].tolist()
            top_ends = np.argsort(row_end_logits)[::-1][:n_best].tolist()

            for start_index in top_starts:
                if offsets[start_index] is None:
                    # Start token is outside the context.
                    continue
                for end_index in top_ends:
                    if offsets[end_index] is None:
                        # End token is outside the context.
                        continue
                    span_length = end_index - start_index + 1
                    if span_length < 1 or span_length > max_answer_length:
                        # Reversed span or span that is too long.
                        continue

                    score = row_start_logits[start_index] + row_end_logits[end_index]
                    # Strict '>' keeps the earliest candidate on ties.
                    if best_score is None or score > best_score:
                        best_score = score
                        best_text = context[offsets[start_index][0]: offsets[end_index][1]]

        predicted_answers.append({"id": example_id, "prediction_text": best_text})

    # Gold answers in the structure expected by the metric.
    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)


In [7]:
def train_model(model=model, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, epochs=3):
    """Fine-tune the QA model with mixed precision, evaluating and saving after every epoch.

    Training uses AdamW (lr 2e-5) with a linear learning-rate decay and no
    warm-up. After each epoch the model is scored on the validation split via
    ``evaluate_model`` and the model plus tokenizer are written to
    ``./model_save/epoch_<n>``.

    Args:
        model: Model to fine-tune (defaults to the notebook-level ``model``).
        train_dataloader: DataLoader yielding training batches.
        eval_dataloader: DataLoader yielding validation batches.
        epochs: Number of passes over the training data.
    """
    print(f"Starting model training for {epochs} epochs, each with {len(train_dataloader)} batches.")

    # Initialize the Accelerator for mixed precision training
    accelerator = Accelerator(mixed_precision='fp16')
    print("Accelerator initialized for mixed precision ('fp16') training.")

    # Set up the optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)
    print("Optimizer setup with learning rate 2e-5.")

    # Prepare model, optimizer, and dataloaders for Accelerator
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Count optimizer steps only AFTER prepare(): with more than one process the
    # prepared dataloader is sharded and therefore shorter, and the scheduler and
    # progress bar must match the number of steps this process actually takes.
    training_steps = epochs * len(train_dataloader)

    # Linear decay from the initial learning rate down to 0 over all steps
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=training_steps,
    )

    progress_bar = tqdm(range(training_steps), desc="Training Progress")

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs} - Training:")
        # evaluate_model switches to eval mode, so re-enable training mode every epoch
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            # accelerator.backward handles loss scaling for fp16
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Model evaluation at the end of each epoch
        print("\nEvaluating model performance on the validation set...")
        metrics = evaluate_model(model, eval_dataloader, validation_dataset, spoken_squad_dataset['validation'], accelerator)
        print(f"Validation Results - Epoch {epoch+1}: {metrics}")

        # Save the model and tokenizer at the end of each epoch.
        # All processes must have finished the epoch before weights are written,
        # and only the main process writes, so workers do not race on the files.
        output_dir = f"./model_save/epoch_{epoch+1}"
        os.makedirs(output_dir, exist_ok=True)
        accelerator.wait_for_everyone()
        model_to_save = accelerator.unwrap_model(model)
        model_to_save.save_pretrained(
            output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save,
        )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)
        print(f"Model and tokenizer saved in '{output_dir}'")

    print("\nTraining completed successfully.")


In [8]:
def evaluate_model(model, dataloader, dataset, dataset_before_preprocessing, accelerator=None):
    """Run the model over ``dataloader`` and score its answers.

    Args:
        model: Question-answering model returning ``start_logits`` and ``end_logits``.
        dataloader: Batches of model inputs built from ``dataset``.
        dataset: Tokenized features (with 'example_id' / 'offset_mapping') used for decoding.
        dataset_before_preprocessing: The original examples the features were built from.
        accelerator: Accelerator that already prepared ``model`` and ``dataloader``.
            When omitted, a new fp16 Accelerator is created and both are prepared here.

    Returns:
        The metrics produced by ``compute_metrics``.
    """
    if not accelerator:
        print("Initializing Accelerator for mixed precision (fp16) evaluation...")
        accelerator = Accelerator(mixed_precision='fp16')
        model, dataloader = accelerator.prepare(model, dataloader)

    def _gathered_numpy(logits):
        # Collect the logits through the accelerator and move them to host memory.
        return accelerator.gather(logits).cpu().numpy()

    print("Setting the model to evaluation mode for performance assessment...")
    model.eval()

    print("Evaluating model performance on the dataset...")
    start_chunks = []
    end_chunks = []
    for batch in tqdm(dataloader, desc="Evaluation Progress"):
        with torch.no_grad():
            prediction = model(**batch)

        start_chunks.append(_gathered_numpy(prediction.start_logits))
        end_chunks.append(_gathered_numpy(prediction.end_logits))

    # Stitch the per-batch arrays together and cut them to one row per feature.
    feature_count = len(dataset)
    start_logits = np.concatenate(start_chunks)[:feature_count]
    end_logits = np.concatenate(end_chunks)[:feature_count]

    print("Computing evaluation metrics based on model predictions...")
    return compute_metrics(start_logits, end_logits, dataset, dataset_before_preprocessing)

# Start fine-tuning: notebook_launcher calls train_model (with its default arguments) in a single process.
print("Initiating model fine-tuning process...")
notebook_launcher(train_model, num_processes=1)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Initiating model fine-tuning process...
Launching training on one GPU.
Starting model training for 3 epochs, each with 4664 batches.
Accelerator initialized for mixed precision ('fp16') training.
Optimizer setup with learning rate 2e-5.


Training Progress:   0%|          | 0/13992 [00:00<?, ?it/s]


Epoch 1/3 - Training:

Evaluating model performance on the validation set...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/678 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Validation Results - Epoch 1: {'exact_match': 61.50252289291721, 'f1': 72.72473191218755}
Model and tokenizer saved in './model_save/epoch_1'

Epoch 2/3 - Training:

Evaluating model performance on the validation set...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/678 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Validation Results - Epoch 2: {'exact_match': 63.1096991216595, 'f1': 73.69561038566222}
Model and tokenizer saved in './model_save/epoch_2'

Epoch 3/3 - Training:

Evaluating model performance on the validation set...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/678 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Validation Results - Epoch 3: {'exact_match': 63.65165389646795, 'f1': 74.06165140185816}
Model and tokenizer saved in './model_save/epoch_3'

Training completed successfully.


In [11]:
# Results Evaluation

# (dataloader, tokenized features, raw split name) for every split to score, in reporting order.
_final_eval_inputs = [
    (eval_dataloader, validation_dataset, 'validation'),
    (test_WER44_dataloader, test_WER44_dataset, 'test_WER44'),
    (test_WER54_dataloader, test_WER54_dataset, 'test_WER54'),
]

# No accelerator is passed, so evaluate_model sets up its own for each split.
test_metrics, test_wer44_metrics, test_wer54_metrics = [
    evaluate_model(model, loader, tokenized_features, spoken_squad_dataset[split_name])
    for loader, tokenized_features, split_name in _final_eval_inputs
]

print("\n=== Model Evaluation Summary ===\n")

# Column titles and the horizontal rule of the results table
headers = ["Dataset", "Exact Match", "F1 Score"]
header_line = '| {:^15} | {:^12} | {:^8} |'.format(*headers)
separator = '+'.join(['', '-' * 17, '-' * 14, '-' * 10, ''])

# Table header framed by two rules
for table_line in (separator, header_line, separator):
    print(table_line)

# Function to format and print each row of the results table
def print_result_row(description, metrics):
    """Print one table row followed by the separator line.

    The numeric fields are 11 and 7 characters wide so that, together with
    the trailing '%', each cell fills the 12- and 8-character columns used by
    the table header and separator.

    Args:
        description: Label shown in the first column (left-aligned, 15 wide).
        metrics: Mapping with 'exact_match' and 'f1' values in percent.
    """
    row = '| {:<15} | {:>11.2f}% | {:>7.2f}% |'.format(description, metrics['exact_match'], metrics['f1'])
    print(row)
    print(separator)

# One table row per evaluated split, in the order they were evaluated
for row_label, row_metrics in (
    ("Validation Set", test_metrics),
    ("Test WER44 Set", test_wer44_metrics),
    ("Test WER54 Set", test_wer54_metrics),
):
    print_result_row(row_label, row_metrics)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Initializing Accelerator for mixed precision (fp16) evaluation...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/678 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Initializing Accelerator for mixed precision (fp16) evaluation...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/679 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Initializing Accelerator for mixed precision (fp16) evaluation...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/679 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]


=== Model Evaluation Summary ===

+-----------------+--------------+----------+
|     Dataset     | Exact Match  | F1 Score |
+-----------------+--------------+----------+
| Validation Set  |      63.65% |  74.06% |
+-----------------+--------------+----------+
| Test WER44 Set  |      40.83% |  55.48% |
+-----------------+--------------+----------+
| Test WER54 Set  |      28.41% |  42.12% |
+-----------------+--------------+----------+
