In [2]:
from transformers import pipeline
import torch
import random
import numpy as np
import pandas as pd
import nltk
!pip install datasets
from datasets import Dataset

def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

train_data  = pd.read_parquet("/content/train.parquet")
test_data   = pd.read_parquet("/content/validation.parquet")






In [3]:
train_data.loc[(train_data["answer_inlang"].notnull())]

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
15076,অস্ট্রেলীয় ক্রিকেটার ডেভিড অ্যান্ড্রু ওয়ার্ন...,David Andrew Warner (; born 27 October 1986) i...,bn,True,28,27 October 1986,27 অক্টোবর 1986
15077,আচেহ সালতানাতের তৃতীয় সুলতান কে ছিলেন ?,Sultan Salahuddin (died 25 November 1548) was ...,bn,False,-1,Alauddin,আলাউদ্দিন
15078,কত সালে আডলফ হিটলারের মূল ভাস্কর্যটি মাদাম তুস...,"In July 2008, the Berlin branch of Madame Tuss...",bn,True,511,1933,1933
15079,কোন সালে প্রথম ব্লেড আবিষ্কৃত হয় ?,The first step towards a safer-to-use razor wa...,bn,False,-1,1700 BC,1700 খ্রিস্টপূর্ব
15080,জ্যোতির্বিজ্ঞানী রাধাগোবিন্দ চন্দ্র প্রথম জীবন...,"Radha Gobind Chandra (16 July 1878, Bagchar vi...",bn,False,-1,Jessore Collectorate Office,যশোর কালেক্টরেট অফিস
...,...,...,...,...,...,...,...
15321,కోళ్లు ఎక్కువగా ఏ దేశంలో కనిపిస్తాయి?,"Since time immemorial, man has been practicing...",te,False,-1,United States of America,అమెరికా సంయుక్త రాష్ట్రాలు
15322,క్షయ వ్యాధికి విరుగుడు ఏ దేశంలో కనుగొన్నారు?,Vaccines against anthrax for use in livestock ...,te,False,-1,France,ఫ్రాన్స్
15323,ఖురాన్ ఏ అరబ్బీ భాషలో ఎవరు రాసారు?,are broken Other Names of the Qur'an: It is be...,te,False,-1,Prophet Muhammad,ముహమ్మద్ ప్రవక్త
15324,టెక్సస్ రాష్ట్రంలోని అతిపెద్ద మానవ నిర్మితం ఏది ?,Austin is the capital of the US state of Texas...,te,False,-1,JP Morgan Chase Tower,జేపీ మోర్గాన్ ఛేజ్ టవర్


In [4]:
train_with_answer_inlang = train_data.loc[(train_data["answer_inlang"].notna())]
val_with_answer_inlang = test_data.loc[(test_data["answer_inlang"].notna())]


train_data = train_with_answer_inlang[train_with_answer_inlang["lang"].isin(["fi", "ru", "ja"])][["question","context","answer_inlang"]]
val_data = val_with_answer_inlang[val_with_answer_inlang["lang"].isin(["fi", "ru", "ja"])][["question","context","answer_inlang"]]



In [5]:
def bootstrap(df, n_samples=10, sample_size=None, random_state=None):
    np.random.seed(random_state)  # Set random seed for reproducibility
    sample_size = sample_size or len(df)
    bootstrapped_samples = []

    for _ in range(n_samples):
        # Sample with replacement from the DataFrame
        sample = df.sample(n=sample_size, replace=True, random_state=np.random.randint(0, 1e6))
        bootstrapped_samples.append(sample)
    bootstrapped_df = pd.concat(bootstrapped_samples, ignore_index=True)

    return bootstrapped_df

train_data = bootstrap(train_data)
val_data = bootstrap(val_data)

In [6]:
print(train_data)

                                               question  \
0                       『クィア・アズ・フォーク』のリメイク版のタイトルはなんですか？   
1                             オーロラ号の漂流時、オーロラ号に乗組員は何人いた？   
2                                 アメリカ合衆国に初めて到達した西欧人は誰？   
3                          『ポケットモンスター 赤・緑』の主人公の出身地の名前は？   
4                                     カトリックでは避妊を禁止している？   
...                                                 ...   
1495              За какую команду выступал Ра́йкконен?   
1496                             日本の自衛隊でオリエンテーリングは行われる？   
1497                  Где базировалась «Армия Андерса»?   
1498  Сколько планет-гигантов есть в Солнечной системе?   
1499                                     PYGが最初に発表した曲は何   

                                                context answer_inlang  
0     The development of the story remains the same ...        クィア・アイ  
1     This was the last time. On her return to New Z...           yes  
2     The society nurtured its own culture, but comp...      ジョン・カボット  
3  

## Question + context model

In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
train_data = train_data.reset_index(drop=True)
train_dataset = Dataset.from_pandas(train_data)

val_data = val_data.reset_index(drop=True)
val_dataset = Dataset.from_pandas(val_data)


def preprocess_function(row):
    inputs = [f"question: {q} context: {c}" for q, c in zip(row["question"], row["context"])]
    targets = row["answer_inlang"]


    model_inputs = tokenizer(inputs, max_length=512, truncation=True,  padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset_val = val_dataset.map(preprocess_function, batched=True)

print("Original dataset size:", len(train_data))  # Before tokenization
print("Tokenized dataset size:", len(train_dataset))  # After tokenization

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Original dataset size: 1500
Tokenized dataset size: 1500


In [9]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_checkpoint=True,
    max_grad_norm=1.0,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # Optional additional arguments:
    # fp16=True                        # If you have GPU with mixed precision support
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer
)



In [10]:
training_output = trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,65.783356
2,No log,64.796837
3,62.089200,64.453354


In [11]:
evaluation_metrics = trainer.evaluate()
print("Evaluation Metrics:", evaluation_metrics)

Evaluation Metrics: {'eval_loss': 64.45335388183594, 'eval_runtime': 18.9139, 'eval_samples_per_second': 158.613, 'eval_steps_per_second': 19.827, 'epoch': 3.0}


In [13]:
def generate_answer(question, context):
    model_path = "/content/results/checkpoint-564"  # Adjust to your model checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Format input with question and context for better performance
    input_text = f"question: {question} context: {context}"

    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)

    # Generate answer with tuned parameters
    output_ids = model.generate(
        **inputs,
        max_length=64,
        num_beams=6,
        early_stopping=True,
        length_penalty=2.0,
        temperature=0.7,
        top_k=50
    )

    # Decode the output tokens to text
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer

index = 0
question = train_data.iloc[index,:]["question"]
context = train_data.iloc[index,:]["context"]
answer = generate_answer(question, context)

print("Generated Answer:", answer)


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Generated Answer: <extra_id_0>.


In [14]:
generated_answers = []
for i in range(100):
  question = test_data.iloc[i]["question"]
  context = test_data.iloc[i]["context"]
  answer = generate_answer(question, context)
  generated_answers.append((i, answer))

generated_answers

[(0, '<extra_id_0>?'),
 (1, '<extra_id_0>?'),
 (2, '<extra_id_0>?'),
 (3, '<extra_id_0> of World War I'),
 (4, '<extra_id_0>?'),
 (5, '<extra_id_0> (2005)'),
 (6, '<extra_id_0>.com'),
 (7, '<extra_id_0>.'),
 (8, '<extra_id_0>?'),
 (9, '<extra_id_0> :'),
 (10, '<extra_id_0>.'),
 (11, '<extra_id_0> fields'),
 (12, '<extra_id_0>.com'),
 (13, '<extra_id_0>.'),
 (14, '<extra_id_0>.'),
 (15, '<extra_id_0>ం.'),
 (16, '<extra_id_0>.'),
 (17, '<extra_id_0> Uganda'),
 (18, '<extra_id_0>.'),
 (19, '<extra_id_0>.'),
 (20, '<extra_id_0>?'),
 (21, '<extra_id_0>.'),
 (22, '<extra_id_0>.'),
 (23, '<extra_id_0> of cancer'),
 (24, '<extra_id_0>.'),
 (25, '<extra_id_0>.'),
 (26, '<extra_id_0>.'),
 (27, '<extra_id_0>?'),
 (28, '<extra_id_0>.com'),
 (29, '<extra_id_0>లో అనువదించారు?'),
 (30, '<extra_id_0>.'),
 (31, '<extra_id_0>.'),
 (32, '<extra_id_0>.'),
 (33, '<extra_id_0>?'),
 (34, '<extra_id_0>?'),
 (35, '<extra_id_0> of Anton Siluanov'),
 (36, '<extra_id_0>.'),
 (37, '<extra_id_0>?'),
 (38, '<extra_i

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(model, tokenizer, test_data, target_column="answer_inlang"):
    """
    Calculate accuracy, precision, recall, and F1-score for the given model and tokenizer.

    Parameters:
    - model: Trained Hugging Face model to use for predictions.
    - tokenizer: Corresponding tokenizer for the model.
    - test_data: DataFrame containing test questions, contexts, and true answers.
    - target_column: Column name in test_data that contains the ground truth answers.

    Returns:
    - Dictionary containing accuracy, precision, recall, and F1-score.
    """

    predictions = []
    true_labels = test_data[target_column].tolist()

    for idx, row in test_data.iterrows():
        question = row["question"]
        context = row["context"]

        # Tokenize and generate prediction
        input_text = f"question: {question} context: {context}"
        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        output_ids = model.generate(**inputs, max_length=64, num_beams=6, early_stopping=True, length_penalty=2.0, temperature=0.7, top_k=50)

        # Decode the prediction and append
        predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(predicted_answer)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted')
    recall = recall_score(true_labels, predictions, average='weighted')
    f1 = f1_score(true_labels, predictions, average='weighted')

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }

# This function `calculate_metrics` can be called with the trained model and tokenizer along with a test dataset.
# However, I will not execute it now, as it would require loading the model and predicting answers on the dataset.
# The function is ready to be used when desired.


In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Define the model path
model_path = "/content/results/checkpoint-564"  # Update to the actual model checkpoint path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Call the function on the validation data
metrics = calculate_metrics(model, tokenizer, val_data)

# Print out the results
print("Evaluation Metrics:")
print(f"Accuracy:  {metrics['accuracy']:.2f}")
print(f"Precision: {metrics['precision']:.2f}")
print(f"Recall:    {metrics['recall']:.2f}")
print(f"F1 Score:  {metrics['f1_score']:.2f}")

Evaluation Metrics:
Accuracy:  0.00
Precision: 0.00
Recall:    0.00
F1 Score:  0.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Question model

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load datasets
train_data = pd.read_parquet("dataset/train.parquet")
test_data = pd.read_parquet("dataset/validation.parquet")

# Filter data for in-language answers and specific languages
train_with_answer_inlang = train_data.loc[train_data["answer_inlang"].notna()]
val_with_answer_inlang = test_data.loc[test_data["answer_inlang"].notna()]

train_data = train_with_answer_inlang[train_with_answer_inlang["lang"].isin(["fi", "ru", "ja"])][["question", "context", "answer_inlang"]]
val_data = val_with_answer_inlang[val_with_answer_inlang["lang"].isin(["fi", "ru", "ja"])][["question", "context", "answer_inlang"]]

# Reset indices and convert to Hugging Face Dataset format
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Preprocess function for Seq2SeqTrainer
def preprocess_function(row):
    inputs = [f"question: {q} context: {c}" for q, c in zip(row["question"], row["context"])]
    targets = row["answer_inlang"]

    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize datasets
tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset_val = val_dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True  # Optional, if GPU with mixed precision support
)

# Initialize Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer
)

# Train model and calculate training loss
train_output = trainer.train()
training_loss = train_output.training_loss
print("Training Loss:", training_loss)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]



Map:   0%|          | 0/300 [00:00<?, ? examples/s]



  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 4162.9991, 'eval_samples_per_second': 0.072, 'eval_steps_per_second': 0.009, 'epoch': 1.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 4152.8826, 'eval_samples_per_second': 0.072, 'eval_steps_per_second': 0.009, 'epoch': 2.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 4145.6745, 'eval_samples_per_second': 0.072, 'eval_steps_per_second': 0.009, 'epoch': 3.0}
{'train_runtime': 95539.6994, 'train_samples_per_second': 0.005, 'train_steps_per_second': 0.001, 'train_loss': 7.942982456140351, 'epoch': 3.0}
Training Loss: 7.942982456140351


In [None]:

evaluation_metrics = trainer.evaluate()
print("Evaluation Metrics:", evaluation_metrics)

  0%|          | 0/38 [00:00<?, ?it/s]

KeyboardInterrupt: 

I first trained mt5-small and it gave a training loss of around 60, which is very bad. I assumed it happened because the subset with questions that had an answer in the same language was very small, so I performed bootstrap. It helped a little bit, reducing the error to approximately 45. The second model is t5-small performed without bootstrap, which took orders of magnitude more time to train, but offerent a training loss much much smaller, of just 7.