In [30]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import torch
import random
import numpy as np
import pandas as pd
import nltk 
from datasets import Dataset

def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

train_data  = pd.read_parquet("dataset/train.parquet")
test_data   = pd.read_parquet("dataset/validation.parquet")




In [11]:
train_data.loc[(train_data["answer_inlang"].notnull())]

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
15076,অস্ট্রেলীয় ক্রিকেটার ডেভিড অ্যান্ড্রু ওয়ার্ন...,David Andrew Warner (; born 27 October 1986) i...,bn,True,28,27 October 1986,27 অক্টোবর 1986
15077,আচেহ সালতানাতের তৃতীয় সুলতান কে ছিলেন ?,Sultan Salahuddin (died 25 November 1548) was ...,bn,False,-1,Alauddin,আলাউদ্দিন
15078,কত সালে আডলফ হিটলারের মূল ভাস্কর্যটি মাদাম তুস...,"In July 2008, the Berlin branch of Madame Tuss...",bn,True,511,1933,1933
15079,কোন সালে প্রথম ব্লেড আবিষ্কৃত হয় ?,The first step towards a safer-to-use razor wa...,bn,False,-1,1700 BC,1700 খ্রিস্টপূর্ব
15080,জ্যোতির্বিজ্ঞানী রাধাগোবিন্দ চন্দ্র প্রথম জীবন...,"Radha Gobind Chandra (16 July 1878, Bagchar vi...",bn,False,-1,Jessore Collectorate Office,যশোর কালেক্টরেট অফিস
...,...,...,...,...,...,...,...
15321,కోళ్లు ఎక్కువగా ఏ దేశంలో కనిపిస్తాయి?,"Since time immemorial, man has been practicing...",te,False,-1,United States of America,అమెరికా సంయుక్త రాష్ట్రాలు
15322,క్షయ వ్యాధికి విరుగుడు ఏ దేశంలో కనుగొన్నారు?,Vaccines against anthrax for use in livestock ...,te,False,-1,France,ఫ్రాన్స్
15323,ఖురాన్ ఏ అరబ్బీ భాషలో ఎవరు రాసారు?,are broken Other Names of the Qur'an: It is be...,te,False,-1,Prophet Muhammad,ముహమ్మద్ ప్రవక్త
15324,టెక్సస్ రాష్ట్రంలోని అతిపెద్ద మానవ నిర్మితం ఏది ?,Austin is the capital of the US state of Texas...,te,False,-1,JP Morgan Chase Tower,జేపీ మోర్గాన్ ఛేజ్ టవర్


In [12]:
train_with_answer_inlang = train_data.loc[(train_data["answer_inlang"].notna())]
val_with_answer_inlang = test_data.loc[(test_data["answer_inlang"].notna())]


train_data = train_with_answer_inlang[train_with_answer_inlang["lang"].isin(["fi", "ru", "ja"])][["question","context","answer_inlang"]]
val_data = val_with_answer_inlang[val_with_answer_inlang["lang"].isin(["fi", "ru", "ja"])][["question","context","answer_inlang"]]



In [13]:
def bootstrap(df, n_samples=10, sample_size=None, random_state=None):
    np.random.seed(random_state)  # Set random seed for reproducibility
    sample_size = sample_size or len(df)
    bootstrapped_samples = []

    for _ in range(n_samples):
        # Sample with replacement from the DataFrame
        sample = df.sample(n=sample_size, replace=True, random_state=np.random.randint(0, 1e6))
        bootstrapped_samples.append(sample)
    bootstrapped_df = pd.concat(bootstrapped_samples, ignore_index=True)

    return bootstrapped_df

train_data = bootstrap(train_data)
val_data = bootstrap(val_data)

In [14]:
print(train_data)

                                               question  \
0               Где находится Государственный архив РФ?   
1            Какая площадь потолка Сикстинской капеллы?   
2     Когда вышел фильм «Мсти́тели» на российские эк...   
3                            日本で学術研究全てを監視を統括する一つの機関はある？   
4                         Missä Maskun kunta sijaitsee?   
...                                                 ...   
1495                                    Croteamはいつ設立した？   
1496                     Miten linnalääni määritellään?   
1497                             日本の自衛隊でオリエンテーリングは行われる？   
1498              Minä vuonna Tom Fletcher on syntynyt?   
1499  Minä vuonna HMS Belvoir sijoitettiin poistolis...   

                                                context  \
0     the title "Central State Archive of the Octobe...   
1     Ceiling of the Sistine Chapel - The painting o...   
2     films were handled by Paramount, in contrast t...   
3     A graduate school is the basic organization of...

## Question + context model

In [15]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()



In [16]:
train_data = train_data.reset_index(drop=True)
train_dataset = Dataset.from_pandas(train_data)

val_data = val_data.reset_index(drop=True)
val_dataset = Dataset.from_pandas(val_data)


def preprocess_function(row):
    inputs = [f"question: {q} context: {c}" for q, c in zip(row["question"], row["context"])]
    targets = row["answer_inlang"]


    model_inputs = tokenizer(inputs, max_length=512, truncation=True,  padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset_val = val_dataset.map(preprocess_function, batched=True)

print("Original dataset size:", len(train_data))  # Before tokenization
print("Tokenized dataset size:", len(train_dataset))  # After tokenization

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Original dataset size: 1500
Tokenized dataset size: 1500


In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # Optional additional arguments:
    fp16=True                        # If you have GPU with mixed precision support
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer
)



In [18]:
training_output = trainer.train()

  0%|          | 0/564 [00:00<?, ?it/s]

  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 39.53197479248047, 'eval_runtime': 372.0852, 'eval_samples_per_second': 8.063, 'eval_steps_per_second': 1.008, 'epoch': 1.0}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 32.2401237487793, 'eval_runtime': 372.8057, 'eval_samples_per_second': 8.047, 'eval_steps_per_second': 1.006, 'epoch': 2.0}
{'loss': 46.7583, 'grad_norm': 332.6312561035156, 'learning_rate': 1.1347517730496454e-06, 'epoch': 2.66}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 30.537431716918945, 'eval_runtime': 400.0355, 'eval_samples_per_second': 7.499, 'eval_steps_per_second': 0.937, 'epoch': 3.0}
{'train_runtime': 3922.6066, 'train_samples_per_second': 1.147, 'train_steps_per_second': 0.144, 'train_loss': 45.91505810893174, 'epoch': 3.0}


In [22]:
evaluation_metrics = trainer.evaluate()
print("Evaluation Metrics:", evaluation_metrics)

  0%|          | 0/375 [00:00<?, ?it/s]

Evaluation Metrics: {'eval_loss': 30.537431716918945, 'eval_runtime': 397.5498, 'eval_samples_per_second': 7.546, 'eval_steps_per_second': 0.943, 'epoch': 3.0}


In [21]:
def generate_answer(question, context):
    model_path = "./results/checkpoint-57"  # Adjust to your model checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    
    # Format input with question and context for better performance
    input_text = f"question: {question} context: {context}"
    
    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    
    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)
    
    # Generate answer with tuned parameters
    output_ids = model.generate(
        **inputs,
        max_length=64,
        num_beams=6,
        early_stopping=True,
        length_penalty=2.0,
        temperature=0.7,
        top_k=50
    )
    
    # Decode the output tokens to text
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer

index = 0
question = train_data.iloc[index,:]["question"]
context = train_data.iloc[index,:]["context"]
answer = generate_answer(question, context)

print("Generated Answer:", answer)


Generated Answer: <extra_id_0> of Russia


## Question model

In [35]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load datasets
train_data = pd.read_parquet("dataset/train.parquet")
test_data = pd.read_parquet("dataset/validation.parquet")

# Filter data for in-language answers and specific languages
train_with_answer_inlang = train_data.loc[train_data["answer_inlang"].notna()]
val_with_answer_inlang = test_data.loc[test_data["answer_inlang"].notna()]

train_data = train_with_answer_inlang[train_with_answer_inlang["lang"].isin(["fi", "ru", "ja"])][["question", "context", "answer_inlang"]]
val_data = val_with_answer_inlang[val_with_answer_inlang["lang"].isin(["fi", "ru", "ja"])][["question", "context", "answer_inlang"]]

# Reset indices and convert to Hugging Face Dataset format
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Preprocess function for Seq2SeqTrainer
def preprocess_function(row):
    inputs = [f"question: {q} context: {c}" for q, c in zip(row["question"], row["context"])]
    targets = row["answer_inlang"]
    
    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize datasets
tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset_val = val_dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True  # Optional, if GPU with mixed precision support
)

# Initialize Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer
)

# Train model and calculate training loss
train_output = trainer.train()
training_loss = train_output.training_loss
print("Training Loss:", training_loss)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]



Map:   0%|          | 0/300 [00:00<?, ? examples/s]



  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 4162.9991, 'eval_samples_per_second': 0.072, 'eval_steps_per_second': 0.009, 'epoch': 1.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 4152.8826, 'eval_samples_per_second': 0.072, 'eval_steps_per_second': 0.009, 'epoch': 2.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 4145.6745, 'eval_samples_per_second': 0.072, 'eval_steps_per_second': 0.009, 'epoch': 3.0}
{'train_runtime': 95539.6994, 'train_samples_per_second': 0.005, 'train_steps_per_second': 0.001, 'train_loss': 7.942982456140351, 'epoch': 3.0}
Training Loss: 7.942982456140351


In [36]:

evaluation_metrics = trainer.evaluate()
print("Evaluation Metrics:", evaluation_metrics)

  0%|          | 0/38 [00:00<?, ?it/s]

KeyboardInterrupt: 

I first trained mt5-small and it gave a training loss of around 60, which is very bad. I assumed it happened because the subset with questions that had an answer in the same language was very small, so I performed bootstrap. It helped a little bit, reducing the error to approximately 45. The second model is t5-small performed without bootstrap, which took orders of magnitude more time to train, but offerent a training loss much much smaller, of just 7. 