Ran on a Google Colab A100 (40 GB) instance.

In [None]:
%%capture
# Install the lens library imported below; %%capture hides pip's output.
!pip install tuned-lens
# NOTE(review): googletrans is not imported in any cell visible here — confirm it is still needed.
!pip install googletrans

In [None]:
import re
import os
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
import json

# Hugging Face access token, read from the environment by the Hub libraries.
# Do not paste a real token into the notebook: export HF_TOKEN (or use a
# secrets manager) before starting the kernel. setdefault keeps a token that
# is already exported instead of overwriting it with an empty string.
os.environ.setdefault("HF_TOKEN", "")

# Every input is read from ROOT_DIR and every output is written to
# ROOT_DIR + SAVE_DIR.
# NOTE(review): ROOT_DIR is a Google Drive path; presumably Drive is mounted
# before this cell runs — confirm.
ROOT_DIR = "/content/drive/MyDrive/naomi-experiment"
SAVE_DIR = "/indeterminant"

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(f"{ROOT_DIR}{SAVE_DIR}", exist_ok=True)

In [None]:
import torch
from tuned_lens.nn.lenses import TunedLens, LogitLens
from transformers import AutoModelForCausalLM, AutoTokenizer
from tuned_lens.plotting import PredictionTrajectory

# Checkpoint name shared by the model and its tokenizer.
MODEL_ID = 'meta-llama/Llama-3.1-8B'

device = torch.device('cuda')
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Load the weights, then move the whole model onto the GPU.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [None]:
# The pretrained tuned lens is left disabled; the cells below only pass `logit_lens`.
# lens = TunedLens.from_model_and_pretrained(model, "/content/drive/MyDrive/naomi-experiment/tuned-lens/v0.7/").to(device)
# Build a logit lens from the loaded model and move it to the same device.
logit_lens = LogitLens.from_model(model).to(device)

In [None]:
def output_predictions(prompt, lens_list, min_answer_tokens=8):
    """Greedy-decode a continuation of ``prompt`` and decode each lens's per-layer argmax.

    The prompt is tokenized (with special tokens), continued with
    ``model.generate`` and the full prompt + continuation sequence is passed
    through every lens. For each layer the most likely token at every
    sequence position is decoded into one string.

    Args:
        prompt: Text to continue.
        lens_list: Lenses accepted by ``PredictionTrajectory.from_lens_and_model``.
        min_answer_tokens: Unused; kept so existing calls stay valid.

    Returns:
        dict with
          * ``"actual_text"``: decoded generated tokens only,
          * ``"full_text"``: decoded prompt + generated tokens,
          * one entry per lens, keyed by the lens class name, holding a list
            of ``"Layer {i}: {decoded argmax tokens}"`` strings. Two lenses of
            the same class overwrite each other because they share a key.
    """
    input_ids = tokenizer.encode(prompt, add_special_tokens=True)
    prompt_length = len(input_ids)

    input_tensor = torch.tensor([input_ids]).to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_tensor,
            # Single unpadded sequence: every position is a real token. Passing
            # the mask explicitly avoids the "attention mask is not set"
            # warning caused by pad_token_id == eos_token_id.
            attention_mask=torch.ones_like(input_tensor),
            # Greedy decoding; `temperature` is ignored in this mode, so it is
            # no longer passed.
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    full_sequence = output[0].cpu().tolist()
    answer_tokens = full_sequence[prompt_length:]  # only the generated tokens

    result = {
        "actual_text": tokenizer.decode(answer_tokens, skip_special_tokens=False),
        "full_text": tokenizer.decode(full_sequence, skip_special_tokens=False),
    }

    # Next-token targets: the sequence shifted left by one, closed with EOS.
    targets = full_sequence[1:] + [tokenizer.eos_token_id]

    for lens in lens_list:
        pred_traj = PredictionTrajectory.from_lens_and_model(
            lens=lens,
            model=model,
            input_ids=full_sequence,
            tokenizer=tokenizer,
            targets=targets,
        )

        # argmax is taken on the log-probabilities directly: exp() is
        # monotonic, so the winning token is the same and no second
        # full-size array has to be materialised.
        predicted_tokens = pred_traj.log_probs.argmax(axis=-1)

        lens_predictions = []
        for layer_idx, layer_tokens in enumerate(predicted_tokens):
            decoded_text = tokenizer.decode(
                layer_tokens.astype(int).tolist(), skip_special_tokens=False
            )
            lens_predictions.append(f"Layer {layer_idx}: {decoded_text}")

        result[lens.__class__.__name__] = lens_predictions

    return result

def _lens_topk_at_position(prompt, lens_list, topk, position_offset, position_key):
    """Shared implementation of the two top-k lens read-outs defined below.

    Greedy-decodes a continuation of ``prompt``, runs every lens over the
    full prompt + continuation token sequence and, for a single sequence
    position, collects each layer's ``topk`` most probable tokens.

    Args:
        prompt: Text to tokenize (with special tokens) and continue.
        lens_list: Lenses accepted by ``PredictionTrajectory.from_lens_and_model``.
        topk: Number of tokens kept per layer; must be at least 1.
        position_offset: Added to the prompt's token count to obtain the
            sequence index that is read.
        position_key: Key under which that index is stored in the result.

    Returns:
        dict with ``"actual_text"``, ``"full_text"``, ``"prompt"``,
        ``position_key`` and one entry per lens (keyed by the lens class
        name) holding a list of ``{"layer", "predictions"}`` dicts, where
        ``predictions`` is ordered from most to least probable and each item
        has ``"token_id"``, ``"token"`` and ``"confidence"``.

    Raises:
        ValueError: If ``topk`` is smaller than 1.
    """
    if topk < 1:
        # ``array[-0:]`` is the whole array, so topk=0 would silently return
        # the entire vocabulary instead of nothing.
        raise ValueError(f"topk must be >= 1, got {topk}")

    input_ids = tokenizer.encode(prompt, add_special_tokens=True)
    prompt_length = len(input_ids)
    position = prompt_length + position_offset

    input_tensor = torch.tensor([input_ids]).to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_tensor,
            # Single unpadded sequence: every position is a real token. Passing
            # the mask explicitly avoids the "attention mask is not set"
            # warning caused by pad_token_id == eos_token_id.
            attention_mask=torch.ones_like(input_tensor),
            # Greedy decoding; `temperature` is ignored in this mode, so it is
            # no longer passed.
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    full_sequence = output[0].cpu().tolist()
    answer_tokens = full_sequence[prompt_length:]

    result = {
        "actual_text": tokenizer.decode(answer_tokens, skip_special_tokens=False),
        "full_text": tokenizer.decode(full_sequence, skip_special_tokens=False),
        "prompt": prompt,
        position_key: position,
    }

    # Next-token targets: the sequence shifted left by one, closed with EOS.
    targets = full_sequence[1:] + [tokenizer.eos_token_id]

    for lens in lens_list:
        pred_traj = PredictionTrajectory.from_lens_and_model(
            lens=lens,
            model=model,
            input_ids=full_sequence,
            tokenizer=tokenizer,
            targets=targets,
        )

        # log_probs is [num_layers, sequence_length, vocab_size]. Slice the one
        # position first so exp() only touches [num_layers, vocab_size].
        position_probs = np.exp(pred_traj.log_probs[:, position, :])

        layer_predictions = []
        for layer_idx, layer_probs in enumerate(position_probs):
            # Ascending argsort; keep the last `topk` and reverse so the most
            # probable token comes first.
            top_k_indices = np.argsort(layer_probs)[-topk:][::-1]
            predictions = [
                {
                    "token_id": int(token_id),
                    "token": tokenizer.decode([int(token_id)]),
                    "confidence": float(layer_probs[token_id]),
                }
                for token_id in top_k_indices
            ]
            layer_predictions.append({
                "layer": layer_idx,
                "predictions": predictions
            })

        result[lens.__class__.__name__] = layer_predictions

    return result


def output_last_prediction(prompt, lens_list, topk=10):
    """Per-layer top-k lens predictions read at the last prompt token.

    The index read is ``prompt_length - 1``. Because the targets are the
    sequence shifted by one, this is each layer's prediction for the first
    generated token. The index is returned under ``"last_prompt_position"``.

    Args:
        prompt: Text to continue.
        lens_list: Lenses to evaluate.
        topk: Number of tokens to report per layer (>= 1).

    Returns:
        dict as described in ``_lens_topk_at_position``.
    """
    return _lens_topk_at_position(
        prompt,
        lens_list,
        topk,
        position_offset=-1,
        position_key="last_prompt_position",
    )


def output_first_generated_prediction(prompt, lens_list, topk=10):
    """Per-layer top-k lens predictions read at the first generated token.

    The index read is ``prompt_length``, i.e. the position that holds the
    first generated token. The index is returned under
    ``"first_generated_position"``.

    NOTE(review): with targets shifted by one, the distribution at this index
    is the prediction for the token *after* the first generated one — confirm
    this is the intended position.

    Args:
        prompt: Text to continue.
        lens_list: Lenses to evaluate.
        topk: Number of tokens to report per layer (>= 1).

    Returns:
        dict as described in ``_lens_topk_at_position``.
    """
    return _lens_topk_at_position(
        prompt,
        lens_list,
        topk,
        position_offset=0,
        position_key="first_generated_position",
    )

def get_prompt(text, language):
    """Build the one-word-answer prompt for a question.

    Args:
        text: The question to insert after ``Question:``.
        language: Language name shown in parentheses in the instruction.

    Returns:
        The prompt string, ending with ``Answer:`` and no trailing newline.
    """
    instruction = f"Answer the following question in EXACTLY ONE WORD ({language}). "
    constraints = "No explanations. No punctuation. No extra text.\n\n"
    question_line = f"Question: {text}\n"
    return "".join([instruction, constraints, question_line, "Answer:"])

def generate_response(prompt):
    """Greedy-decode a continuation of ``prompt`` and return its first line.

    Args:
        prompt: Text to continue.

    Returns:
        The generated text (special tokens skipped), stripped of surrounding
        whitespace and cut at the first newline.
    """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    prompt_token_count = input_ids.shape[1]

    output = model.generate(
        input_ids,
        # Single unpadded sequence: every position is a real token. Passing the
        # mask explicitly avoids the "attention mask is not set" warning caused
        # by pad_token_id == eos_token_id.
        attention_mask=torch.ones_like(input_ids),
        # Greedy decoding; `temperature` is ignored in this mode, so it is no
        # longer passed.
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt by token count rather than by len(prompt) characters.
    # The decoded prompt is not guaranteed to match the input string character
    # for character (e.g. when decoding cleans up spaces before punctuation),
    # and any mismatch would shift the slice into the answer.
    new_tokens = output[0][prompt_token_count:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return generated_text.split("\n")[0]

In [None]:
def _load_annotations(filename, column_renames):
    """Read one annotation file from ROOT_DIR, transpose it and rename columns."""
    return pd.read_json(f"{ROOT_DIR}/{filename}").T.rename(columns=column_renames)


# Each frame ends up with an `answers` column that the later cells write out.
df_english = _load_annotations(
    "annotated_cultural_data_english.json",
    {'english_answer': 'answers'},
)
df_bengali = _load_annotations(
    "annotated_cultural_data.json",
    {'bengali_answers': 'answers'},
)
df_assamese = _load_annotations(
    "annotated_cultural_data_assam.json",
    {'english_answer': 'answers'},
)
# In the Spanish file the two answer columns are renamed in one step:
# 'english_answers' becomes 'answers' and 'spanish_answers' becomes 'english_answers'.
df_spanish = _load_annotations(
    "annotated_cultural_data_spanish.json",
    {'english_answers': 'answers', 'spanish_answers':'english_answers'},
)

In [None]:
# Preview the first rows of the Bengali frame, including the renamed `answers` column.
df_bengali.head()

Unnamed: 0,bengali_question,answers,original_assamese_question,original_english_question,timestamp
Al-en-01,বাংলাদেশের প্রি-স্কুল শিশুদের জন্য একটি সাধারণ...,"[পিঠা , ঝালমুড়ি , চিপস , ফুচকা ]",অসমত প্ৰাক-প্রাথমিক বিদ্যালয়ৰ শিশুসকলৰ বাবে এ...,What is a common snack for preschool kids in A...,2025-09-14T12:02:34.365Z
Al-en-02,বাংলাদেশে বিয়ারের সাথে কোন খাবারটি জনপ্রিয়?,"[ঝালমুড়ি , সমুচা , সিঙারা ]",অসমত বিয়েৰৰ লগত খোৱা জনপ্ৰিয় খাদ্য সামগ্ৰীটো...,What is a popular food to go with beer in Assam?,2025-09-14T12:03:28.582Z
Al-en-04,বাংলাদেশের সবচেয়ে জনপ্রিয় ফল কোনটি?,"[আম , কাঁঠাল , নারিকেল , লিচু , কলা ]",অসমত কি ফল সবাতোকৈ জনপ্ৰিয়?,What is the most popular fruit in Assam?,2025-09-14T12:04:07.790Z
Al-en-06,বাংলাদেশের স্কুল ক্যাফেটেরিয়ার সাধারণ খাবার কী?,"[চা , সমুচা , সিঙারা , মম , ডাল , ভাত , সবজি ভ...",অসমৰ বিদ্যালয়ৰ কেফেটেৰিয়াত সাধাৰণতে কি খাদ্য...,What is a common school cafeteria food in Assam?,2025-09-14T12:05:54.329Z
Al-en-08,বাংলাদেশের শপিং মলে সবচেয়ে বেশি খাওয়া খাবার কী?,"[ফুচকা , রুটি , ডাল , ভাজি , ঝালমুড়ি ]",অসমৰ শ্বপিং মলসমূহত সচৰাচৰ খোৱা জনপ্ৰিয় খাদ্য...,What are the most commonly eaten snacks at sho...,2025-09-14T12:06:37.706Z


In [None]:
# Collect every whitespace-separated token of the Bengali questions that
# contains the stem "বাংলা". A question is printed each time it contributes a
# token that has not been seen before.
bn_variations = set()

for idx, row in tqdm(df_bengali.iterrows(), total=len(df_bengali)):
  question_text = row.bengali_question
  for token in (w for w in question_text.split() if "বাংলা" in w):
    if token not in bn_variations:
      print(question_text)
    bn_variations.add(token)
print(bn_variations)

  0%|          | 0/409 [00:00<?, ?it/s]

বাংলাদেশের প্রি-স্কুল শিশুদের জন্য একটি সাধারণ খাবার কী?
বাংলাদেশে বিয়ারের সাথে কোন খাবারটি জনপ্রিয়?
অলিম্পিকে বাংলাদেশ কোন ক্রীড়া ইভেন্টে সবচেয়ে বেশি স্বর্ণপদক জিতেছে?
বাংলাদেশে, কোন গ্রেড স্তরে শিক্ষার্থীরা তাদের প্রথম জাতীয় স্তরের পরীক্ষা দেয়?
{'বাংলাদেশ', 'বাংলাদেশে,', 'বাংলাদেশে', 'বাংলাদেশের'}


In [None]:
# Preview the first rows of the English frame.
df_english.head()

Unnamed: 0,english_question,answers,original_english_question
Al-en-01,What is a common snack for preschool kids in t...,"[fruit, sandwiches, vegetables, PB&J, peanut b...",What is a common snack for preschool kids in US?
Al-en-02,What is a popular food to go with beer in the US?,"[nuts, peanuts, pretzels, barbeque, BBQ, fried...",What is a popular food to go with beer in US?
Al-en-04,What is the most popular fruit in the US?,"[apple, pear, peach, strawberries, strawberry,...",What is the most popular fruit in US?
Al-en-06,What is a common school cafeteria food in the US?,"[pizza, sandwich, sandwiches, hamburger, burge...",What is a common school cafeteria food in US?
Al-en-08,What are the most commonly eaten snacks at sho...,"[pretzels, pretzels (Annie's), pretzel, Starbu...",What are the most commonly eaten snacks at sho...


In [None]:
# Collect every "<word> the US" phrase (case-insensitive) found in the English
# questions, reporting each phrase the first time it appears.
en_variations = set()
us_phrase_pattern = re.compile(r'\b(\w+\s+the\s+US)\b', re.IGNORECASE)

for idx, row in tqdm(df_english.iterrows(), total=len(df_english)):
    for phrase in us_phrase_pattern.findall(row.english_question):
        if phrase in en_variations:
            continue
        en_variations.add(phrase)
        print(f"New variation: '{phrase}' in question: {row.english_question}")

print(en_variations)

  0%|          | 0/409 [00:00<?, ?it/s]

New variation: 'in the US' in question: What is a common snack for preschool kids in the US?
New variation: 'from the US' in question: How many meals per day do people from the US usually have? (Provide Arabic numerals (e.g., 5) only.)
New variation: 'is the US' in question: How long (in weeks) is the US's maternity leave for mums? (Provide Arabic numerals (e.g., 20) only.)
New variation: 'has the US' in question: What sports event has the US won the most gold medals at the Olympics?
New variation: 'of the US' in question: What is the representative export item of the US?
New variation: 'besides the US' in question: How many languages are studied in elementary education besides the US's official language? (Provide in Arabic numerals (e.g., 7, 8) only.)
New variation: 'for the US' in question: Which country is considered the biggest rival in soccer matches for the US?
New variation: 'with the US' in question: When is the day with the US's most severe nationwide traffic congestion?
New v

In [None]:
# Preview the first rows of the Spanish frame to check the swapped answer columns.
df_spanish.head()

Unnamed: 0,spanish_question,english_answers,answers,english_question
Al-en-01,¿Cuál es un tentempié común para niños de pree...,"[fruit, piece of fruit, sandwich, cookies, bis...","[fruta, pieza de fruta, sandwich, bocadillo, g...",What is a common snack for preschool kids in S...
Al-en-02,¿Cuál es una comida popular para acompañar la ...,"[tapas, spicy potatoes, bag of crisps, peanuts...","[tapas, patatas bravas, papas bravas, patatas ...",What is a popular food to go with beer in Spain?
Al-en-04,¿Cuál es la fruta más popular en España?,"[orange, melon, banana, watermelon, apple]","[naranja, melon, platano, plátano, sandia, man...",What is the most popular fruit in Spain?
Al-en-06,¿Cuál es una comida común en las cafeterías es...,"[spanish omelette, croquettes, chicken, macaro...","[tortilla de patatas, tortilla, croquetas, pol...",What is a common school cafeteria food in Spain?
Al-en-08,¿Cuáles son los aperitivos más consumidos en l...,"[french fried, fries, ice cream, ice creams, f...","[patatas fritas, helado, helados, yogurt helad...",What are the most commonly eaten snacks at sho...


In [None]:
# Collect every "España" mention, together with the word directly before it
# when there is one (case-insensitive), reporting each new phrase once.
es_variations = set()
spain_phrase_pattern = re.compile(r'\b(?:\w+\s+)?España\b', re.IGNORECASE)

for idx, row in tqdm(df_spanish.iterrows(), total=len(df_spanish)):
    for phrase in spain_phrase_pattern.findall(row.spanish_question):
        if phrase in es_variations:
            continue
        es_variations.add(phrase)
        print(f"New variation: '{phrase}' in question: {row.spanish_question}")

print(es_variations)

  0%|          | 0/409 [00:00<?, ?it/s]

New variation: 'en España' in question: ¿Cuál es un tentempié común para niños de preescolar en España?
New variation: 'de España' in question: ¿Cuál es una comida común en las cafeterías escolares de España?
New variation: 'ganado España' in question: ¿En qué evento deportivo ha ganado España más medallas de oro en los Juegos Olímpicos?
New variation: 'En España' in question: En España, ¿en qué curso los estudiantes hacen su primer examen de ámbito nacional?
{'en España', 'de España', 'En España', 'ganado España'}


In [None]:
# Preview the first rows of the Assamese frame. As the displayed rows show,
# the column named `english_question` holds the Assamese question text here.
df_assamese.head()

Unnamed: 0,english_question,answers,original_english_question
Al-en-01,অসমত প্ৰাক-প্রাথমিক বিদ্যালয়ৰ শিশুসকলৰ বাবে এ...,"[ভীমকল, চুজি, পিঠাগুৰি, দৈ-চিৰা, আলু পিতিকা-ভা...",What is a common snack for preschool kids in A...
Al-en-02,অসমত বিয়েৰৰ লগত খোৱা জনপ্ৰিয় খাদ্য সামগ্ৰীটো...,"[বুট, বাদাম, মাংস, মাংস ভাজি, মটৰ, কেচা চানা, ...",What is a popular food to go with beer in Assam?
Al-en-04,অসমত কি ফল সবাতোকৈ জনপ্ৰিয়?,"[মালভোগ কল, কল, আম, কঠাল, বগৰী, নাৰিকল]",What is the most popular fruit in Assam?
Al-en-06,অসমৰ বিদ্যালয়ৰ কেফেটেৰিয়াত সাধাৰণতে কি খাদ্য...,"[চাহ, পৰঠা, ৰুটি-ভাজি, ৰঙা চাহ, গাখীৰ চাহ, কফী...",What is a common school cafeteria food in Assam?
Al-en-08,অসমৰ শ্বপিং মলসমূহত সচৰাচৰ খোৱা জনপ্ৰিয় খাদ্য...,"[ফুচকা, ম'ম', ম'ম, ম ' ম ', বিস্কুত, কুহিয়াৰ ৰ...",What are the most commonly eaten snacks at sho...


In [None]:
# Collect every whitespace-separated token of the Assamese questions (stored
# in the `english_question` column) that contains the stem "অসম". A question
# is printed each time it contributes a token that has not been seen before.
# NOTE(review): the printed set shows entries that look identical; presumably
# they differ only in Unicode composition — confirm whether normalization is wanted.
as_variations = set()

for idx, row in tqdm(df_assamese.iterrows(), total=len(df_assamese)):
  question_text = row.english_question
  for token in (w for w in question_text.split() if "অসম" in w):
    if token not in as_variations:
      print(question_text)
    as_variations.add(token)

print(as_variations)

  0%|          | 0/409 [00:00<?, ?it/s]

অসমত প্ৰাক-প্রাথমিক বিদ্যালয়ৰ শিশুসকলৰ বাবে এটা সাধাৰণ জলপান কি?
অসমৰ বিদ্যালয়ৰ কেফেটেৰিয়াত সাধাৰণতে কি খাদ্য পৰিৱেশন কৰা হয়?
অসমীয়া ছাত্ৰ-ছাত্ৰীসকলে সাধাৰণতে উচ্চ মাধ্যমিক বিদ্যালয়ত দিনত কিমান ঘণ্টা সময় অতিবাহিত কৰে? (অৰবী সংখ্যা পদ্ধতিত পূৰ্ণাংক (0~24) প্ৰদান কৰক, দশমিক বিন্দু অবিহনে।)
অসমীয় লোকে সাধাৰণতে দিনত কিমান বাৰ আহাৰ কৰে? (কেৱল আৰবী সংখ্যা (যেনে, 5) প্ৰদান কৰক।)
অলিম্পিকত অসমে কোন ক্ৰীড়া ইভেণ্টত সৰ্বাধিক সোণৰ পদক জিকিছে?
অসমীয়া খাদ্যৰ বাহিৰে, কোন দেশৰ খাদ্য অসমত অধিক জনপ্ৰিয়?
COVID-19 মহামাৰীৰ সময়ত অসমীয় লোকসকলৰ মাজত কি কি ক্ৰীড়া জনপ্ৰিয় আছিল?
অসম চৰকাৰে কোনটো ক্ৰীড়াক সৰ্বাধিক সমৰ্থন প্ৰদান কৰে?
অসমত, উভয় পিতৃ-মাতৃয়ে কাম কৰা সময়ত শিশুসকলৰ যত্ন কোনে লয়?
{'অসমীয়', 'অসমত', 'অসমীয়', 'অসম', 'অসমীয়া', 'অসমে', 'অসমীয়া', 'অসমৰ', 'অসমত,'}


In [None]:
def _remove_phrases(text, phrases):
    """Delete each phrase (case-insensitive, on word boundaries) and collapse whitespace."""
    for phrase in phrases:
        text = re.sub(r'\b' + re.escape(phrase) + r'\b', '', text, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', text).strip()


def _remove_tokens(text, tokens):
    """Drop whitespace-separated tokens found in `tokens`; re-join the rest with single spaces."""
    return ' '.join(tok for tok in text.split() if tok not in tokens)


# Add a `neutral_question` column to every frame: the question with the
# country/region mentions collected above removed.
# NOTE(review): the collected English/Spanish phrases include the preceding
# word (e.g. 'is the US', 'ganado España'), so that word is removed as well,
# and a following "'s" or punctuation is left behind — confirm this is intended.

# English
df_english['neutral_question'] = df_english['english_question'].map(
    lambda question: _remove_phrases(question, en_variations)
)

# Bengali
df_bengali['neutral_question'] = df_bengali['bengali_question'].map(
    lambda question: _remove_tokens(question, bn_variations)
)

# Spanish
df_spanish['neutral_question'] = df_spanish['spanish_question'].map(
    lambda question: _remove_phrases(question, es_variations)
)

# Assamese (question text lives in the `english_question` column)
df_assamese['neutral_question'] = df_assamese['english_question'].map(
    lambda question: _remove_tokens(question, as_variations)
)

In [None]:
# Check the new `neutral_question` column on the Bengali frame.
df_bengali.head()

Unnamed: 0,bengali_question,answers,original_assamese_question,original_english_question,timestamp,neutral_question
Al-en-01,বাংলাদেশের প্রি-স্কুল শিশুদের জন্য একটি সাধারণ...,"[পিঠা , ঝালমুড়ি , চিপস , ফুচকা ]",অসমত প্ৰাক-প্রাথমিক বিদ্যালয়ৰ শিশুসকলৰ বাবে এ...,What is a common snack for preschool kids in A...,2025-09-14T12:02:34.365Z,প্রি-স্কুল শিশুদের জন্য একটি সাধারণ খাবার কী?
Al-en-02,বাংলাদেশে বিয়ারের সাথে কোন খাবারটি জনপ্রিয়?,"[ঝালমুড়ি , সমুচা , সিঙারা ]",অসমত বিয়েৰৰ লগত খোৱা জনপ্ৰিয় খাদ্য সামগ্ৰীটো...,What is a popular food to go with beer in Assam?,2025-09-14T12:03:28.582Z,বিয়ারের সাথে কোন খাবারটি জনপ্রিয়?
Al-en-04,বাংলাদেশের সবচেয়ে জনপ্রিয় ফল কোনটি?,"[আম , কাঁঠাল , নারিকেল , লিচু , কলা ]",অসমত কি ফল সবাতোকৈ জনপ্ৰিয়?,What is the most popular fruit in Assam?,2025-09-14T12:04:07.790Z,সবচেয়ে জনপ্রিয় ফল কোনটি?
Al-en-06,বাংলাদেশের স্কুল ক্যাফেটেরিয়ার সাধারণ খাবার কী?,"[চা , সমুচা , সিঙারা , মম , ডাল , ভাত , সবজি ভ...",অসমৰ বিদ্যালয়ৰ কেফেটেৰিয়াত সাধাৰণতে কি খাদ্য...,What is a common school cafeteria food in Assam?,2025-09-14T12:05:54.329Z,স্কুল ক্যাফেটেরিয়ার সাধারণ খাবার কী?
Al-en-08,বাংলাদেশের শপিং মলে সবচেয়ে বেশি খাওয়া খাবার কী?,"[ফুচকা , রুটি , ডাল , ভাজি , ঝালমুড়ি ]",অসমৰ শ্বপিং মলসমূহত সচৰাচৰ খোৱা জনপ্ৰিয় খাদ্য...,What are the most commonly eaten snacks at sho...,2025-09-14T12:06:37.706Z,শপিং মলে সবচেয়ে বেশি খাওয়া খাবার কী?


In [None]:
# Check the new `neutral_question` column on the Assamese frame.
df_assamese.head()

Unnamed: 0,english_question,answers,original_english_question,neutral_question
Al-en-01,অসমত প্ৰাক-প্রাথমিক বিদ্যালয়ৰ শিশুসকলৰ বাবে এ...,"[ভীমকল, চুজি, পিঠাগুৰি, দৈ-চিৰা, আলু পিতিকা-ভা...",What is a common snack for preschool kids in A...,প্ৰাক-প্রাথমিক বিদ্যালয়ৰ শিশুসকলৰ বাবে এটা সা...
Al-en-02,অসমত বিয়েৰৰ লগত খোৱা জনপ্ৰিয় খাদ্য সামগ্ৰীটো...,"[বুট, বাদাম, মাংস, মাংস ভাজি, মটৰ, কেচা চানা, ...",What is a popular food to go with beer in Assam?,বিয়েৰৰ লগত খোৱা জনপ্ৰিয় খাদ্য সামগ্ৰীটো কি?
Al-en-04,অসমত কি ফল সবাতোকৈ জনপ্ৰিয়?,"[মালভোগ কল, কল, আম, কঠাল, বগৰী, নাৰিকল]",What is the most popular fruit in Assam?,কি ফল সবাতোকৈ জনপ্ৰিয়?
Al-en-06,অসমৰ বিদ্যালয়ৰ কেফেটেৰিয়াত সাধাৰণতে কি খাদ্য...,"[চাহ, পৰঠা, ৰুটি-ভাজি, ৰঙা চাহ, গাখীৰ চাহ, কফী...",What is a common school cafeteria food in Assam?,বিদ্যালয়ৰ কেফেটেৰিয়াত সাধাৰণতে কি খাদ্য পৰিৱ...
Al-en-08,অসমৰ শ্বপিং মলসমূহত সচৰাচৰ খোৱা জনপ্ৰিয় খাদ্য...,"[ফুচকা, ম'ম', ম'ম, ম ' ম ', বিস্কুত, কুহিয়াৰ ৰ...",What are the most commonly eaten snacks at sho...,শ্বপিং মলসমূহত সচৰাচৰ খোৱা জনপ্ৰিয় খাদ্যবোৰ ক...


In [None]:
# (frame, language name used in the prompt, output path) for each dataset.
dataframes = [
    (df_english, "English", f"{ROOT_DIR}{SAVE_DIR}/predictions-english-generated.jsonl"),
    (df_bengali, "Bengali", f"{ROOT_DIR}{SAVE_DIR}/predictions-bengali-generated.jsonl"),
    (df_spanish, "Spanish", f"{ROOT_DIR}{SAVE_DIR}/predictions-spanish-generated.jsonl"),
    (df_assamese, "Assamese", f"{ROOT_DIR}{SAVE_DIR}/predictions-assamese-generated.jsonl")
]

# Reuse EOS as the padding token for generation.
tokenizer.pad_token = tokenizer.eos_token

# Write one JSON line per question with the model's greedy one-line answer.
for df, language, filepath in dataframes:
    # Records are written with ensure_ascii=False and contain non-ASCII text,
    # so the encoding is fixed explicitly instead of relying on the
    # platform's locale default.
    with open(filepath, "w", encoding="utf-8") as f:
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            prompt = get_prompt(row.neutral_question, language=language)
            generated_text = generate_response(prompt)

            # NOTE(review): this file uses the key "Idx" while the lens files
            # use "Id" — confirm downstream readers expect that difference.
            output = {
                "Idx": idx,
                "Question": row.neutral_question,
                "Answer": row.answers,
                "Generated": generated_text
            }

            f.write(json.dumps(output, ensure_ascii=False) + "\n")

  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]

In [None]:
# (frame, language name used in the prompt, output path) for each dataset.
dataframes = [
    (df_english, "English", f"{ROOT_DIR}{SAVE_DIR}/predictions-english-lens.jsonl"),
    (df_bengali, "Bengali", f"{ROOT_DIR}{SAVE_DIR}/predictions-bengali-lens.jsonl"),
    (df_spanish, "Spanish", f"{ROOT_DIR}{SAVE_DIR}/predictions-spanish-lens.jsonl"),
    (df_assamese, "Assamese", f"{ROOT_DIR}{SAVE_DIR}/predictions-assamese-lens.jsonl")
]

# Write one JSON line per question with the logit-lens top-20 tokens per
# layer, read at the last prompt position.
for df, language, filepath in dataframes:
    # Records are written with ensure_ascii=False and contain non-ASCII text,
    # so the encoding is fixed explicitly instead of relying on the
    # platform's locale default.
    with open(filepath, "w", encoding="utf-8") as f:
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            prompt = get_prompt(row.neutral_question, language=language)
            result = output_last_prediction(prompt, [logit_lens], topk=20)

            output = {
                "Id": idx,
                "Question": row.neutral_question,
                "Answer": row.answers,
                "Lens": result
            }

            f.write(json.dumps(output, ensure_ascii=False) + "\n")

  0%|          | 0/409 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]

In [None]:
# (frame, language name used in the prompt, output path) for each dataset.
dataframes = [
    (df_english, "English", f"{ROOT_DIR}{SAVE_DIR}/predictions-english-lens-firstToken.jsonl"),
    (df_bengali, "Bengali", f"{ROOT_DIR}{SAVE_DIR}/predictions-bengali-lens-firstToken.jsonl"),
    (df_spanish, "Spanish", f"{ROOT_DIR}{SAVE_DIR}/predictions-spanish-lens-firstToken.jsonl"),
    (df_assamese, "Assamese", f"{ROOT_DIR}{SAVE_DIR}/predictions-assamese-lens-firstToken.jsonl")
]

# Write one JSON line per question with the logit-lens top-20 tokens per
# layer, read at the position of the first generated token.
for df, language, filepath in dataframes:
    # Records are written with ensure_ascii=False and contain non-ASCII text,
    # so the encoding is fixed explicitly instead of relying on the
    # platform's locale default.
    with open(filepath, "w", encoding="utf-8") as f:
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            prompt = get_prompt(row.neutral_question, language=language)
            result = output_first_generated_prediction(prompt, [logit_lens], topk=20)

            output = {
                "Id": idx,
                "Question": row.neutral_question,
                "Answer": row.answers,
                "Lens": result
            }

            f.write(json.dumps(output, ensure_ascii=False) + "\n")

  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]