In [1]:
import os
from typing import Tuple

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import argparse

import pandas as pd
import pickle
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Directory holding the local checkpoint; shared by the model and tokenizer
# loaders so the two can never point at different folders.
MODEL_DIR = '../../../models/llama3_8b_chat_hf/'

# Prefer the first GPU, but fall back to CPU so the cell still runs (slowly)
# on a machine without CUDA instead of raising inside `.to(device)`.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(MODEL_DIR).to(device)
model.eval()  # inference only: switch off dropout and other train-time behaviour
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

Loading checkpoint shards: 100%|██████████| 4/4 [00:18<00:00,  4.58s/it]


In [3]:
def generate(tokenizer, model, prompt: str) -> Tuple[str, int]:
    """Return the single most likely next token for ``prompt`` (greedy step).

    Args:
        tokenizer: Object providing ``encode(text, return_tensors="pt")`` and
            ``decode(list_of_ids)``.
        model: Callable that accepts the encoded ids and returns an object with
            a ``logits`` tensor; it must also expose a ``device`` attribute.
        prompt: Text to condition the prediction on.

    Returns:
        ``(token_text, token_id)`` where ``token_id`` is the arg-max over the
        logits at the last input position and ``token_text`` is that id
        decoded back to a string.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # No gradients are needed for inference; skipping them saves memory.
    with torch.no_grad():
        logits = model(input_ids).logits
        # Index 0 selects the only sequence in the batch, -1 the final position.
        last_token_logits = logits[0, -1, :]
        top_unconst_token_id = torch.argmax(last_token_logits).item()

    top_unconst_token = tokenizer.decode([top_unconst_token_id])
    return top_unconst_token, top_unconst_token_id

## Prompting cities

In [4]:
# Cities question set and the country-name lookup used to score the answers.
dataset_path = "../../datasets/cities_expansions/cities_questions_augmented_2.csv"
countries_pickle_path = '../../datasets/cities_expansions/countries_dictionary.pkl'

df_all = pd.read_csv(dataset_path)

# NOTE: unpickling can execute arbitrary code; only load files you trust.
# `country_names` is indexed by a row's `correct_country` when scoring, and the
# value it returns is tested with `in` against the model's answer.
with open(countries_pickle_path, 'rb') as pickle_file:
    country_names = pickle.load(pickle_file)

print(df_all.shape[0])
df_all.head()

9984


Unnamed: 0.1,Unnamed: 0,statement,label,city,country,correct_country,question,question_with_answer
0,0,The city of Kostomloty Pierwsze is in Poland.,1,Kostomloty Pierwsze,Poland,Poland,In which country is the city of Kostomloty Pie...,In which country is the city of Kostomloty Pie...
1,1,The city of Bognankro is in Côte d'Ivoire.,1,Bognankro,Côte d'Ivoire,Côte d'Ivoire,In which country is the city of Bognankro loca...,In which country is the city of Bognankro loca...
2,2,The city of Le Raincy is in France.,1,Le Raincy,France,France,In which country is the city of Le Raincy loca...,In which country is the city of Le Raincy loca...
3,3,The city of Tobol is in Kazakhstan.,1,Tobol,Kazakhstan,Kazakhstan,In which country is the city of Tobol located?,In which country is the city of Tobol located?...
4,4,The city of Tayabas Ibaba is in Philippines.,1,Tayabas Ibaba,Philippines,Philippines,In which country is the city of Tayabas Ibaba ...,In which country is the city of Tayabas Ibaba ...


In [5]:
# --- Configuration for the cities run ---------------------------------------
CHUNK_SIZE = 1000       # rows answered (and written to their own CSV) per outer pass
FIRST_CHUNK = 3         # NOTE(review): chunks 0-2 are skipped, presumably done in an earlier run - confirm
MAX_NEW_TOKENS = 1000   # hard cap on greedy decoding steps per question
STOP_PHRASE = "I finished"
# Removes the trailing terminator (optional punctuation/whitespace + stop phrase).
# Anchored at the end, so answers that never reached the stop phrase stay intact;
# a fixed-width slice would chop 12 characters off them.
STOP_SUFFIX_REGEX = r"[.\s]*I finished\.?\s*$"

# Instruction plus two worked examples; the row's own question is appended to it.
CITY_PROMPT_PREFIX = "End your answer with the sentence 'I finished'. Do not consider England, Scotland, Wales and Northern Ireland countries, the country is the UK. Here are some examples of questions that might help you. In which country is the city of Barcelona located? Spain. I finished. In which country is the city of Berlin located? Germany. I finished. "


def _is_correct_country(row) -> bool:
    """Return True when the row's cleaned answer is among the accepted country names.

    Rows that were not prompted have no string answer (None/NaN); they are
    reported as incorrect without touching ``country_names``.
    """
    guess = row['filtered_answer']
    return isinstance(guess, str) and guess in country_names[row['correct_country']]


# Ceiling division, so a final partial chunk is still processed.
n_chunks = -(-len(df_all) // CHUNK_SIZE)

for j in range(FIRST_CHUNK, n_chunks):
    print(j)
    # Explicit copy: new columns are added below, and writing into a slice of
    # `df_all` triggers SettingWithCopyWarning.
    df = df_all[j*CHUNK_SIZE:(j+1)*CHUNK_SIZE].copy()
    answer_list, questions = [], []
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        if row['label'] != 1:
            # Only rows labelled 1 are prompted; placeholders keep the lists row-aligned.
            answer_list.append(None)
            questions.append(None)
            continue

        question = CITY_PROMPT_PREFIX + row["question"]
        answer = ''

        # Greedy decoding, one token per call: each new token is appended to the
        # prompt and the whole prompt is fed back in.
        for _ in range(MAX_NEW_TOKENS):
            token_decoded, token_id = generate(tokenizer, model, question)
            if token_id == tokenizer.eos_token_id:
                # The model ended the sequence itself; nothing useful follows.
                break

            question = question + token_decoded
            answer = answer + token_decoded

            # Check right after appending so no extra forward pass is spent
            # once the stop phrase has been produced.
            if STOP_PHRASE in answer:
                break

        answer_list.append(answer.strip())
        questions.append(question)

    df['answer'] = answer_list
    df['filtered_answer'] = df['answer'].str.replace(STOP_SUFFIX_REGEX, "", regex=True)
    df['is_correct'] = df.apply(_is_correct_country, axis=1)

    # Accuracy is measured over the prompted rows only; guard the empty case.
    n_prompted = int((df['label'] == 1).sum())
    accuracy = df['is_correct'].sum() / n_prompted if n_prompted else float('nan')
    print('Accuracy: ' + str(accuracy))
    df.to_csv("../../datasets/cities_expansions/cities_questions_augmented_2_labeled_" + str(j) + ".csv")

2


  0%|          | 0/1000 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
 55%|█████▍    | 546/1000 [07:50<06:31,  1.16it/s]


KeyboardInterrupt: 

In [None]:
# Write whatever frame is currently bound to `df` under the un-suffixed name.
# NOTE(review): the chunk loop rebinds `df` to a 1000-row slice on every pass,
# so this presumably saves only the most recent chunk - confirm that is intended.
labeled_csv_path = "../../datasets/cities_expansions/cities_questions_augmented_2_labeled.csv"
df.to_csv(labeled_csv_path)

## Prompting birth years

In [3]:
# Birth-year question set. This rebinds `dataset_path` and `df_all`, replacing
# the cities values loaded earlier in the notebook.
# NOTE(review): this path starts with "../" while the cities paths start with
# "../../" - confirm which working directory the notebook is meant to run from.
dataset_path = "../datasets/custom/birth_years.csv"
df_all = pd.read_csv(dataset_path)

# The country-name dictionary belongs to the cities task and is not loaded here.

print(df_all.shape[0])
df_all.head()

4430


Unnamed: 0,question,answer,filtered_answer,correct_answer,is_correct
0,What year was Hank Aaron born?,-,-,1934,-
1,What year was Sani Abacha born?,-,-,1943,-
2,What year was Claudio Abbado born?,-,-,1933,-
3,What year was Mahmoud Abbas born?,-,-,1935,-
4,What year was Omar Abdel Rahman born?,-,-,1938,-


In [None]:
# Scratch cell: greedily decode a continuation of `question`, one token per
# call, until the stop phrase appears, the model emits its end-of-sequence
# token, or 1000 tokens have been produced.
# NOTE(review): `question` starts empty, so the model is prompted with no text
# of its own - fill it in before running.
question = ""
answer = ''

for _ in range(1000):
    token_decoded, token_id = generate(tokenizer, model, question)
    if token_id == tokenizer.eos_token_id:
        # The model ended the sequence itself; stop instead of decoding past it.
        break

    question = question + token_decoded
    answer = answer + token_decoded

    # Check right after appending so no extra forward pass is spent once the
    # stop phrase has been produced.
    if "I finished" in answer:
        break

In [None]:
# Instruction plus two worked examples; the row's own question is appended to it.
# NOTE(review): the text contains "examples of question" and "born?. 1987" - left
# untouched so results stay comparable with earlier runs; fix deliberately if wanted.
BIRTH_YEAR_PROMPT_PREFIX = "End your answer with the sentence 'I finished'. Here are some examples of question that might help you. What year was Claudio Abbado born? 1933. I finished. What year was Lionel Messi born?. 1987. I finished. "
# Results go next to the birth-years input, not into the cities output files.
BIRTH_YEARS_OUTPUT_PATH = "../datasets/custom/birth_years_labeled.csv"
BIRTH_YEAR_MAX_NEW_TOKENS = 1000  # hard cap on greedy decoding steps per question

# Fail fast if `df_all` still holds another dataset (stale kernel state).
missing_columns = {"question", "correct_answer"} - set(df_all.columns)
assert not missing_columns, (
    "df_all does not look like the birth-years frame; missing columns: "
    + str(sorted(missing_columns))
)

# Work on a copy of the frame loaded in this section; every row is a question,
# so there is no `label` filter here.
df = df_all.copy()

answer_list, questions = [], []
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    question = BIRTH_YEAR_PROMPT_PREFIX + row["question"]
    answer = ''

    # Greedy decoding, one token per call: each new token is appended to the
    # prompt and the whole prompt is fed back in.
    for _ in range(BIRTH_YEAR_MAX_NEW_TOKENS):
        token_decoded, token_id = generate(tokenizer, model, question)
        if token_id == tokenizer.eos_token_id:
            # The model ended the sequence itself; nothing useful follows.
            break

        question = question + token_decoded
        answer = answer + token_decoded

        # Check right after appending so no extra forward pass is spent once
        # the stop phrase has been produced.
        if "I finished" in answer:
            break

    answer_list.append(answer.strip())
    questions.append(question)

df['answer'] = answer_list
# Strip the trailing terminator (optional punctuation/whitespace + stop phrase).
# The pattern is anchored at the end, so answers that never reached the stop
# phrase are left intact instead of losing their last 12 characters.
df['filtered_answer'] = (
    df['answer']
    .str.replace(r"[.\s]*I finished\.?\s*$", "", regex=True)
    .str.strip()
)

# Compare as text: the model's answer is a string, the reference is numeric.
# Going through nullable Int64 keeps "1934" from becoming "1934.0" if the
# column was read as float.
expected_year = (
    pd.to_numeric(df['correct_answer'], errors='coerce')
    .astype('Int64')
    .astype(str)
)
df['is_correct'] = df['filtered_answer'] == expected_year

print('Accuracy: ' + str(df['is_correct'].mean()))
df.to_csv(BIRTH_YEARS_OUTPUT_PATH)