## Importing all the required libraries

In [1]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import os
from datasets import load_from_disk
os.environ['TRANSFORMERS_CACHE'] = '/scratch/workspace/wenlongzhao_umass_edu-analyze/transformers_cache'



## Loading the val dataset

In [2]:
data_feedback = load_from_disk("../datasets/gsm8k/feedback/")
data_feedback

Dataset({
    features: ['question', 'answer'],
    num_rows: 5082
})

In [3]:
data_train = load_from_disk("../datasets/gsm8k/train/")
data_train

Dataset({
    features: ['question', 'answer'],
    num_rows: 896
})

## Loading the model

In [19]:
# from enum import Enum

# class DefaultToken(Enum):
#     PAD_TOKEN = "[PAD]"
#     EOS_TOKEN = "</s>"
#     BOS_TOKEN = "<s>"
#     UNK_TOKEN = "<unk>"
#     IGNORE_INDEX = -100

In [18]:

# model_name = "meta-llama/Llama-3.2-3B-Instruct"
# config = AutoConfig.from_pretrained(model_name, token=hf_token)
# tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, config=config,cache_dir='/scratch/workspace/wenlongzhao_umass_edu-analyze/transformers_cache', model_max_length=256,padding_side="right")
# special_tokens = dict()
# if tokenizer.pad_token is None:
#     special_tokens["pad_token"] = DefaultToken.PAD_TOKEN.value
# if tokenizer.eos_token is None:
#     special_tokens["eos_token"] = DefaultToken.EOS_TOKEN.value
# if tokenizer.bos_token is None:
#     special_tokens["bos_token"] = DefaultToken.BOS_TOKEN.value
# if tokenizer.unk_token is None:
#     special_tokens["unk_token"] = DefaultToken.UNK_TOKEN.value

# num_new_tokens = tokenizer.add_special_tokens(special_tokens)


In [4]:
hf_token = os.getenv("hf_token")
model_name = "meta-llama/Llama-3.2-3B-Instruct"
config = AutoConfig.from_pretrained(model_name, token=hf_token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, config=config,cache_dir='/scratch/workspace/wenlongzhao_umass_edu-analyze/transformers_cache')
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token, config=config,cache_dir='/scratch/workspace/wenlongzhao_umass_edu-analyze/transformers_cache')
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'
# model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

device(type='cuda')

In [5]:
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

In [6]:
# def create_prompt(question):
#     prompt="Answer the below question: \n\n Question:\n\n"+question
#     return prompt

In [9]:
# import random
# def create_fs_prompt(question,n,x):
#     '''
#     question: The question we want the model to answer
#     n: The number of examples for few shot
#     x: The data from which we get the few shot examples.
#     '''
#     prompt=""
#     for i in range(n):
#         ind=random.randint(0, len(x))
#         prompt+="Question: "+x["question"][ind]+"\n\nAnswer: "+x["answer"][ind]+"\n\n"
#     prompt+="Question: "+question
#     return prompt
    
        
        
    

In [7]:
fs8 = "Examples:\n"

for i in range(9):
        fs8+="Q: "+data_train["question"][i]+"\nA: "+data_train["answer"][i]+"\n"

In [24]:
# import random
# def create_fs_prompt(question,n,x):
#     '''
#     question: The question we want the model to answer
#     n: The number of examples for few shot
#     x: The data from which we get the few shot examples.
#     '''
#     prompt=""
#     ind=random.randint(0, len(x)-9)
#     for i in range(ind,ind+9):
        
#         prompt+="Question: "+x["question"][i]+"\n\nAnswer: "+x["answer"][i]+"\n\n"
#     prompt+="Question: "+question
#     return prompt
    
        
        
    

In [8]:
# import random
# def create_fs_prompt(question,fs8):
#     '''
#     question: The question we want the model to answer
#     n: The number of examples for few shot
#     x: The data from which we get the few shot examples.
#     '''
#     prompt=""+fs8
#     prompt+="Question: "+question
#     return prompt
    
        
        
    

In [9]:
# # input_text = create_fs_prompt(data_feedback["question"][0],5,data_train)
# input_text = create_fs_prompt(data_feedback["question"][0],fs8)

In [10]:
# print(input_text)

In [11]:
# input_text=""+fs8
# input_text+="Question: "+data_feedback["question"][9]


In [25]:
import json
from tqdm import tqdm
model.eval()
generated_outputs=[]
cnt=0
batch_size=8 # Adjust batch size according to your GPU memory capacity
for i in tqdm(range(0, len(data_feedback["question"]), batch_size), desc="Processing questions"):
    batch_questions = data_feedback["question"][i:i+batch_size]
    inputs = [fs8+"Q: "+q+"\nA: " for q in batch_questions]
    tokenized_inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
    tokenized_inputs.to(device)

    with torch.no_grad():
        # output = model.generate(**tokenized_inputs, max_length=2000, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id, do_sample=True, temperature=0.1, top_p=0.95, max_new_tokens=512)
        output = model.generate(**tokenized_inputs, max_new_tokens=256, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id, do_sample=True, temperature=0.1, top_p=0.95)
   
    for j, o in enumerate(output):
        cnt+=1
        generated_text = tokenizer.decode(o, skip_special_tokens=True)
        # print(generated_text,"\n________________________________________\n\n")
        lines=generated_text.splitlines()
        linenumber = [i for i, line in enumerate(lines) if line.startswith("Q: ")][-1]
        answer = '\n'.join(lines[linenumber:])
        print(answer)
        generated_outputs.append({"input": inputs[j], "output": generated_text})
        break
    break


Processing questions:   0%|          | 0/636 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing questions:   0%|          | 0/636 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 392.00 MiB. GPU 0 has a total capacity of 44.34 GiB of which 20.19 MiB is free. Including non-PyTorch memory, this process has 44.31 GiB memory in use. Of the allocated memory 43.10 GiB is allocated by PyTorch, and 926.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print(inputs[0])

In [None]:
lines

In [18]:
import json
from tqdm import tqdm
model.eval()
generated_outputs=[]
batch_size=8
for i in tqdm(range(len(data_feedback["question"]),batch_size), desc="Processing questions"):
    # Create the prompt
    batch_questions = data_feedback["question"][i:i+batch_size]
    input_text = [fs8+"Question: "+q for q in batch_questions]
    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt")
    inputs.to(device)
    # Generate output
    with torch.no_grad():
        output = model.generate(**inputs, max_length=2000, num_return_sequences=1,  pad_token_id=tokenizer.pad_token_id)
    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    lines=generated_text.splitlines()
    linenumber = [i for i, line in enumerate(lines) if line.startswith("Question: ")][-1]
    answer = lines[linenumber:].join('\n')
    print(answer)
    # generated_outputs.append({"input":input_text,"output":})
    break

# with open("../outputs/gsm8k/generated_outputs.json", "w") as f:
#     json.dump(generated_outputs, f, indent=4)

    
    

Processing questions: 0it [00:00, ?it/s]


In [16]:
# input_text = create_prompt(data["question"][0])
# input_text
len(data_feedback["question"])

5082

In [3]:
data["answer"][0]

'The total revenue from the adults is 183 x $26 = $<<183*26=4758>>4758.\nThe ticket price for children is $26/2 = $<<26/2=13>>13.\nThe total revenue from the children is 28 x $13 = $<<28*13=364>>364.\nThe total revenue of the concert is $4758 + $364 = $<<4758+364=5122>>5122\n#### 5122'