In [1]:
import sys
sys.path.append("/home/ubuntu/pytorch_training/10_rag/src")

In [3]:

# https://www.datacamp.com/tutorial/fine-tuning-llama-3-1
import torch
from transformers import AutoModelForCausalLM, pipeline, BitsAndBytesConfig

from data.q_and_a.train_and_eval import TrainAndEval
from data.q_and_a.eval_with_answers import EvalWithAnswers
from data.q_and_a.tokenized import TokenizedCausal
from data.q_and_a.prompted import Prompted, to_transformers_dataset, index_to_answer
from q_and_a.prompts import prompt_with_answer
from models_.building.llama_tokenizer import load_tokenizer
from tqdm import tqdm

In [4]:
train = TrainAndEval("../../data/pubmed_QA_train.json")
with_answers = EvalWithAnswers(train)
prompted = Prompted(with_answers, prompt_with_answer)

#train_data = to_transformers_dataset(prompted)

test = TrainAndEval("../../data/pubmed_QA_eval.json")
test_with_answers = EvalWithAnswers(test)
prompted_test = Prompted(test_with_answers, prompt_with_answer)
#test_data = to_transformers_dataset(prompted_test)

In [5]:
# prompted[0]["text"] == prompted["text"][0]

In [6]:
tokenizer = load_tokenizer()

In [7]:
train_data_tokenized = TokenizedCausal(tokenizer, prompted)
test_data_tokenized = TokenizedCausal(tokenizer, prompted_test)

In [8]:
train_data_tokenized[0]

{'input_ids': tensor([128000,   2675,    527,  ..., 128001, 128001, 128001]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'labels': tensor([  2675,    527,    459,  ..., 128001, 128001,   -100])}

In [11]:
tokenizer.decode(train_data_tokenized[0]["input_ids"])

'<|begin_of_text|>You are an expert at answering multiple-choice questions. Given the context below, carefully read the question and select the single best answer from the options provided.\n\nContext:\nThe rate of action of calcium on the electrical and mechanical responses of the crayfish muscle fibers. The effects of sudden changes in external Ca concentration on the time courses of the changes in size of the action potential and of the associated contraction in a single crayfish muscle fiber were investigated. Procaine-HCl was added to the bathing solution to make the muscle fiber excitable. The concentration of the divalent cations (Ca and Mg) was high enough to keep the threshold potential constant. In Ca-free solution, neither action potential nor contraction was observed. When the external Ca concentration was suddenly increased from 0 to 14 mM, the full sized action potentials were generated within several seconds, but the tensions recovered slowly in an exponential time cours

In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

In [13]:
from huggingface_hub import login
login("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")

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
)

tokenizer = load_tokenizer()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

Device set to use cuda:0


In [28]:
def predict(test, model, tokenizer):
    y_pred = []
    text_generated = []
    categories = ["A", "B", "C", "D"]

    for i in tqdm(range(len(test))):
        prompt = test[i]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=2,
                        temperature=0.1)

        result = pipe(prompt)
        answer = result[0]['generated_text'].split("The correct answer is:")[-1].strip()
        text_generated.append(answer)
        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")

    return y_pred, text_generated

In [29]:
y_pred, answers = predict(train_data.select(range(100)), model, tokenizer)

NameError: name 'train_data' is not defined

In [28]:
answers

['C. The',
 'D. In',
 'B. Eight',
 'D. In',
 'C.',
 'C. The',
 'C. Long',
 'A. Str',
 'D. Th',
 'A. The',
 'B. In',
 'A',
 'A. Bac',
 'D.',
 'A. A',
 'C. The',
 'C. A',
 'D. A',
 'A',
 'D. T',
 'C. A',
 'B. A',
 'C. Dep',
 'C. The',
 'C. Hern',
 'A.',
 'A',
 'B. Th',
 'A. In',
 'B. C',
 'B. A',
 'A',
 'D. The',
 'B',
 'A. A',
 'C. The',
 'B. In',
 'C. A',
 'C. Patients',
 'C. Retro',
 'D. The',
 'C. Dip',
 'C. Three',
 'C. After',
 'B. From',
 'C. A',
 'D. Post',
 'A. Th',
 'D. Hist',
 'C. The',
 'C. The',
 'C. B',
 'B. Pro',
 'C. The',
 'A',
 'A. Ph',
 'B. Hal',
 'B',
 'D. A',
 'B. A',
 'D. A',
 'A. Three',
 'A. A',
 'B',
 'A. Two',
 'B. N',
 'C. Sup',
 'C. Pregnancy',
 'D.',
 'C',
 'B. In',
 'A',
 'A. HB',
 'B. Partial',
 'A. A',
 'D. Ser',
 'B',
 'B',
 'C. During',
 'B. Mon',
 'C. Surgical',
 'A. Men',
 'B. The',
 'B. The',
 'C. Two',
 'A. Children',
 'D. Cam',
 'B. In',
 'B. Bio',
 'C. The',
 'D. The',
 'D. In',
 'D. A',
 'D. A',
 'B. The',
 'B. Approximately',
 'D. The',
 'C. Heat

In [29]:
y_pred

['C',
 'D',
 'B',
 'D',
 'C',
 'C',
 'C',
 'A',
 'D',
 'A',
 'B',
 'A',
 'A',
 'D',
 'A',
 'C',
 'A',
 'A',
 'A',
 'D',
 'A',
 'A',
 'C',
 'C',
 'C',
 'A',
 'A',
 'B',
 'A',
 'B',
 'A',
 'A',
 'D',
 'B',
 'A',
 'C',
 'B',
 'A',
 'A',
 'C',
 'D',
 'C',
 'C',
 'A',
 'B',
 'A',
 'D',
 'A',
 'D',
 'C',
 'C',
 'B',
 'B',
 'C',
 'A',
 'A',
 'A',
 'B',
 'A',
 'A',
 'A',
 'A',
 'A',
 'B',
 'A',
 'B',
 'C',
 'A',
 'D',
 'C',
 'B',
 'A',
 'A',
 'A',
 'A',
 'D',
 'B',
 'B',
 'C',
 'B',
 'A',
 'A',
 'B',
 'B',
 'C',
 'A',
 'A',
 'B',
 'B',
 'C',
 'D',
 'D',
 'A',
 'A',
 'B',
 'A',
 'D',
 'A',
 'A',
 'C']

In [30]:
y_pred, answers = predict(train_data.select(range(100)), model, tokenizer)
y_true = []
for i in range(len(y_pred)):
    y_true.append(train_data[i]["label"])

In [31]:
y_true

['A',
 'D',
 'C',
 'B',
 'D',
 'D',
 'D',
 'C',
 'A',
 'C',
 'C',
 'C',
 'A',
 'D',
 'A',
 'B',
 'C',
 'A',
 'D',
 'D',
 'C',
 'A',
 'A',
 'C',
 'B',
 'D',
 'D',
 'C',
 'A',
 'B',
 'A',
 'D',
 'A',
 'B',
 'A',
 'B',
 'B',
 'B',
 'B',
 'A',
 'D',
 'D',
 'B',
 'D',
 'B',
 'D',
 'A',
 'C',
 'A',
 'C',
 'D',
 'B',
 'C',
 'A',
 'D',
 'A',
 'C',
 'C',
 'A',
 'A',
 'B',
 'A',
 'B',
 'B',
 'D',
 'D',
 'C',
 'A',
 'C',
 'A',
 'A',
 'A',
 'D',
 'D',
 'A',
 'B',
 'B',
 'D',
 'A',
 'A',
 'C',
 'A',
 'B',
 'C',
 'B',
 'D',
 'C',
 'D',
 'B',
 'C',
 'B',
 'A',
 'A',
 'C',
 'B',
 'A',
 'A',
 'A',
 'C',
 'B']

In [25]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def evaluate(y_true, y_pred):
    labels = ["A", "B", "C", "D"]
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data

    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

evaluate(y_true, y_pred)

NameError: name 'y_true' is not defined

In [16]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
modules = find_all_linear_names(model)
modules

['k_proj', 'o_proj', 'up_proj', 'gate_proj', 'down_proj', 'v_proj', 'q_proj']

In [17]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,  # Rank of LoRA matrices (lower = less memory)
    lora_alpha=16,
    target_modules=modules,  # Depends on model architecture
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 5,636,096 || all params: 1,241,450,496 || trainable%: 0.4540


In [19]:
from torch.utils.data import Subset

train_tokenized_subset = Subset(train_data_tokenized, range(0, 200))
test_tokenized_subset = Subset(test_data_tokenized, range(0, 20))

In [21]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    #train_dataset=train_data_tokenized,
    #eval_dataset=test_data_tokenized,
    train_dataset = Subset(train_data_tokenized, range(0, 200)),
    eval_dataset = Subset(test_data_tokenized, range(0, 20)),
    tokenizer=tokenizer,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  arr = np.array(obj)
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss
1,1.4787,1.068104
2,0.8956,0.843888
3,0.7696,0.777242
4,0.719,0.74738
5,0.69,0.730565
6,0.6685,0.719999


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


KeyboardInterrupt: 

In [30]:
y_pred, answers = predict(Subset(prompted_test, range(100)), model, tokenizer)
y_true = []
for i in range(len(y_pred)):
    y_true.append(train_data[i]["label"])

Device set to use cuda:0                                                                                                                                                                   | 0/100 [00:00<?, ?it/s]
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneratio

TypeError: can only concatenate str (not "tuple") to str

# First, load the data

We are going to load the data used for train or modify our classification task.

In [3]:
class Tokenized(Dataset):
    def __init__(self, tokenizer, dataset: Prompted, max_length=2000):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx: int):
        text, answer = self.dataset[idx]

        result = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",)
        labels = torch.tensor(answer, dtype=torch.long)

        return {
            "input_ids": result["input_ids"].squeeze(0),
            "attention_mask": result["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [4]:
MODEL_NAME = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer.pad_token = tokenizer.eos_token

train_dataset = TrainAndEval("../../data/pubmed_QA_train.json")
test_dataset = TrainAndEval("../../data/pubmed_QA_eval.json")
train_with_answers = EvalWithAnswers(train_dataset)
test_with_answers = EvalWithAnswers(test_dataset)
train_prompted= Prompted(train_with_answers, prompt)
test_prompted = Prompted(test_with_answers, prompt)
train_tokenized = Tokenized(tokenizer, train_prompted)
test_tokenized = Tokenized(tokenizer, test_prompted)

In [5]:
len(train_tokenized), len(test_tokenized)

(16890, 5000)

In [6]:
# per now use a subset
from torch.utils.data import Subset

train_tokenized = Subset(train_tokenized, range(0, 2000))
test_tokenized = Subset(test_tokenized, range(0, 200))

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=4,
    load_in_8bit=True,
    pad_token_id=tokenizer.pad_token_id,
)
model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128001)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear8bitLt(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)


In [8]:
for name, param in model.named_parameters():
    if "score" not in name:
        print(f"grad non required on:{name}")
        param.requires_grad = False
    else:
        print(f"requires grad: {name}")

grad non required on:model.embed_tokens.weight
grad non required on:model.layers.0.self_attn.q_proj.weight
grad non required on:model.layers.0.self_attn.k_proj.weight
grad non required on:model.layers.0.self_attn.v_proj.weight
grad non required on:model.layers.0.self_attn.o_proj.weight
grad non required on:model.layers.0.mlp.gate_proj.weight
grad non required on:model.layers.0.mlp.up_proj.weight
grad non required on:model.layers.0.mlp.down_proj.weight
grad non required on:model.layers.0.input_layernorm.weight
grad non required on:model.layers.0.post_attention_layernorm.weight
grad non required on:model.layers.1.self_attn.q_proj.weight
grad non required on:model.layers.1.self_attn.k_proj.weight
grad non required on:model.layers.1.self_attn.v_proj.weight
grad non required on:model.layers.1.self_attn.o_proj.weight
grad non required on:model.layers.1.mlp.gate_proj.weight
grad non required on:model.layers.1.mlp.up_proj.weight
grad non required on:model.layers.1.mlp.down_proj.weight
grad non

In [9]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,  # Rank of LoRA matrices (lower = less memory)
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Depends on model architecture
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    modules_to_save=["score"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 860,160 || all params: 1,236,682,752 || trainable%: 0.0696


In [10]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 5.65 GiB of which 64.00 MiB is free. Including non-PyTorch memory, this process has 5.56 GiB memory in use. Of the allocated memory 5.39 GiB is allocated by PyTorch, and 63.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [11]:
print("CUDA available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0))
print("Supports FP16:", torch.cuda.get_device_capability(0))

CUDA available: True
GPU Name: NVIDIA H100 80GB HBM3
Supports FP16: (9, 0)


In [12]:
trainer.save_model("./last-checkpoint/trainer")
tokenizer.save_pretrained("./last-checkpoint/tokenizer")

('./last-checkpoint/tokenizer/tokenizer_config.json',
 './last-checkpoint/tokenizer/special_tokens_map.json',
 './last-checkpoint/tokenizer/tokenizer.json')

In [13]:
model.save_pretrained("./last-checkpoint/model")

In [12]:
import torch
from pathlib import Path

from transformers import AutoModelForSequenceClassification,AutoTokenizer

from data.q_and_a.train_and_eval import TrainAndEval
from data.q_and_a.eval_with_answers import EvalWithAnswers

from models_.building.llama_tokenizer import  load_tokenizer

from data.pubmed.from_json import FromJsonDataset
from data.pubmed.contents import ContentsDataset

from storage.faiss_ import FaissStorage

from rag.tokenization.llama import build_tokenizer_function
from rag.quering import build_querier
import os
from q_and_a.forward import build_enhanced_forwarder
from q_and_a.prompts import prompt
from q_and_a.picking.from_logits import build_from_logits
from q_and_a.eval import evaluate
from q_and_a.forward import build_forwarder

train = TrainAndEval("../../data/pubmed_QA_train.json")
evaluationData = TrainAndEval("../../data/pubmed_QA_eval.json")
evaluateWithAnswers = EvalWithAnswers(evaluationData)

augmented_data = FromJsonDataset(json_file="../../data/pubmed_500K.json")
augmented_data = ContentsDataset(augmented_data)

from huggingface_hub import notebook_login
notebook_login()

storage = FaissStorage(
    dimension=800,
)

storage.load("../../outputs/store/pubmed_500K.index")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
tokenizer_rag = load_tokenizer()
tokenizer_fn = build_tokenizer_function(tokenizer_rag)

querier = build_querier(storage, augmented_data, tokenizer_fn)
storage = FaissStorage(
    dimension=800,
)

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [15]:
model.eval()
forward = build_forwarder(
    model,
    tokenizer,
    querier,
    k_augmentations=1,
    prompt_builder=prompt,
    device=device,
)

forward_and_get_arg_max = lambda question, options: forward(
    question,
    options=options,
)

def pick_from_classifier(out):
    return torch.argmax(out.logits[0])

accuracy = evaluate(
    forward_fn=forward_and_get_arg_max,
    picker_fn=pick_from_classifier,
    eval_dataset=evaluateWithAnswers,
)

print(f"Accuracy: {accuracy:.2f}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB. GPU 0 has a total capacity of 5.65 GiB of which 8.00 MiB is free. Including non-PyTorch memory, this process has 5.62 GiB memory in use. Of the allocated memory 5.44 GiB is allocated by PyTorch, and 67.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [19]:
from q_and_a.predict import predict
from data.q_and_a.test_questions import TestQuestions

In [20]:
test_data = TestQuestions("../../data/pubmed_QA_test_questions.json")

In [21]:
responses = predict(
    forward_fn=forward_and_get_arg_max,
    picker_fn=pick_from_classifier,
    eval_dataset=test_data,
)

responses

Processed 0.1%
Processed 0.2%
Processed 0.3%
Processed 0.4%
Processed 0.5%
Processed 0.6%
Processed 0.7%
Processed 0.8%
Processed 0.9%


[(0, tensor(1, device='cuda:0')),
 (1, tensor(3, device='cuda:0')),
 (2, tensor(2, device='cuda:0')),
 (3, tensor(1, device='cuda:0')),
 (4, tensor(2, device='cuda:0')),
 (5, tensor(1, device='cuda:0')),
 (6, tensor(2, device='cuda:0')),
 (7, tensor(2, device='cuda:0')),
 (8, tensor(1, device='cuda:0')),
 (9, tensor(0, device='cuda:0')),
 (10, tensor(3, device='cuda:0')),
 (11, tensor(3, device='cuda:0')),
 (12, tensor(2, device='cuda:0')),
 (13, tensor(3, device='cuda:0')),
 (14, tensor(3, device='cuda:0')),
 (15, tensor(1, device='cuda:0')),
 (16, tensor(3, device='cuda:0')),
 (17, tensor(2, device='cuda:0')),
 (18, tensor(0, device='cuda:0')),
 (19, tensor(2, device='cuda:0')),
 (20, tensor(1, device='cuda:0')),
 (21, tensor(3, device='cuda:0')),
 (22, tensor(2, device='cuda:0')),
 (23, tensor(1, device='cuda:0')),
 (24, tensor(1, device='cuda:0')),
 (25, tensor(1, device='cuda:0')),
 (26, tensor(3, device='cuda:0')),
 (27, tensor(0, device='cuda:0')),
 (28, tensor(2, device='cuda:0

In [22]:
import pandas as pd

In [27]:
responses_with_ids = []

for i in range(len(responses)):
    responses_with_ids.append((test_data[i]["id"], responses[i][1].item()))

In [28]:
dataset = pd.DataFrame(responses_with_ids, columns=["ID", "answer"])

In [29]:
dataset.head()

Unnamed: 0,ID,answer
0,26,1
1,29,3
2,37,2
3,70,1
4,109,2


In [30]:
dataset.to_csv("predictions.csv", index=False)