Testing with the FinQA dataset

In [1]:
# 2. Download the raw data files from the official GitHub repository
!wget https://raw.githubusercontent.com/czyssrs/FinQA/main/dataset/train.json
!wget https://raw.githubusercontent.com/czyssrs/FinQA/main/dataset/dev.json

# 3. Load the JSON files into a Pandas DataFrame
import pandas as pd
import json
from datasets import Dataset

# Number of training examples kept for this minimal experiment.
SUBSET_SIZE = 500

# Errors are intentionally NOT caught here: the later cells need `train_dataset`
# and `eval_dataset`, so a failed load must stop the notebook at this cell instead
# of printing a message and failing later with an unrelated NameError.
with open('train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Load the data into a DataFrame
train_df = pd.DataFrame(train_data)

# Optional: Take a subset of the data for your minimal experiment
subset_df = train_df.head(SUBSET_SIZE)

print(f"Successfully loaded a subset of size {len(subset_df)} into a DataFrame.")

# 4. Preprocess and Format the Data
# The keys in the dataframe are: id, pre_text, post_text, table, qa
# The question is in row['qa']['question']
# The answer is in row['qa']['exe_ans']
# Only the question and the executed answer go into the prompt; pre_text,
# post_text and table are not included.
formatted_data = [
    # Format the data into the instruction prompt template
    {"text": f"### Question:\n{qa['question']}\n\n### Answer:\n{qa['exe_ans']}"}
    for qa in subset_df['qa']
]

# 5. Convert the Pandas DataFrame to a Hugging Face Dataset
formatted_df = pd.DataFrame(formatted_data)
finqa_dataset = Dataset.from_pandas(formatted_df)

# 6. Split the dataset into a small train and test set
# A fixed seed keeps the 90/10 split reproducible across runs.
finqa_dataset_splits = finqa_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = finqa_dataset_splits['train']
eval_dataset = finqa_dataset_splits['test']

print(f"Successfully formatted and split the dataset. Training on {len(train_dataset)} examples.")
print(train_dataset[0]['text'])

--2025-08-14 11:51:17--  https://raw.githubusercontent.com/czyssrs/FinQA/main/dataset/train.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78216616 (75M) [text/plain]
Saving to: ‘train.json’


2025-08-14 11:51:26 (21.9 MB/s) - ‘train.json’ saved [78216616/78216616]

--2025-08-14 11:51:26--  https://raw.githubusercontent.com/czyssrs/FinQA/main/dataset/dev.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10954658 (10M) [text/plain]
Saving to: ‘dev.json’


2025-08-14 11:51:27 (21.6 MB/s) - ‘dev.json’ saved [10954658/1095465

  from .autonotebook import tqdm as notebook_tqdm


Successfully loaded a subset of size 500 into a DataFrame.
Successfully formatted and split the dataset. Training on 450 examples.
### Question:
what was the total amount of unfunded commitments in millions as of the end of 2008 and 2007?

### Answer:
290.0


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "meta-llama/Llama-2-7b-hf"  # Or Llama-3-8b if you have access

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenizer is given an explicit pad token by reusing EOS, so that
# padding="max_length" works during tokenization.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization for QLoRA. Passing `load_in_4bit=True` straight to
# `from_pretrained` is deprecated in transformers; the supported route is a
# BitsAndBytesConfig handed over via `quantization_config`.
# NOTE(review): only `load_in_4bit` is set, so the library defaults for quant type
# and compute dtype apply, exactly as before. Consider bnb_4bit_quant_type="nf4"
# if you want the configuration from the QLoRA paper — confirm before changing.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model_qlora = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,  # drop this argument for standard LoRA/DoRA
    device_map="auto",
    torch_dtype="auto"
)

# Keep the model config in sync with the tokenizer's pad token.
model_qlora.config.pad_token_id = tokenizer.pad_token_id

print("Model and tokenizer loaded successfully.")

# Disabled alternative: un-quantized base model for standard LoRA/DoRA.
#model_ = AutoModelForCausalLM.from_pretrained(
#    model_name,
#    device_map="auto"
#)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

def tokenize_fn(example, max_length=256, tok=None):
    """Tokenize prompt text and attach causal-LM training labels.

    Args:
        example: Mapping with a "text" entry. With ``Dataset.map(batched=True)``
            this is a list of strings; a single string is also accepted.
        max_length: Fixed sequence length; inputs are truncated and padded to it.
        tok: Tokenizer callable to use. Defaults to the notebook-level
            ``tokenizer`` when omitted.

    Returns:
        The tokenizer output with an added "labels" entry. Labels are a copy of
        "input_ids" in which padded positions are replaced by -100.
    """
    tok = tokenizer if tok is None else tok
    encoded = tok(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    ids = encoded["input_ids"]
    mask = encoded["attention_mask"]

    # Without a "labels" column the Trainer has nothing to compute a loss from.
    # Padding is masked through the attention mask rather than the pad token id,
    # because the pad token is the EOS token here and a real EOS must stay a target.
    if isinstance(example["text"], str):
        labels = [t if m else -100 for t, m in zip(ids, mask)]
    else:
        labels = [
            [t if m else -100 for t, m in zip(row_ids, row_mask)]
            for row_ids, row_mask in zip(ids, mask)
        ]

    encoded["labels"] = labels
    return encoded

# Both splits are tokenized with the same settings: batches are passed to
# tokenize_fn and the raw "text" column is dropped from the result.
map_options = {"batched": True, "remove_columns": ["text"]}

tokenized_train = train_dataset.map(tokenize_fn, **map_options)
tokenized_eval = eval_dataset.map(tokenize_fn, **map_options)

print("Tokenization complete.")

In [None]:


"""lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,  # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"]  # Most common for LLaMA
)

model = get_peft_model(model_, lora_config)"""



In [None]:
"""from peft import DORAConfig

dora_config = DORAConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"]
)

model = get_peft_model(model_, dora_config)"""


In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Adapter configuration for the 4-bit base model.
# `max_length` was removed: it is not a LoraConfig field and makes the constructor
# raise a TypeError. Sequence length is controlled at tokenization time instead.
qlora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,  # Rank
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=["q_proj"]  # Most common for LLaMA, "v_proj"
)

# Recommended preparation step before attaching adapters to a k-bit quantized
# model (see the PEFT quantization guide).
# NOTE(review): by default this also turns on gradient checkpointing — confirm
# that is acceptable for your speed/memory budget.
model_prepared = prepare_model_for_kbit_training(model_qlora)

model = get_peft_model(model_prepared, qlora_config)
# Report how many parameters the adapter actually trains.
model.print_trainable_parameters()
print("PEFT model created.")

In [None]:
from transformers import TrainingArguments, Trainer


# Training arguments
# Effective train batch per device = 1 sample x 8 accumulation steps.
# NOTE(review): `evaluation_strategy="steps"` is set without `eval_steps`, so the
# library's default cadence applies — confirm it is what you want.
# NOTE(review): with one epoch over this small subset, `save_steps=500` may never
# be reached, in which case no intermediate checkpoint is written — verify.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./llama-peft-finetuned",
    report_to="none",
    # schedule
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    # batching
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # evaluation / checkpoint / logging cadence
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    logging_steps=50,
    # keep every dataset column when batches are built
    remove_unused_columns=False,
)

# Trainer
# NOTE(review): no data collator is passed, so the tokenized datasets must already
# carry a `labels` column for the loss to be computed — verify.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_eval,
    train_dataset=tokenized_train,
)

trainer.train()
print("Training complete.")


In [None]:
# Store the PEFT model and the tokenizer side by side in one directory.
adapter_dir = "./llama_peft_weights"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)


In [None]:
# `merge_and_unload` is a method on the PEFT-wrapped model, not a function that
# can be imported from peft; the previous `from peft import merge_and_unload`
# raised ImportError.
# It folds the adapter weights into the base model and returns the plain model.
# NOTE(review): the base model here was loaded in 4-bit, so the merged weights
# are subject to quantization rounding. For a full-precision merged checkpoint,
# reload the base model un-quantized and attach the saved adapter before merging
# — confirm which one you need.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./llama_finetuned_full")
