In [10]:
from datasets import load_dataset

df = load_dataset("parquet", data_files="/kaggle/input/financial-qa/0000.parquet")

In [11]:
print(df)

DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant'],
        num_rows: 454234
    })
})


In [12]:
df_train_test = df['train'].train_test_split(test_size=0.05)
df_val_test = df_train_test['train'].train_test_split(test_size=0.05)

train_dataset = df_val_test["train"]
val_dataset = df_val_test["test"]
test_dataset = df_train_test["test"]

print(f"Train Dataset: {train_dataset.num_rows}")
print(f"Validation Dataset: {val_dataset.num_rows}")
print(f"Test Dataset: {test_dataset.num_rows}")

Train Dataset: 409945
Validation Dataset: 21577
Test Dataset: 22712


In [13]:
print(train_dataset.column_names)

['system', 'user', 'assistant']


In [14]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = [f"question: {q}" for q in examples["user"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(examples["assistant"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [15]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/409945 [00:00<?, ? examples/s]

Map:   0%|          | 0/21577 [00:00<?, ? examples/s]

In [37]:
from transformers import T5ForConditionalGeneration, Trainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import os

# Load T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")
os.environ["WANDB_DISABLED"] = "false"

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_total_limit=2,
    logging_dir="./logs",
    eval_strategy="steps",
    eval_steps=2000,
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="steps",
    save_steps=2000,
    learning_rate=4e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    report_to="none",
    run_name="run 1"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)

In [34]:
# import shutil
# import os

# # Path to the output directory
# folder_path = "/kaggle/working/t5_finetuned"

# # Remove the folder if it exists
# if os.path.exists(folder_path):
#     shutil.rmtree(folder_path)
#     print(f"Deleted {folder_path}")
# else:
#     print(f"{folder_path} does not exist.")


Deleted /kaggle/working/t5_finetuned


In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)


In [39]:
trainer.train()



Step,Training Loss,Validation Loss
2000,1.0177,0.90503
4000,0.9396,0.855493
6000,0.8948,0.831168
8000,0.9221,0.814644
10000,0.873,0.804346
12000,0.8723,0.796109
14000,0.848,0.790042
16000,0.877,0.785454
18000,0.862,0.781721
20000,0.8584,0.779103


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=25622, training_loss=0.9067224126334456, metrics={'train_runtime': 14357.5368, 'train_samples_per_second': 28.553, 'train_steps_per_second': 1.785, 'total_flos': 5.548269480443904e+16, 'train_loss': 0.9067224126334456, 'epoch': 1.0})

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = "/kaggle/working/t5_finetuned/checkpoint-25622"

trained_model = T5ForConditionalGeneration.from_pretrained(model_path)

def generate_answer(question):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    trained_model.to(device)  # Move model to the same device
    input_text = f"question: {question}"

    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()} 

    # Generate response using trained_model (not model)
    outputs = trained_model.generate(
        **inputs,
        max_length=128,
        do_sample=True, 
        temperature=0.7, 
        top_k=50,  
        top_p=0.9,  
        repetition_penalty=1.2,  
        no_repeat_ngram_size=4,  
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

question = "what is an investment?"
answer = generate_answer(question)
print("Answer:", answer)


In [48]:
import shutil

# Define your folder path
folder_path = "/kaggle/working/t5_finetuned/checkpoint-25622"

# Create a zip archive of the folder
shutil.make_archive("/kaggle/working/t5_finetuned", 'zip', folder_path)


'/kaggle/working/t5_finetuned.zip'