In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append("../modules")

from pathlib import Path
from transformers import TrainingArguments
from trl import SFTTrainer

from common import constants
from common.data import finqa
from training import models

  from .autonotebook import tqdm as notebook_tqdm


## Load Datasets

In [4]:
# Relative path to the directory that holds the dataset JSON files
# (train.json, test.json, private_test.json) loaded in this notebook.
ROOT_DATASET_DIR = Path("../dataset")

In [4]:
# Read the training split with the TRAINING scope and convert it to a
# Hugging Face dataset; the bare name on the last line displays its summary.
training_dataset = finqa.FinQADataset(
    scope=constants.Scope.TRAINING,
    data_path=ROOT_DATASET_DIR / "train.json",
).to_huggingface()
training_dataset

                                                                 

Dataset({
    features: ['text'],
    num_rows: 6251
})

In [5]:
# Show the first training prompt; print() is used so the embedded newlines
# are rendered as line breaks instead of "\n" escapes.
print(training_dataset[0]["text"])

### SYSTEM: You are a professional financial advisor. Your task is to read a financial report as text and numbers and do the proper math calculations to answer the given question.

### Human:
         ### START_FINANCIAL_REPORT
         ### PRE_TEXT:
         interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .
if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .
foreign currency exposure as more fully described in note 2i .
in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .
dollar-based exposures by entering into forward foreign currency exchange contracts .
the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .
currently , our largest foreign currency exposure is the euro 

In [6]:
# Build the evaluation split with the same TRAINING scope as the training data.
# NOTE(review): the validation set is read from "test.json" — confirm this is
# the intended file and that it is not also used for the final test metrics.
validation_dataset = finqa.FinQADataset(
    scope=constants.Scope.TRAINING,
    data_path=ROOT_DATASET_DIR / "test.json",
).to_huggingface()
validation_dataset

                                                                

Dataset({
    features: ['text'],
    num_rows: 1147
})

In [7]:
# Show the first validation prompt; print() is used so the embedded newlines
# are rendered as line breaks instead of "\n" escapes.
print(validation_dataset[0]["text"])

### SYSTEM: You are a professional financial advisor. Your task is to read a financial report as text and numbers and do the proper math calculations to answer the given question.

### Human:
         ### START_FINANCIAL_REPORT
         ### PRE_TEXT:
         entergy corporation and subsidiaries management 2019s financial discussion and analysis a result of the entergy louisiana and entergy gulf states louisiana business combination , results of operations for 2015 also include two items that occurred in october 2015 : 1 ) a deferred tax asset and resulting net increase in tax basis of approximately $ 334 million and 2 ) a regulatory liability of $ 107 million ( $ 66 million net-of-tax ) as a result of customer credits to be realized by electric customers of entergy louisiana , consistent with the terms of the stipulated settlement in the business combination proceeding .
see note 2 to the financial statements for further discussion of the business combination and customer credits .
re

## Train Model

In [8]:
import numpy as np
from transformers import EvalPrediction

def compute_metrics(eval_pred: "EvalPrediction"):
    """Compute token-level perplexity for a causal language model.

    Perplexity is ``exp`` of the mean next-token cross-entropy. The previous
    implementation exponentiated the mean of the raw predictions (logits),
    which is not a loss and therefore did not yield a perplexity.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``.
            Assumes ``predictions`` are raw logits shaped
            ``(batch, seq_len, vocab)`` (or a tuple whose first item is) and
            ``label_ids`` are token ids shaped ``(batch, seq_len)`` with
            ``-100`` marking positions to ignore — the usual Hugging Face
            causal-LM convention; confirm against the trainer in use.

    Returns:
        ``{"perplexity": float}``; the value is ``nan`` when no position is
        left to score.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs next to the logits.
        logits = logits[0]
    labels = np.asarray(eval_pred.label_ids)

    # Position t predicts token t + 1, so drop the last logit and first label.
    shift_logits = np.asarray(logits)[..., :-1, :].astype(np.float64)
    shift_labels = labels[..., 1:]

    scored = shift_labels != -100
    if not scored.any():
        return {"perplexity": float("nan")}

    # Numerically stable log-sum-exp over the vocabulary axis.
    peak = shift_logits.max(axis=-1, keepdims=True)
    log_norm = np.log(np.exp(shift_logits - peak).sum(axis=-1)) + peak[..., 0]

    # Replace ignored labels by a valid index; they are masked out below.
    gather_ids = np.where(scored, shift_labels, 0).astype(np.int64)
    target_logits = np.take_along_axis(
        shift_logits, gather_ids[..., None], axis=-1
    )[..., 0]

    mean_nll = (log_norm - target_logits)[scored].mean()
    return {"perplexity": float(np.exp(mean_nll))}

In [9]:
# Build the model to fine-tune together with its tokenizer and PEFT config.
model, tokenizer, peft_config = models.build_qlora_model(
    model_id="tiiuae/falcon-7b-instruct",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.32s/it]


In [5]:
# Relative paths passed to TrainingArguments as `output_dir` and `logging_dir`;
# RESULT_DIR is also where the fine-tuned checkpoint is read back from later.
RESULT_DIR = Path("../results/finqa")
LOGGING_DIR = Path("../logs/finqa")

In [11]:
# Hyper-parameters for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=str(RESULT_DIR),
    logging_dir=str(LOGGING_DIR),
    per_device_train_batch_size=3,  # increase this value if you have more VRAM
    gradient_accumulation_steps=6,  # 3 * 6 = 18 samples per optimizer step and device
    per_device_eval_batch_size=3,  # increase this value if you have more VRAM
    optim="paged_adamw_32bit",  # paged AdamW optimizer (QLoRA's optimizer-state paging)
    save_steps=40,
    logging_steps=10,
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=1,
    warmup_ratio=0.03,
    # The plain "constant" schedule has no warmup phase, so `warmup_ratio` above
    # was silently ignored; "constant_with_warmup" ramps up linearly and then
    # holds the learning rate constant.
    lr_scheduler_type="constant_with_warmup",
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
)

# TODO: Handle this error: "Token indices sequence length is longer than the specified maximum sequence length for this model (2302 > 2048). Running this sequence through the model will result in indexing errors"

In [12]:
# Gradient checkpointing is used by default but is not compatible with caching,
# so the cache is switched off before training starts.
model.config.use_cache = False

# Supervised fine-tuning on the "text" column of both datasets, with packing
# enabled and a maximum sequence length of 1024.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=True,
    compute_metrics=compute_metrics,
)
trainer.train()

Token indices sequence length is longer than the specified maximum sequence length for this model (2302 > 2048). Running this sequence through the model will result in indexing errors
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# trainer.evaluate()

## Test Model

In [6]:
# Read the private test split with the TESTING scope and convert it to a
# Hugging Face dataset; the bare name on the last line displays its summary.
test_dataset = finqa.FinQADataset(
    scope=constants.Scope.TESTING,
    data_path=ROOT_DATASET_DIR / "private_test.json",
).to_huggingface()
test_dataset

                                                   

Dataset({
    features: ['text'],
    num_rows: 919
})

In [7]:
# Preview the first test prompt. Index the row first, as the training and
# validation previews do, so only one example is fetched instead of the whole
# "text" column.
test_dataset[0]["text"]

"### SYSTEM: You are a professional financial advisor. Your task is to read a financial report as text and numbers and do the proper math calculations to answer the given question.\n\n### Human:\n         ### START_FINANCIAL_REPORT\n         ### PRE_TEXT:\n         entergy gulf states louisiana , l.l.c .\nmanagement 2019s financial discussion and analysis all debt and common and preferred equity/membership interest issuances by entergy gulf states louisiana require prior regulatory approval .\npreferred equity/membership interest and debt issuances are also subject to issuance tests set forth in its bond indentures and other agreements .\nentergy gulf states louisiana has sufficient capacity under these tests to meet its foreseeable capital needs .\nentergy gulf states louisiana 2019s receivables from the money pool were as follows as of december 31 for each of the following years: .\n         ### TABLE:\n         2011 | 2010 | 2009 | 2008\n( in thousands ) | ( in thousands ) | ( in th

In [9]:
# Rebuild the model and tokenizer, this time also passing the checkpoint
# directory under RESULT_DIR as `peft_model_id`. The third return value is
# not needed here and is discarded.
loaded_model, loaded_tokenizer, _ = models.build_qlora_model(
    peft_model_id=RESULT_DIR / "checkpoint-280",
    model_id="tiiuae/falcon-7b-instruct",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.21s/it]


In [11]:
# Generate up to 100 new tokens for the first test prompt and print the result.
# The row is indexed first so only that example is fetched, rather than
# materialising the entire "text" column to read one element.
print(
    models.prompt(
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        input_text=test_dataset[0]["text"],
        max_new_tokens=100,
    )
)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


### SYSTEM: You are a professional financial advisor. Your task is to read a financial report as text and numbers and do the proper math calculations to answer the given question.

### Human:
         ### START_FINANCIAL_REPORT
         ### PRE_TEXT:
         entergy gulf states louisiana, l.l.c.
management 2019s financial discussion and analysis all debt and common and preferred equity/membership interest issuances by entergy gulf states louisiana require prior regulatory approval.
preferred equity/membership interest and debt issuances are also subject to issuance tests set forth in its bond indentures and other agreements.
entergy gulf states louisiana has sufficient capacity under these tests to meet its foreseeable capital needs.
entergy gulf states louisiana 2019s receivables from the money pool were as follows as of december 31 for each of the following years:.
         ### TABLE:
         2011 | 2010 | 2009 | 2008
( in thousands ) | ( in thousands ) | ( in thousands ) | ( in th