In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer

In [2]:
from huggingface_hub import login, notebook_login
login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/liam/.cache/huggingface/token
Login successful


In [3]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from datasets import Dataset
import os

# get all .md files in the statements/ directory
md_files = [os.path.join('statements', fn) for fn in os.listdir('statements') if fn.endswith('.md')]

# create loaders for each .md file
loaders = [UnstructuredMarkdownLoader(fn) for fn in md_files]

all_documents = []

for loader in loaders:
    print("Loading raw document..." + loader.file_path)
    md_doc = loader.load()
    updated_md_doc = filter_complex_metadata(md_doc)
    # Extract text content from each document
    text_content = [doc.page_content for doc in updated_md_doc]
    all_documents.extend(text_content)

# Create a Hugging Face dataset
training_set = Dataset.from_dict({"text": all_documents})

print(f"Number of documents: {len(training_set)}")

Loading raw document...statements/DUK_10-Q.md
Loading raw document...statements/META_10-K.md
Loading raw document...statements/K_10-K.md
Loading raw document...statements/MMM_10-K.md
Loading raw document...statements/AXP_10-K.md
Loading raw document...statements/JNJ_10-K.md
Loading raw document...statements/GM_10-K.md
Loading raw document...statements/ADBE_10-Q.md
Loading raw document...statements/CL_10-K.md
Loading raw document...statements/NVDA_10-Q.md
Loading raw document...statements/MSFT_10-Q.md
Loading raw document...statements/F_10-Q.md
Loading raw document...statements/NEE_10-K.md
Loading raw document...statements/VZ_10-K.md
Loading raw document...statements/AMZN_10-Q.md
Loading raw document...statements/CSCO_10-Q.md
Loading raw document...statements/AAPL_10-K.md
Loading raw document...statements/SCHW_10-Q.md
Loading raw document...statements/WFC_10-K.md
Loading raw document...statements/BA_10-K.md
Loading raw document...statements/JPM_10-K.md
Loading raw document...statements/

In [4]:
system_message = """
You are an assistant that summarizes and extracts key figures from financial statements. 
Use the following context to report key figures and datapoints that would be of use for other applications. Do this in a concise manner, 
focusing on the most important figures and datapoints.

Then answer these three questions in a concise manner:

1) What are the key takeaways from the context?
2) What key figures should focused on and what could be potentially drawn from them?
3) What are steps for further analysis?

Make sure to include specific details in your answers. here is the context:

{context}"""

# Define the create_conversation function
def create_conversation(sample):
    return {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": sample["text"]}
        ]
    }

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  torch_dtype=torch.float16,
  low_cpu_mem_usage=True,
  device_map="cuda",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"

tokenizer.pad_token = tokenizer.eos_token

  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'mode

In [6]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

In [7]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="llama3.1-8b-instruct-int4-edgar-summarization",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Adjusted for single GPU
    gradient_accumulation_steps=12,  # Adjusted to maintain effective batch size
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    push_to_hub=True,
    report_to="tensorboard",
)

In [8]:
from trl import SFTTrainer

max_seq_length = 512 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=training_set,
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
import gc

del model
del tokenizer

gc.collect()

torch.cuda.empty_cache()

In [10]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,1.6431
20,1.4801
30,1.4671
40,1.4327
50,1.4101
60,1.3654
70,1.4015
80,1.3398
90,1.4019
100,1.3712


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=297, training_loss=1.2720503116697575, metrics={'train_runtime': 18853.9311, 'train_samples_per_second': 0.189, 'train_steps_per_second': 0.016, 'total_flos': 5829242105364480.0, 'train_loss': 1.2720503116697575, 'epoch': 3.0})

In [11]:
trainer.save_model()

events.out.tfevents.1723093875.liamsdell.9873.0:   0%|          | 0.00/12.6k [00:00<?, ?B/s]