In [1]:
from transformers import TrainingArguments
from trl import SFTTrainer

In [1]:
from huggingface_hub import login, notebook_login
notebook_login()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/liam/.cache/huggingface/token
Login successful


In [3]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from datasets import Dataset
import os

# get all .md files in the statements/ directory
md_files = [os.path.join('statements', fn) for fn in os.listdir('statements') if fn.endswith('.md')]

# create loaders for each .md file
loaders = [UnstructuredMarkdownLoader(fn) for fn in md_files]

all_documents = []

for loader in loaders:
    print("Loading raw document..." + loader.file_path)
    md_doc = loader.load()
    updated_md_doc = filter_complex_metadata(md_doc)
    # Extract text content from each document
    text_content = [doc.page_content for doc in updated_md_doc]
    all_documents.extend(text_content)

# Create a Hugging Face dataset
training_set = Dataset.from_dict({"text": all_documents})

print(f"Number of documents: {len(training_set)}")

Loading raw document...statements/DUK_10-Q.md
Loading raw document...statements/META_10-K.md
Loading raw document...statements/K_10-K.md
Loading raw document...statements/MMM_10-K.md
Loading raw document...statements/AXP_10-K.md
Loading raw document...statements/JNJ_10-K.md
Loading raw document...statements/GM_10-K.md
Loading raw document...statements/ADBE_10-Q.md
Loading raw document...statements/CL_10-K.md
Loading raw document...statements/NVDA_10-Q.md
Loading raw document...statements/MSFT_10-Q.md
Loading raw document...statements/F_10-Q.md
Loading raw document...statements/NEE_10-K.md
Loading raw document...statements/VZ_10-K.md
Loading raw document...statements/AMZN_10-Q.md
Loading raw document...statements/CSCO_10-Q.md
Loading raw document...statements/AAPL_10-K.md
Loading raw document...statements/SCHW_10-Q.md
Loading raw document...statements/WFC_10-K.md
Loading raw document...statements/BA_10-K.md
Loading raw document...statements/JPM_10-K.md
Loading raw document...statements/

In [4]:
system_message = """
You are an assistant that summarizes and extracts key figures from financial statements.
Use the following context to report key figures and datapoints that would be of use for other applications. Do this in a concise manner,
focusing on the most important figures and datapoints.

Then answer these three questions in a concise manner:

1) What are the key takeaways from the context?
2) What key figures should focused on and what could be potentially drawn from them?
3) What are steps for further analysis?

Make sure to include specific details in your answers. here is the context:

{context}"""

# Define the create_conversation function
def create_conversation(sample):
    return {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": sample["text"]}
        ]
    }

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/Phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  torch_dtype="auto",
  device_map="cuda",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

In [7]:
args = TrainingArguments(
    evaluation_strategy="no",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    learning_rate=1e-4,
    fp16=True,
    max_steps=-1,
    num_train_epochs=2,
    save_strategy="epoch",
    logging_steps=10,
    output_dir="phi3-mini-instruct-4k-edgar-summarization",
    optim="paged_adamw_32bit",
    lr_scheduler_type="linear"
)



In [8]:
max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    args=args,
    peft_config=peft_config,
    train_dataset=training_set,
    eval_dataset=None,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [9]:
import gc

del model
del tokenizer

gc.collect()

torch.cuda.empty_cache()

In [10]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
10,1.6208
20,1.4717
30,1.5948
40,1.4709
50,1.5246
60,1.687
70,1.4514
80,1.6363
90,1.5784
100,1.5023




TrainOutput(global_step=2982, training_loss=1.3935468922038754, metrics={'train_runtime': 25209.6464, 'train_samples_per_second': 0.118, 'train_steps_per_second': 0.118, 'total_flos': 3.411585236415283e+16, 'train_loss': 1.3935468922038754, 'epoch': 2.0})

In [6]:
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_id="liamjdavis/phi3-mini-instruct-4k-edgar-summarization", repo_type="model")
api.upload_folder(
    folder_path="phi3-mini-instruct-4k-edgar-summarization",
    repo_id="liamjdavis/phi3-mini-instruct-4k-edgar-summarization",
    repo_type="model"
)

Upload 31 LFS files:   0%|          | 0/31 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1723171451.liamsdell.71346.0:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

events.out.tfevents.1723172019.liamsdell.75804.0:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

events.out.tfevents.1723172200.liamsdell.76857.0:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

events.out.tfevents.1723172371.liamsdell.888.0:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

events.out.tfevents.1723175812.liamsdell.6152.0:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

events.out.tfevents.1723176022.liamsdell.876.0:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

events.out.tfevents.1723176223.liamsdell.2324.0:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

events.out.tfevents.1723176371.liamsdell.3483.0:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

events.out.tfevents.1723176695.liamsdell.5915.0:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

events.out.tfevents.1723176910.liamsdell.6229.0:   0%|          | 0.00/5.80k [00:00<?, ?B/s]

events.out.tfevents.1723177202.liamsdell.9467.0:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

events.out.tfevents.1723178416.liamsdell.1183.0:   0%|          | 0.00/5.80k [00:00<?, ?B/s]

events.out.tfevents.1723178950.liamsdell.4809.0:   0%|          | 0.00/5.80k [00:00<?, ?B/s]

events.out.tfevents.1723181604.liamsdell.1336.0:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

events.out.tfevents.1723181845.liamsdell.3836.0:   0%|          | 0.00/5.80k [00:00<?, ?B/s]

events.out.tfevents.1723182230.liamsdell.5907.0:   0%|          | 0.00/68.8k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/liamjdavis/phi3-mini-instruct-4k-edgar-summarization/commit/fb6b8c31e9335044e27b4d9eb9e57e0c7e2ca1c8', commit_message='Upload folder using huggingface_hub', commit_description='', oid='fb6b8c31e9335044e27b4d9eb9e57e0c7e2ca1c8', pr_url=None, pr_revision=None, pr_num=None)