In [1]:
# Set up latest torch instance with Python=3.9
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from datasets import load_dataset
from pathlib import Path
import json
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
torch.cuda.is_available()

False

### Select Model

In [12]:
root_dir = Path()
model_id = root_dir / "weights" / "weights" / "13Bf_hf" # location for model directory, must be in torch format
tokenizer = LlamaTokenizer.from_pretrained(model_id)

In [5]:
model = LlamaForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype=torch.float16) # device_map='auto' will use GPU if available

Loading checkpoint shards: 100%|██████████| 6/6 [00:35<00:00,  5.85s/it]


### Sys Prompt and Training Data

In [2]:
sys_prompt = """Pretend you are an observant Assistant, a helpful bot that takes course catalogue data and returns clean JSON object list. If available, the JSON objects should contain for each course in the text a "Number", "Title", "Description", "Prerequisite", and "Credits". There can be multiple Course Numbers and multiple Prerequisites, include all of them.  If the text is not a course description, return "[]"."""

In [3]:
train_dataset = load_dataset('json', data_files='data/train.json', split='train')  

# split train into train and validation
train_dataset = train_dataset.train_test_split(test_size=0.09, shuffle=True)

In [4]:
# The correct json format-
train_dataset['train'][0]

{'filename': '2008_2009_211',
 'output': '[\n    {\n        "Number": [\n            132\n        ],\n        "Title": "Post Bop Ensemble",\n        "Description": "A small jazz group (rhythm section plus two to four horns) specializing in post-1950s repertoire (Wayne Shorter, Chick Corea, etc.) as well as original compositions.",\n        "Prerequisites": "audition",\n        "Credits": "1"\n    },\n    {\n        "Number": [\n            133\n        ],\n        "Title": "Applied Lessons",\n        "Description": "Private instruction in an instru - ment or voice for music minors. Subject to avail - ability of staff. Lab fee required. May be repeated for credit.",\n        "Prerequisites": "successful completion of Level I Examination.",\n        "Credits": "1 OR 2"\n    },\n    {\n        "Number": [\n            149\n        ],\n        "Title": "Soph RecitalPerformance",\n        "Description": "Sem B.M. Candi-dates only",\n        "Prerequisites": "",\n        "Credits": "1"\n    

In [5]:
def formatting_func(example):
    text = f"""<s>[INST] <<SYS>> {sys_prompt} <</SYS>>

```{example['input']}``` 
Make sure to include prerequisites and exclude any 
non-course information. [/INST] {example['output']}
"""
    return text

In [6]:
# formatted data
print(formatting_func(train_dataset['train'][0]))

<s>[INST] <<SYS>> Pretend you are an observant Assistant, a helpful bot that takes course catalogue data and returns clean JSON object list. If available, the JSON objects should contain for each course in the text a "Number", "Title", "Description", "Prerequisite", and "Credits". There can be multiple Course Numbers and multiple Prerequisites, include all of them.  If the text is not a course description, return "[]". <</SYS>>

``` 1 - 6 
132 Post Bop Ensemble  A	small	jazz	group	(rhythm	
section	plus	two	to	four	horns)	specializing	in	post-1950’s	repertoire	(Wayne	Shorter,	Chick	Corea,	etc.)	as well as original compositions. Prerequisite : audition 
Credits: 1 
133 Applied Lessons  Private instruction in an instru -
ment or voice for music minors. Subject to avail -
ability	of	staff.	Lab	fee	required.	May	be	repeated	for	
credit. Prerequisite : successful completion of  Level I 
Examination. Credits: 1 OR 2 134 Applied Lessons  Private i nstruction in an instru-
ment	or	voice	for	mus

In [7]:
def generate_and_tokenize_prompt(prompt):
    return tokenizer(formatting_func(prompt)) # tokenize the data

In [13]:
tokenized_train_dataset = train_dataset["train"].map(generate_and_tokenize_prompt)
tokenized_validate_dataset = train_dataset["test"].map(generate_and_tokenize_prompt)

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Map: 100%|██████████| 249/249 [00:04<00:00, 61.38 examples/s]
Map: 100%|██████████| 25/25 [00:00<00:00, 63.45 examples/s]


In [14]:
##### Uncomment if you want to evaluate first ######

# eval_prompt = f"""<<SYS>>{sys_prompt}<</SYS>>

# [INST]Text: {tokenized_validate_dataset[10]['input']}[/INST]
# """

# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# model.eval()
# with torch.no_grad():
#     print(tokenizer.decode(model.generate(**model_input, max_new_tokens=1000)[0], skip_special_tokens=True))

In [15]:
tokenizer.pad_token = tokenizer.eos_token # llama quirk, have to do it
model.gradient_checkpointing_enable() # makes the training faster

### Set up Model Training with LoRa Config

In [16]:
model.train() # put the model in training mode

def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
    )
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=256, # tune this
        lora_alpha=512, # and this
        lora_dropout=0.05,
        # the target modules can also be tuned
        target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ]
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config
try:
    # create peft config
    model, lora_config = create_peft_config(model)
except:
    model, lora_config = create_peft_config(model)

False
'CUDASetup' object has no attribute 'cuda_available'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


trainable params: 1,010,892,800 || all params: 14,026,757,120 || trainable%: 7.20688888637433


In [17]:
from transformers import TrainerCallback
from contextlib import nullcontext
enable_profiler = False
output_dir = "tmp/llama-output"

# also tune-able
config = {
    'lora_config': lora_config,
    'learning_rate': 2.5e-5,
    'num_train_epochs': 2, # especially this one
    'gradient_accumulation_steps': 1,
    'per_device_train_batch_size': 1,
    'gradient_checkpointing': True,
}

# Set up profiler, and connect to wandb.ai
if enable_profiler:
    wait, warmup, active, repeat = 1, 1, 2, 1
    total_steps = (wait + warmup + active) * (1 + repeat)
    schedule =  torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat)
    profiler = torch.profiler.profile(
        schedule=schedule,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f"{output_dir}/logs/tensorboard"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True)
    
    class ProfilerCallback(TrainerCallback):
        def __init__(self, profiler):
            self.profiler = profiler
            
        def on_step_end(self, *args, **kwargs):
            self.profiler.step()

    profiler_callback = ProfilerCallback(profiler)
else:
    profiler = nullcontext()

### Train the Model, monitor on wandb

In [None]:
from transformers import default_data_collator, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Define training args
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    fp16=True,  # Use BF16 if available
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    evaluation_strategy="steps",
    eval_steps=10,
    optim="adamw_torch_fused",
    max_steps=total_steps if enable_profiler else -1,
    **{k:v for k,v in config.items() if k != 'lora_config'}
)

with profiler:
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset= tokenized_validate_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        callbacks=[profiler_callback] if enable_profiler else [],
    )
    # Start training
    trainer.train()

### Wrap-up

In [None]:
# merge the weights, otherwise it will only save as lora weights, which llama.cpp does not support atm
model = model.merge_and_unload()

In [20]:
# save the model
model.save_pretrained("../models/13Bf_finetuned_02")
tokenizer.save_pretrained("../models/13Bf_finetuned_02")

In [22]:
?LlamaForCausalLM.from_pretrained

[0;31mSignature:[0m
[0mLlamaForCausalLM[0m[0;34m.[0m[0mfrom_pretrained[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpretrained_model_name_or_path[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mos[0m[0;34m.[0m[0mPathLike[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0mmodel_args[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconfig[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mconfiguration_utils[0m[0;34m.[0m[0mPretrainedConfig[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mos[0m[0;34m.[0m[0mPathLike[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache_dir[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mos[0m[0;34m.[0m[0mPathLike[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_mismatched_sizes[0m[0;34

In [None]:
model = LlamaForCausalLM.from_pretrained("../models/13Bf_finetuned_02", )

In [21]:
# Evaluate the new model
eval_prompt = f"""<<SYS>>{sys_prompt}<</SYS>>

[INST]Text: {tokenized_validate_dataset[10]['input']}[/INST]
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=1000)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<<SYS>>You are a helpful Assistant, you take course catalogue raw text extract and return a clean JSON object list. If available, the JSON objects should contain for each course in the text a "Number", "Title", "Description", "Prerequisites", and "Credits". There can be multiple Course Numbers include all of them. If the text is not a course description, return '[]'.<</SYS>>

[INST]Text: oselective and
enantioselective processes. Prerequisite: CHEM 241.
CHEM 260. Advanced Physical Chemistry. 3 Credits.
Builds on the concepts from Introductory Physical Chemistry
(CHEM 165). The three major areas of quantum chemistry,
thermodynamics, and kinetics are extended in greater depth, and at
a higher level of mathematical rigor. Prerequisite: CHEM 165. Co-
requisites: CHEM 167 or MATH 121.
CHEM 267. Topics in Physical Chemistry. 1-3 Credits.
Selected topics of current interest in physical chemistry. See Schedule
of Courses for specific titles. May be repeated for credit with different
content. P