In [18]:
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

# Preprocess dataset

In [67]:
#Load the dataset from the HuggingFace Hub
rd_ds = load_dataset("xiyuez/red-dot-design-award-product-description")

#Convert to pandas dataframe for convenient processing
rd_df = pd.DataFrame(rd_ds['train'])

rd_df.head()


Unnamed: 0,product,category,description,text
0,Biamp Rack Products,Digital Audio Processors,"“High recognition value, uniform aesthetics an...",Product Name: Biamp Rack Products;\n\nProduct ...
1,V33,Video Camera,The V33 livestreaming video camera ensures hig...,Product Name: V33;\n\nProduct Category: Video ...
2,HP LaserJet 5000-6000 and E700-E800 Series MFPs,Multi-Function Printers,The HP LaserJet 5000 to 6000 Series and E700 t...,Product Name: HP LaserJet 5000-6000 and E700-E...
3,Meaco Arete One 20L Dehumidifier,Heating and Air Conditioning Technology,The Meaco Arete One Dehumidifier is characteri...,Product Name: Meaco Arete One 20L Dehumidifier...
4,théATRE Glass Container for Loose Leaf Tea,Food Containers,The design and colouring of the théATRE Glass ...,Product Name: théATRE Glass Container for Loos...


In [68]:
#Get a 5000 sample subset for fine-tuning purposes
rd_df_sample = rd_df.sample(n=1000, random_state=42)

In [69]:

#Combine the two attributes into an instruction string
rd_df_sample['instruction'] = 'Create a detailed description for the following product: '+ rd_df['product']+', belonging to category: '+ rd_df['category']

rd_df_sample = rd_df_sample[['instruction', 'description']]


In [70]:
#Define template and format data into the template for supervised fine-tuning
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

{}

### Response:\n"""

rd_df_sample['prompt'] = rd_df_sample["instruction"].apply(lambda x: template.format(x))


In [71]:
rd_df_sample.rename(columns={'description': 'response'}, inplace=True)
rd_df_sample['response'] = rd_df_sample['response'] + "\n### End"
rd_df_sample = rd_df_sample[['prompt', 'response']]

rd_df_sample['text'] = rd_df_sample["prompt"] + rd_df_sample["response"]
rd_df_sample.drop(columns=['prompt', 'response'], inplace=True)
rd_df_sample.head()

Unnamed: 0,text
18952,Below is an instruction that describes a task....
12584,Below is an instruction that describes a task....
5702,Below is an instruction that describes a task....
20503,Below is an instruction that describes a task....
2480,Below is an instruction that describes a task....


In [72]:
dataset = Dataset.from_pandas(rd_df_sample).train_test_split(test_size = 0.005, seed=43)

## Examples of results when model is not finetuned and has no relevant information in the training dataset

In [20]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

model_path = 'openlm-research/open_llama_3b_v2'
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(
model_path) #, load_in_8bit=True, #device_map='auto'

#Pass in a prompt and infer with the model
prompt = 'Q: Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse\nA:'
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

generation_output = model.generate(
input_ids=input_ids, max_new_tokens=128
)

print(tokenizer.decode(generation_output[0]))

<s>Q: Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse
A: The product is a mouse that has a smooth surface. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used for computer use. It is a mouse that is used


In [21]:
prompt= """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse

### Response:"""
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

generation_output = model.generate(
input_ids=input_ids, max_new_tokens=128
)

print(tokenizer.decode(generation_output[0]))

<s>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse

### Response:
Corelogic Smooth Mouse is a mouse that is designed to be used by people who have a hard time using a mouse. The mouse is designed to be used by people who have a hard time using a mouse. The mouse is designed to be used by people who have a hard time using a mouse. The mouse is designed to be used by people who have a hard time using a mouse. The mouse is designed to be used by people who have a hard time using a mouse. The mouse is designed to be used by people who have a hard time using a mouse. The mouse is designed to be used by people who have a hard


# Finetuning setup

In [98]:
from peft import LoraConfig
...
...

#If only targeting attention blocks of the model
# target_modules = ["q_proj", "v_proj"]

#If targeting all linear layers
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
# target_modules = ['down_proj','up_proj','lm_head']

lora_config = LoraConfig(
r=2,
target_modules = target_modules,
lora_alpha=8,
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",)

In [101]:
from transformers import TrainingArguments

base_dir = "."

per_device_train_batch_size = 2
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"

training_args = TrainingArguments(
    output_dir=base_dir,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs = 1.0,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [75]:
# from transformers import BitsAndBytesConfig

# nf4_config = BitsAndBytesConfig(
#   load_in_4bit=True,
#   bnb_4bit_quant_type="nf4",
#   bnb_4bit_use_double_quant=True,
#   bnb_4bit_compute_dtype=torch.bfloat16
# )

In [83]:
model_path = 'openlm-research/open_llama_3b_v2'

tokenizer = LlamaTokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.padding_side = 'right'

model = LlamaForCausalLM.from_pretrained(
    model_path )#, device_map='auto', quantization_config=nf4_config)

In [99]:
from peft import get_peft_model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,248,640 || all params: 3,429,722,240 || trainable%: 0.09472020684683784


In [94]:
%env PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

env: PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0


In [102]:
from trl import SFTTrainer
trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    eval_dataset = dataset['test'],
    dataset_text_field="text",
    max_seq_length=256,
    args=training_args,
)

# Initiate the training process
#with mlflow.start_run(run_name= 'temp'):
trainer.train()

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 995/995 [00:00<00:00, 9431.96 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 2312.70 examples/s]


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 