<a href="https://colab.research.google.com/github/kmalhotra18/Product-Pricer/blob/main/Predict_Product_Prices_Training_your_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Product Prices



## Training your model!


In [2]:
# pip installs

# trl - Hugging Face library that provides the SFTTrainer class used below to fine-tune our model
# NOTE(review): versions are unpinned, so a fresh run may pull newer releases than the ones this notebook was executed with
!pip install -q datasets requests torch peft bitsandbytes transformers trl accelerate sentencepiece wandb matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# imports

import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict
import wandb
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from datetime import datetime
import matplotlib.pyplot as plt

In [4]:
# Constants

BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "kmalhotra18" # your HF name here!

# Data

DATASET_NAME = f"{HF_USER}/lite-data"
MAX_SEQUENCE_LENGTH = 182                     # Passed to SFTConfig as max_seq_length

# Run name for saving the model in the hub

RUN_NAME =  f"{datetime.now():%Y-%m-%d_%H.%M.%S}"       # Timestamp taken when this cell runs; re-running the cell produces a new run name
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"         # Project name plus timestamp; used as the trainer's output directory so each attempt (e.g. with different hyperparameters) is kept separate
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"        # Fully qualified Hub repo id (user/repo), passed to the trainer as hub_model_id

# Hyperparameters for QLoRA

LORA_R = 8 # 32 can be on heavier box
LORA_ALPHA = 16 # 2R
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
LORA_DROPOUT = 0.1
QUANT_4_BIT = True                            # True selects the 4-bit quantization config; False selects the 8-bit one

# Hyperparameters for Training

EPOCHS = 1 # you can do more epochs if you wish, but only 1 is needed - more is probably overkill
BATCH_SIZE = 1 # on an A100 box this can go up to 16                # Training examples per device per step; larger batches (4/8/16/32 ...) are mainly a performance choice
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-4       # Step size of each weight update: the forward pass predicts next-token probabilities, the loss compares prediction with actual, and backpropagation shifts the weights in proportion to this rate
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.03                           # Start with a lower learning rate and warm it up as training progresses
OPTIMIZER = "paged_adamw_32bit"               # Optimizer that applies the weight updates; see the optimizer notes further down

# Admin config - note that SAVE_STEPS is how often it will upload to the hub
# I've changed this from 5000 to 2000 so that you get more frequent saves

STEPS = 50                                    # Passed as logging_steps: how often training metrics are logged
SAVE_STEPS = 2000
LOG_TO_WANDB = True

%matplotlib inline

In [5]:
# Display the Hub repo id that this run will push to
HUB_MODEL_NAME

'kmalhotra18/pricer-2025-05-25_16.34.34'

# More on Optimizers

https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one#optimizer-choice

The most common is Adam or AdamW (Adam with Weight Decay).  
Adam achieves good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory footprint of the order of the number of model parameters.

In [6]:
# Log in to HuggingFace

# Read the token from the Colab secret named HF_TOKEN rather than hard-coding it in the notebook
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [7]:
# Log in to Weights & Biases using the API key held in the Colab secret WANDB_API_KEY
wandb_api_key = userdata.get('WANDB_API_KEY')
os.environ["WANDB_API_KEY"] = wandb_api_key
wandb.login()

# Configure Weights & Biases to record against our project
os.environ["WANDB_PROJECT"] = PROJECT_NAME
# Upload each saved checkpoint as a W&B artifact only when logging is enabled.
# "false" turns artifact uploads off entirely; "end" would still upload the final model.
os.environ["WANDB_LOG_MODEL"] = "checkpoint" if LOG_TO_WANDB else "false"
# Ask the W&B integration to track gradients during training
os.environ["WANDB_WATCH"] = "gradients"

[34m[1mwandb[0m: Currently logged in as: [33mmalhotra-kunal[0m ([33mkunal-malhotra[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
# Load the dataset identified by DATASET_NAME and pull out its train and test splits
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']

README.md:   0%|          | 0.00/412 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/9.84M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/780k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
# Reduce the training dataset to its first 20,000 points
# (comment this line out if you wish to train on the full train split instead)
train = train.select(range(20000))

In [10]:
# Confirm the size of the training set after the reduction
len(train)

20000

In [11]:
# Inspect one training example: a 'text' prompt that ends with the price, plus the numeric 'price'
train[0]

{'text': 'How much does this cost to the nearest dollar?\n\nand Replacement Range Cooktop Drip Pans fit GE, Hotpoint - Two 6 Inch and Two 8 Inch Pans (4 pieces)\nContents 2 x (6 inches) and 2 x (8 inches) bowls, 4 drip bowls total Compatibility This replacement kit works with GE, Hotpoint, Moffat, Monogram (GE), Profile (GE), RCA (GE), and Roper models prior to 1996. replaces 65975, replaces and 65974, 770169 Premium quality Drip bowls are made of durable high-quality material. It features a chrome finish, well-tested by the manufacturer. Durable, stick-free, easy to clean, and dishwasher safe. Ensure long-lasting and effective performance Easy to install Shut off electrical power, tilt the coil\n\nPrice is $12.00',
 'price': 11.99}

In [12]:
# Start the Weights & Biases run, named RUN_NAME inside project PROJECT_NAME, only when logging is enabled
if LOG_TO_WANDB:
  wandb.init(project=PROJECT_NAME, name=RUN_NAME)

## Now load the Tokenizer and Model

The model is "quantized" - we are reducing the precision to 4 bits.

In [13]:
# Pick the right quantization. Set QUANT_4_BIT = False to try 8-bit instead.

if QUANT_4_BIT:
  # 4-bit NF4 weights with double quantization; matrix multiplications are computed in bfloat16
  quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
  )
else:
  # 8-bit loading has no compute-dtype option: the compute dtype setting exists only
  # for the 4-bit path, so nothing beyond load_in_8bit is passed here
  quant_config = BitsAndBytesConfig(
    load_in_8bit=True
  )

In [14]:
# Load the Tokenizer and the Model

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token, and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the base model with the quantization config built in the previous cell
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep the generation config's pad token in step with the tokenizer's
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Report the loaded model's memory footprint in MB
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Memory footprint: 5591.5 MB


# Data Collator

It's important that we ensure during Training that we are not trying to train the model to predict the description of products; only their price.

We need to tell the trainer that everything up to "Price is $" is there to give context to the model to predict the next token, but does not need to be learned.

The trainer needs to teach the model to predict the token(s) after "Price is $".

There is a complicated way to do this by setting Masks, but luckily HuggingFace provides a super simple helper class to take care of this for us.

In [15]:
# NOTE(review): this import lives here rather than in the imports cell at the top;
# newer trl releases may no longer ship this class - confirm against the installed version
from trl import DataCollatorForCompletionOnlyLM
# Marker text for the start of the completion: everything up to and including it is context,
# and only the token(s) after it are meant to be learned
response_template = "Price is $"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# AND NOW

## We set up the configuration for Training

We need to create 2 objects:

A LoraConfig object with our hyperparameters for LoRA

An SFTConfig with our overall Training parameters

In [16]:
# First, specify the configuration parameters for LoRA
# (rank, alpha, dropout and the target modules all come from the constants cell)

lora_parameters = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)

# Next, specify the general configuration parameters for training

train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    eval_strategy="no",
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=10,
    logging_steps=STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,               # -1 means the run length is governed by num_train_epochs
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    # Use the explicit string "none" to switch reporting off: passing None is resolved
    # to "all" by TrainingArguments in the transformers releases this notebook targets,
    # which would keep reporting to W&B even with LOG_TO_WANDB = False
    report_to="wandb" if LOG_TO_WANDB else "none",
    run_name=RUN_NAME,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    dataset_text_field="text",
    save_strategy="steps",
    hub_strategy="every_save",  # push to the Hub each time a checkpoint is saved
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True
)

# And now, the Supervised Fine Tuning Trainer will carry out the fine-tuning
# Given these 2 sets of configuration parameters
# The latest version of trl is showing a warning about labels - please ignore this warning

fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=train,
    peft_config=lora_parameters,
    args=train_parameters,
    data_collator=collator
)

Converting train dataset to ChatML:   0%|          | 0/20000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


After some time, Google might stop your colab.

If your server is stopped, you can follow my colab here to resume from your last save

In [17]:
# Fine-tune!
fine_tuning.train()

# Push our fine-tuned model to Hugging Face.
# Use the fully qualified repo id (HUB_MODEL_NAME, i.e. HF_USER/PROJECT_RUN_NAME) so the
# final push lands in the same repo the trainer uploads checkpoints to via hub_model_id,
# rather than relying on the bare repo name resolving to the logged-in user's namespace.
fine_tuning.model.push_to_hub(HUB_MODEL_NAME, private=True)
print(f"Saved to the hub: {HUB_MODEL_NAME}")



Step,Training Loss
50,2.4531
100,1.7851
150,1.3545
200,1.5285
250,1.7127
300,1.446
350,1.3879
400,1.4636
450,1.3966
500,1.6186


[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-2000)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-4000)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-6000)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-8000)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-10000)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-12000)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-14000)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-16000)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-05-25_16.34.34/checkpoint-18000)... Done. 0.2s
[34m

README.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved to the hub: pricer-2025-05-25_16.34.34


In [18]:
# Close the Weights & Biases run that was opened with wandb.init (only when logging is enabled)
if LOG_TO_WANDB:
  wandb.finish()

0,1
train/epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▂▅▄▃▂▂▂█▂▃▃▂▃▃▃▁▃▄▂▃▃▂▅▂▂▃▅▄▅▅▃▂▄▃▅▁▂▂▁▄
train/learning_rate,██████▇▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▄▄▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁
train/loss,█▅▄▃▅▆▄▆▅▃▆▃▂▄▄▄▅▅▁▄▃▄▂▄▄▃▃▄▅▃▂▃▂▂▄▄▁▂▄▃
train/mean_token_accuracy,▂▁▅▃▄▃▁▂▂▃▅▁▂▄▃▂▄▅▂▃▃▃▃▄▅▃▃▃▆▅▂▆▅▆▄▃█▃▃▆
train/num_tokens,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇█

0,1
total_flos,1.6093451634506957e+17
train/epoch,1.0
train/global_step,20000.0
train/grad_norm,13.82741
train/learning_rate,0.0
train/loss,1.2947
train/mean_token_accuracy,0.70667
train/num_tokens,3570733.0
train_loss,1.35893
train_runtime,7174.9888
