Predict Product Prices

In [3]:
# imports
import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict
import wandb
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
from datetime import datetime
import matplotlib.pyplot as plt

In [4]:
# Tokenizers

LLAMA_3_1 = "meta-llama/Meta-Llama-3.1-8B"

In [5]:
# Log in to HuggingFace

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [6]:
# Constants

BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "hamzabaccouri"

In [7]:
# Data

DATASET_NAME = f"{HF_USER}/software-data"
MAX_SEQUENCE_LENGTH = 182

In [8]:
# Run name for saving the model in the hub

RUN_NAME =  f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

In [9]:
# Hyperparameters for Training

EPOCHS = 2
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-3
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"

In [10]:
# Hyperparameters for QLoRA

LORA_R = 8
LORA_ALPHA = 16
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
LORA_DROPOUT = 0.1
QUANT_4_BIT = True

In [11]:
# Admin config

STEPS = 50
SAVE_STEPS = 5000
LOG_TO_WANDB = True

In [12]:
%matplotlib inline

In [13]:
# Log in to Weights & Biases
wandb_api_key = userdata.get('WANDB_API_KEY')
os.environ["WANDB_API_KEY"] = wandb_api_key
wandb.login()

# Configure Weights & Biases to record against our project
os.environ["WANDB_PROJECT"] = PROJECT_NAME
os.environ["WANDB_LOG_MODEL"] = "checkpoint" if LOG_TO_WANDB else "end"
os.environ["WANDB_WATCH"] = "gradients"

[34m[1mwandb[0m: Currently logged in as: [33mhamza_baccouri[0m ([33mhamza_baccouri-none[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']

README.md:   0%|          | 0.00/410 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.69M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11708 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2927 [00:00<?, ? examples/s]

In [15]:
len(train),len(test)

(11708, 2927)

In [16]:
train[0]

{'text': 'How much does this cost to the nearest dollar?\n\nLawn Mower Green Grass\nLawn Mower Green Grass - oakley cutting grass tractor simulator!The screen is the backyard of your house with green fresh grass, your fingers controls the lawn mower on it!Ever sunday dreamed about your own house?With green grass on the oakley backyard? Want to mow easily your cutting lawn at weekends and feel the smell of new-mown grass?Good job for lazy sunday!Play Lawn Mower Green Grass right now! Lawn Mower Green Grass - oakley cutting grass tractor simulator! The screen is the backyard of your house with green fresh grass, your fingers controls the lawn mower on it! Ever sunday dreamed about your own house? With green grass on the oakley backyard? Want to mow easily your cutting lawn at weekends and feel the\n\nPrice is $1.00',
 'price': 1.0}

In [17]:
test[0]

{'text': 'How much does this cost to the nearest dollar?\n\nFine Bass Tuner\nFine Bass Tuner is a chromatic tuner designed for bass instruments that allows you to quickly and accurately tune your instrument using the built-in mic in your Android device. Features nice and clean interface allowing you to quickly find your pitch.The Fine Bass Tuner is a version of my Fine Chromatic Tuner that was adjusted and designed for bass. I believe that it will work with any low to mid range instrument as it was fine adjusted for the 20 Hz - 300 Hz.This is automatic chromatic tuner that supports every screen size. Fine Bass Tuner works well with devices supporting 44100 16 bit audio input (almost every phone and tablet in mid - high price range) and 22050 16 bit audio input. It still works with the other possible settings\n\nPrice is $',
 'price': 2.0}

In [18]:
# Define the new subset sizes
train_subset_size = 250  # Size of the small training subset
test_subset_size = int(0.2 * train_subset_size)  # 20% of the training subset size for testing

# Sample small subsets for training and testing
small_train = train.shuffle(seed=42).select(range(min(train_subset_size, len(train))))
small_test = test.shuffle(seed=42).select(range(min(test_subset_size, len(test))))

print(f"Small training set size: {len(small_train)}")
print(f"Small test set size: {len(small_test)}")


Small training set size: 250
Small test set size: 50


In [19]:
if LOG_TO_WANDB:
  wandb.init(project=PROJECT_NAME, name=RUN_NAME)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [20]:
# pick the right quantization

if QUANT_4_BIT:
  quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
  )
else:
  quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.bfloat16
  )

In [21]:
# Load the Tokenizer and the Model

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Memory footprint: 5591.5 MB




Data Collator It's important that we ensure during Training that we are not trying to train the model to predict the description of products; only their price.

We need to tell the trainer that everything up to "Price is $" is there to give context to the model to predict the next token, but does not need to be learned.

The trainer needs to teach the model to predict the token(s) after "Price is $".


In [22]:
from trl import DataCollatorForCompletionOnlyLM
response_template = "Price is $"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)



We set up the configuration for Training

We need to create 2 objects:

A LoraConfig object with our hyperparameters for LoRA

An SFTConfig with our overall Training parameters


In [23]:
# First, specify the configuration parameters for LoRA

lora_parameters = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)

In [24]:
# Next, specify the general configuration parameters for training

train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    eval_strategy="no",
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=10,
    logging_steps=STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    report_to="wandb" if LOG_TO_WANDB else None,
    run_name=RUN_NAME,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    dataset_text_field="text",
    save_strategy="steps",
    hub_strategy="every_save",
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True
)

In [25]:
# And now, the Supervised Fine Tuning Trainer will carry out the fine-tuning
# Given these 2 sets of configuration parameters

fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=small_train,
    peft_config=lora_parameters,
    tokenizer=tokenizer,
    args=train_parameters,
    data_collator=collator
)

  fine_tuning = SFTTrainer(


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [26]:
# Fine-tune!
fine_tuning.train()

# Push our fine-tuned model to Hugging Face
fine_tuning.model.push_to_hub(PROJECT_RUN_NAME, private=True)
print(f"Saved to the hub: {PROJECT_RUN_NAME}")

Step,Training Loss
50,0.899
100,0.7906


[34m[1mwandb[0m: Adding directory to artifact (./pricer-2025-01-17_16.08.06/checkpoint-126)... Done. 0.3s


README.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved to the hub: pricer-2025-01-17_16.08.06


In [27]:
if LOG_TO_WANDB:
  wandb.finish()

0,1
train/epoch,▁▆█
train/global_step,▁▆█
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
total_flos,4008745437364224.0
train/epoch,2.0
train/global_step,126.0
train/grad_norm,1.7839
train/learning_rate,0.00011
train/loss,0.7906
train_loss,0.78694
train_runtime,1443.9346
train_samples_per_second,0.346
train_steps_per_second,0.087
