Data Curation Part 2
====================

Craft a dataset that is more balanced in terms of prices, combining the Appliances dataset investigated previously with many other product types such as Electronics and Automotive.

# Dependencies

In [None]:
# imports

import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from loaders import ItemLoader
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pickle
from items import Item

# Setup

In [None]:
# Environment: read the local .env file (its values take precedence over any
# already set in the process), then guarantee HF_TOKEN exists in os.environ,
# using a placeholder string when no real token is configured.

load_dotenv(override=True)
os.environ.setdefault('HF_TOKEN', 'your-key-if-not-using-env')

In [None]:
%matplotlib inline

## HuggingFace Token

**IMPORTANT**: the token requires read and write permissions.

Add `HF_TOKEN` to secrets, paste value and toggle on for this notebook.

In [None]:
# Authenticate with the HuggingFace Hub using the token held in the
# environment, and register it as a git credential as well.

hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=True)

# Load Datasets

## Step 1. Load in the same dataset as last time

> Completed Appliances with 28,625 datapoints in 0.3 mins

In [None]:
# Start from the single "Appliances" category used previously

appliances_loader = ItemLoader("Appliances")
items = appliances_loader.load()

## Step 2. Look for a familiar item

> How much does this cost to the nearest dollar?
>
> Door Pivot Block - Compatible Kenmore KitchenAid Maytag Whirlpool Refrigerator - Replaces - Quick DIY Repair Solution
>
> Pivot Block For Vernicle Mullion Strip On Door - A high-quality exact equivalent for part numbers and Compatibility with major brands - Door Guide is compatible with Whirlpool, Amana, Dacor, Gaggenau, Hardwick, Jenn-Air, Kenmore, KitchenAid, and Maytag. Quick DIY repair - Refrigerator Door Guide Pivot Block Replacement will help if your appliance door doesn't open or close. Wear work gloves to protect your hands during the repair process. Attentive support - If you are uncertain about whether the block fits your refrigerator, we will help. We generally put forth a valiant effort to guarantee you are totally
>
> Price is $17.00

In [None]:
# Print the full prompt of the first loaded item so it can be compared with the familiar example
print(items[0].prompt)

## Step 3. Scale up

Look at all datasets of all the items that you might find in a large home retail store - electrical, electronic, office and related, but not clothes / beauty / books.

> Loading dataset Automotive
> Completed Automotive with 911,688 datapoints in 11.3 mins
>
> Loading dataset Electronics
> Completed Electronics with 443,473 datapoints in 9.5 mins
>
> Loading dataset Office_Products
> Completed Office_Products with 240,394 datapoints in 3.6 mins
>
> Loading dataset Tools_and_Home_Improvement
> Completed Tools_and_Home_Improvement with 541,051 datapoints in 8.5 mins
>
> Loading dataset Cell_Phones_and_Accessories
> Completed Cell_Phones_and_Accessories with 238,869 datapoints in 7.1 mins
>
> Loading dataset Toys_and_Games
> Completed Toys_and_Games with 340,479 datapoints in 4.6 mins
>
> Loading dataset Appliances
> Completed Appliances with 28,625 datapoints in 0.3 mins
>
> Loading dataset Musical_Instruments
> Completed Musical_Instruments with 66,829 datapoints in 1.1 mins

> A grand total of 2,811,408 items

In [None]:
# Product categories to load, in loading order
dataset_names = [
    "Automotive", "Electronics", "Office_Products",
    "Tools_and_Home_Improvement", "Cell_Phones_and_Accessories",
    "Toys_and_Games", "Appliances", "Musical_Instruments",
]

In [None]:
# Load each category in turn and pool everything into one flat list.
# Expect a long wait here: the biggest datasets sit at the front of the
# list, so later iterations go faster.
items = []
for dataset_name in dataset_names:
    loader = ItemLoader(dataset_name)
    items += loader.load()

In [None]:
# Report the combined number of loaded items, formatted with thousands separators
print(f"A grand total of {len(items):,} items")

## Step 4. Plot the distribution of token counts again

<img src="./../images/Product-Pricer-Curation-Part-2-Token-Distribution.jpg" alt="Distribution of Tokens" />

In [None]:
# Histogram of item.token_count over every loaded item (10-wide bins);
# the title carries the mean and the maximum.

tokens = [entry.token_count for entry in items]

plt.figure(figsize=(15, 6))
plt.hist(tokens, bins=range(0, 300, 10), rwidth=0.7, color="skyblue")
plt.title(f"Token counts: Avg {sum(tokens)/len(tokens):,.1f} and highest {max(tokens):,}\n")
plt.ylabel('Count')
plt.xlabel('Length (tokens)')
plt.show()

## Step 5. Plot the distribution of prices

<img src="./../images/Product-Pricer-Curation-Part-2-Price-Distribution-All-Categories.jpg" alt="Distribution of Prices All Categories" />

In [None]:
# Histogram of item prices over every loaded item (10-wide bins);
# the title carries the mean and the maximum.

prices = [entry.price for entry in items]

plt.figure(figsize=(15, 6))
plt.hist(prices, bins=range(0, 1000, 10), rwidth=0.7, color="blueviolet")
plt.title(f"Prices: Avg {sum(prices)/len(prices):,.1f} and highest {max(prices):,}\n")
plt.ylabel('Count')
plt.xlabel('Price ($)')
plt.show()

## Step 6. Count number of items in each product type category

In [None]:
# Tally items per category. `categories` and `counts` are index-aligned
# because both are read from the same Counter in its iteration order.
category_counts = Counter(item.category for item in items)

categories = category_counts.keys()
counts = list(category_counts.values())

## Step 7. Plot number of items in each category

<img src="./../images/Product-Pricer-Curation-Part-2-Item-Distribution.jpg" alt="Distribution of Items Across Categories" />

In [None]:
# One bar per category, each annotated with its item count
plt.figure(figsize=(15, 6))
plt.bar(categories, counts, color="goldenrod")

plt.xlabel('Categories')
plt.ylabel('Count')
plt.title('How many in each category')

# Slant the category names so long ones stay readable
plt.xticks(rotation=30, ha='right')

# Write the formatted count just above the top of every bar
for position, count in enumerate(counts):
    plt.text(position, count, f"{count:,}", va='bottom', ha='center')

plt.show()

# Balance Dataset

Less heavily skewed towards cheap items, with an average price that's higher than \\$60. Balance out the categories, i.e. fewer Automotive items.

## Step 1. Create a dictionary of items with prices from \\$1 to \\$999

In [None]:
# Bucket the items by price rounded to the nearest whole number:
# key = rounded price, value = list of the items at that price.

slots = defaultdict(list)
for item in items:
    nearest_dollar = round(item.price)
    slots[nearest_dollar].append(item)

## Step 2. Create a dataset that evenly takes from the range of prices and gives more weight to items from certain categories

> There are 408,635 items in the sample

In [None]:
# Build "sample", a subset that draws more evenly across the price range.
# For each rounded price from 1 to 999:
#   - prices of 240 and above keep every item
#   - lower prices keep every item when the slot holds at most 1,200
#   - otherwise 1,200 items are drawn without replacement, weighted 5:1
#     in favour of categories other than Automotive
# Both random generators are seeded so reruns produce the same sample.

np.random.seed(42)
random.seed(42)
sample = []
for i in range(1, 1000):
    slot = slots[i]
    if i >= 240 or len(slot) <= 1200:
        sample.extend(slot)
        continue

    weights = np.array([5 if item.category != 'Automotive' else 1 for item in slot])
    weights = weights / weights.sum()
    selected_indices = np.random.choice(len(slot), size=1200, replace=False, p=weights)
    sample.extend(slot[index] for index in selected_indices)

print(f"There are {len(sample):,} items in the sample")

## Step 3. Plot the distribution of prices in sample

<img src="./../images/Product-Pricer-Curation-Part-2-Price-Distribution-Sample.jpg" alt="Distribution of Prices in Sample" />

Looking at the distribution, the dataset looks better: the average price has been raised and there is a smooth-ish population of prices.

In [None]:
# Histogram of prices within the sample (10-wide bins);
# the title carries the mean and the maximum.

prices = [float(entry.price) for entry in sample]

plt.figure(figsize=(15, 10))
plt.hist(prices, bins=range(0, 1000, 10), rwidth=0.7, color="darkblue")
plt.title(f"Avg {sum(prices)/len(prices):.2f} and highest {max(prices):,.2f}\n")
plt.ylabel('Count')
plt.xlabel('Price ($)')
plt.show()

## Step 4. Plot number of items in each category

<img src="./../images/Product-Pricer-Curation-Part-2-Item-Distribution-Sample.jpg" alt="Distribution of Items Across Categories in Sample" />

Automotive still in the lead, but improved somewhat

In [None]:
# The price distribution has been checked; now look at how the sample
# is spread across categories.

# Tally per category; `categories` and `counts` are index-aligned
category_counts = Counter(item.category for item in sample)

categories = category_counts.keys()
counts = list(category_counts.values())

# One bar per category
plt.figure(figsize=(15, 6))
plt.bar(categories, counts, color="lightgreen")

# Labels and title
plt.xlabel('Categories')
plt.ylabel('Count')
plt.title('How many in each category')

# Slant the category names so long ones stay readable
plt.xticks(rotation=30, ha='right')

# Write the formatted count just above the top of every bar
for position, count in enumerate(counts):
    plt.text(position, count, f"{count:,}", va='bottom', ha='center')

plt.show()

## Step 5. For another perspective, look at number of items in each category in a pie chart

<img src="./../images/Product-Pricer-Curation-Part-2-Item-Distribution-Sample-Pie.jpg" alt="Distribution of Items Across Categories in Sample" />

In [None]:
# Same category breakdown as a pie, showing each category's share as a
# whole-number percentage.

fig = plt.figure(figsize=(12, 10))
plt.pie(counts, labels=categories, autopct='%1.0f%%', startangle=90)
plt.title('Categories')

# A white disc over the middle turns the pie into a donut (optional)
centre_circle = plt.Circle((0,0), 0.70, fc='white')
fig.gca().add_artist(centre_circle)

# Equal axis scaling keeps the pie circular
plt.axis('equal')

plt.show()

## Step 6. How does the price vary with the character count of the prompt?

Dataset has been curated so just some final checks.

<img src="./../images/Product-Pricer-Curation-Part-2-Price-to-Character-Count-Prompt.jpg" alt="Price to Character Count Correlation" />

In [None]:
# Scatter of price against prompt length in characters, one dot per sample
# item, to see whether there is any obvious relationship.

prices = [entry.price for entry in sample]
sizes = [len(entry.prompt) for entry in sample]

plt.figure(figsize=(15, 8))
plt.scatter(sizes, prices, color="red", s=0.2)

# Title and axis labels
plt.title('Is there a simple correlation?')
plt.xlabel('Size')
plt.ylabel('Price')

plt.show()

## Step 7. Check token count for price in prompt

> How much does this cost to the nearest dollar?
>
> MonoRS Coilovers Lowering Kit Made For Scion FRS Fully Adjustable, Set of 4
> MonoRS Coilover damper kit by Godspeed Project are intermediate suspension upgrade setup for daily and Sunday club racing. Lowering your car with improved springs over factory and paired with Mono-tubo shocks with valving that allows 32 levels of rebound adjustment to improve handling without sacrifice comfort. Ride height can easily be adjusted by twisting the lower mount bracket. In order to keep weight gain at the minimum, most of attachments and accessories are CNC machined from billet aluminum. Koyo bearings are used when camber plate top mount is applicable depends on car models. To assure that our customers are getting high quality products, MonoRS coilovers are covered by 12 months limited warranty by the manufacturer from
>
> Price is $765.00<br>
> `[279, 14290, 505, 271, 7117, 374, 400, 22240, 13, 410]` > `[' the', ' manufacturer', ' from', '\n\n', 'Price', ' is', ' $', '765', '.', '00']`

**Observation**: An interesting thing about the Llama tokenizer is that every number from 1 to 999 gets mapped to 1 token, much as we saw with gpt-4o. The same is not true of qwen2, gemma and phi3, which all map individual digits to tokens. This does turn out to be a bit useful for our project, although it's not an essential requirement.

In [None]:
def report(item):
    """Print an item's prompt, then the last ten token ids of that prompt
    as encoded by Item.tokenizer, then those ten ids decoded one by one."""
    prompt = item.prompt
    tail = Item.tokenizer.encode(prompt)[-10:]
    print(prompt)
    print(tail)
    print(Item.tokenizer.batch_decode(tail))

In [None]:
# Show the prompt and the final ten tokens for the sample item at index 398000
report(sample[398000])

# Create Training, Test and Validation Datasets

Break down our data into a training, test and validation dataset.

It's typical to use 5%-10% of your data for testing purposes, but actually we have far more than we need at this point. We'll take 400,000 points for training, and we'll reserve 2,000 for testing, although we won't use all of them.

## Step 1. Split sample into training and test datasets

> Divided into a training set of 400,000 items and test set of 2,000 items

In [None]:
# Shuffle in place with a fixed seed, then take the first 400,000 items
# for training and the next 2,000 for testing.
random.seed(42)
random.shuffle(sample)
train, test = sample[:400_000], sample[400_000:402_000]
print(f"Divided into a training set of {len(train):,} items and test set of {len(test):,} items")

## Step 2. Validate training prompt

> How much does this cost to the nearest dollar?
>
> Delphi FG0166 Fuel Pump Module
> Delphi brings 80 years of OE Heritage into each Delphi pump, ensuring quality and fitment for each Delphi part. Part is validated, tested and matched to the right vehicle application Delphi brings 80 years of OE Heritage into each Delphi assembly, ensuring quality and fitment for each Delphi part Always be sure to check and clean fuel tank to avoid unnecessary returns Rigorous OE-testing ensures the pump can withstand extreme temperatures Brand Delphi, Fit Type Vehicle Specific Fit, Dimensions LxWxH 19.7 x 7.7 x 5.1 inches, Weight 2.2 Pounds, Auto Part Position Unknown, Operation Mode Mechanical, Manufacturer Delphi, Model FUEL PUMP, Dimensions 19.7
>
> Price is $227.00

In [None]:
# Eyeball the first training prompt
print(train[0].prompt)

## Step 3. Validate test prompt

> How much does this cost to the nearest dollar?
>
> OEM AC Compressor w/A/C Repair Kit For Ford F150 F-150 V8 & Lincoln Mark LT 2007 2008 - BuyAutoParts NEW
> As one of the world's largest automotive parts suppliers, our parts are trusted every day by mechanics and vehicle owners worldwide. This A/C Compressor and Components Kit is manufactured and tested to the strictest OE standards for unparalleled performance. Built for trouble-free ownership and 100% visually inspected and quality tested, this A/C Compressor and Components Kit is backed by our 100% satisfaction guarantee. Guaranteed Exact Fit for easy installation 100% BRAND NEW, premium ISO/TS 16949 quality - tested to meet or exceed OEM specifications Engineered for superior durability, backed by industry-leading unlimited-mileage warranty Included in this K
>
> Price is $

In [None]:
# Eyeball the first test prompt as produced by test_prompt()
print(test[0].test_prompt())

## Step 4. Plot the distribution of prices in the first 250 test points

<img src="./../images/Product-Pricer-Curation-Part-2-Price-Distribution-Test-Data.jpg" alt="Distribution of Prices in First 250 Test Points" />

In [None]:
# Histogram of prices for the first 250 test items (10-wide bins);
# the title carries the mean and the maximum.

prices = [float(entry.price) for entry in test[:250]]

plt.figure(figsize=(15, 6))
plt.hist(prices, bins=range(0, 1000, 10), rwidth=0.7, color="darkblue")
plt.title(f"Avg {sum(prices)/len(prices):.2f} and highest {max(prices):,.2f}\n")
plt.ylabel('Count')
plt.xlabel('Price ($)')
plt.show()

# Upload Dataset to HuggingFace Hub

## Step 1. Convert to prompts

In [None]:
# Text columns: training rows use the item's full prompt, test rows use
# the output of test_prompt().
train_prompts = [example.prompt for example in train]
test_prompts = [example.test_prompt() for example in test]

# Matching price columns, in the same order as the prompts
train_prices = [example.price for example in train]
test_prices = [example.price for example in test]

## Step 2. Create a Dataset from the lists

In [None]:
# Wrap the parallel prompt/price lists as two-column Datasets ("text",
# "price") and group them under "train" and "test" splits.

train_dataset = Dataset.from_dict(
    {"text": train_prompts, "price": train_prices}
)
test_dataset = Dataset.from_dict(
    {"text": test_prompts, "price": test_prices}
)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

## Step 3. Upload to HuggingFace hub

> CommitInfo(commit_url='https://huggingface.co/datasets/clanredhead/pricer-data/commit/e13a370affb747b226fa022b82244f67773ed404', commit_message='Upload dataset', commit_description='', oid='e13a370affb747b226fa022b82244f67773ed404', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/clanredhead/pricer-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='clanredhead/pricer-data'), pr_revision=None, pr_num=None)

In [None]:
# Running this cell pushes the dataset to the hub as a private dataset repo.
# Replace the username below with your own HF username before running it.

HF_USER = "clanredhead"
DATASET_NAME = f"{HF_USER}/pricer-data"
dataset.push_to_hub(DATASET_NAME, private=True)

# Save Data Locally

Pickle the training and test dataset so we don't have to execute all this code next time:

In [None]:
# Persist both splits to disk with pickle so the curation steps above do
# not need to be executed again next time.

for path, split in (('train.pkl', train), ('test.pkl', test)):
    with open(path, 'wb') as file:
        pickle.dump(split, file)

# Todos:

- Investigate the dataset more
- Confirm that the tokenizer tokenizes all 3 digit prices into 1 token