In [1]:
import pandas as pd

In [2]:
scimeta = pd.read_json('scimeta_corpus.json', orient='records', lines=True)
seqdata = pd.read_json('sci_seq_3rev.json', orient='records', lines=True)

In [3]:
seqdata.shape, scimeta.shape

((72194, 9), (5326, 5))

In [4]:
seqdata['reviewerID'].nunique(), seqdata['asin'].nunique()

(10970, 5326)

In [6]:
# import matplotlib.pyplot as plt

# # Calculate the histogram of the number of reviews per reviewer
# reviewer_counts = seqdata['reviewerID'].value_counts()

# # Plot the histogram
# reviewer_counts.hist(bins=30)
# plt.xlabel('Number of Reviews')
# plt.ylabel('Number of Reviewers')
# plt.title('Histogram of Reviews per Reviewer')
# plt.show()

# # Calculate the fraction of reviewers with less than 3 reviews; worth making it core 4 actually
# fraction_with_3_reviews = (reviewer_counts < 4).mean()
# print(f"Fraction of reviewers with less than 4 reviews: {fraction_with_3_reviews:.2%}")


# # Calculate the histogram of the number of reviewers per ASIN
# asin_counts = seqdata['asin'].value_counts()

# # Plot the histogram
# asin_counts.hist(bins=30)
# plt.xlabel('Number of Reviewers')
# plt.ylabel('Number of ASINs')
# plt.title('Histogram of Reviewers per ASIN')
# plt.show()

# # Calculate the fraction of ASINs with less than 3 reviewers
# fraction_with_3_reviewers = (asin_counts < 4).mean()
# print(f"Fraction of ASINs with less than 4 reviewers: {fraction_with_3_reviewers:.2%}")

In [7]:
# meta_asins = scimeta['asin'] # we known these are all unique

# # check if sci5's unique asins appear in metadata
# unique_asins_sci5 = set(seqdata['asin'].unique())
# unique_asins_meta = set(meta_asins)

# # Calculate the fraction of ASINs in sci5 that are in the metadata
# fraction_in_meta = len(unique_asins_sci5.intersection(unique_asins_meta)) / len(unique_asins_sci5)
# print(f"Fraction of ASINs in sci5 that are in the metadata: {fraction_in_meta:.2%}")

In [5]:
seqdata.head() # could also use summary and overall rating in the future

Unnamed: 0,reviewerID,unixReviewTime,asin,overall,summary,title,brand,main_cat,price
0,A0096681Y127OL1H8W3U,1431993600,B0098MLBAO,5,Five Stars,"Herbal Choice Mari Natural Toothgel, Cinnamon ...",Nature's Brands,Health & Personal Care,$14.47
1,A0096681Y127OL1H8W3U,1435104000,B00E8JOCOE,5,Five Stars,Pac-Kit by First Aid Only 25-450 Cotton Tipped...,First Aid Only,Industrial & Scientific,Unknown
2,A0096681Y127OL1H8W3U,1435104000,B00S730YWG,5,Five Stars,"Litmus pH Test Strips, Universal Application (...",LabRat Supplies,Industrial & Scientific,Unknown
3,A0096681Y127OL1H8W3U,1470614400,B0098MLBAO,5,Pearly whites,"Herbal Choice Mari Natural Toothgel, Cinnamon ...",Nature's Brands,Health & Personal Care,$14.47
4,A0196552RI15HI7JB9PW,1435449600,B000FN15YM,5,As described,"18-8 Stainless Steel Flat Washer, Plain Finish...",Small Parts,Industrial & Scientific,$2.00


In [6]:
from collections import defaultdict
from datasets import Dataset, DatasetDict
import pandas as pd
from src.utils.project_dirs import get_hfdata_dir
import random

hfdatasetpath = get_hfdata_dir()

reviewer_asins = defaultdict(list)
for _, row in seqdata.iterrows():
	reviewer_asins[row['reviewerID']].append(row['asin'])

scimeta.columns = ['asin', 'Title', 'Brand', 'Category', 'Price'] # only the corpus asins and their metadata
asins_compact = scimeta[['asin']].copy()
asins_compact['nlang'] = (
	"Title: " + scimeta['Title'] + ". " +
	"Brand: " + scimeta['Brand'] + ". " +
	"Category: " + scimeta['Category'] + ". " +
	"Price: " + scimeta['Price']
)     
asin_dict = asins_compact.set_index('asin')['nlang'].to_dict()  # ASIN -> formatted text of the item
data_dict = {}

amzn_prompt = """Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). 
Each item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.
Based on this history, predict **only one** item the customer is most likely to purchase next in the same format.

### Purchase history:
{}

### Next item:
{}"""

Nval = 512  # Number of reviewers for validation
random.seed(42) 
# for these reviewers, the train till the (n-3)th item, val will be the n-2th item, test is the last item; 
# for the rest of the users train will use till the n-2th item, no val, test will be the last item
val_rewid = set(random.sample(list(reviewer_asins.keys()), Nval))

# Convert data_dict into a list of records for each split
train_records, val_records, test_records = [], [], []
for reviewer_id, asins_list in reviewer_asins.items():
    n = len(asins_list)
    if n < 3:
        raise ValueError(f"Reviewer {reviewer_id} has less than 3 reviews")
    formatted_asins_list = [asin_dict.get(asin, f"Unknown ASIN: {asin}") for asin in asins_list]

    if reviewer_id in val_rewid: # we form a validation set only with these reviewers
        train_text = amzn_prompt.format("\n".join(formatted_asins_list[:n-3]), formatted_asins_list[n-3])

        val_ptext = amzn_prompt.format("\n".join(formatted_asins_list[:n-2]), "") # val prompt text 
        val_text = amzn_prompt.format("\n".join(formatted_asins_list[:n-2]), formatted_asins_list[n-2]) # val text
        val_seen_asins = asins_list[:n-2]
        val_asin = asins_list[n-2] # this is the eval target used for recall, ndcg calculations
        val_asin_text = formatted_asins_list[n-2]
        val_records.append({
            "reviewer_id": reviewer_id,
            "ptext": val_ptext,
            "text": val_text,
            "seen_asins": val_seen_asins,
            "asin": val_asin,
            "asin_text": val_asin_text
        })
    else:
        train_text = amzn_prompt.format("\n".join(formatted_asins_list[:n-2]), formatted_asins_list[n-2])
    
    test_ptext = amzn_prompt.format("\n".join(formatted_asins_list[:n-1]), "") # test prompt text
    test_text = amzn_prompt.format("\n".join(formatted_asins_list[:n-1]), formatted_asins_list[n-1]) # test text
    test_seen_asins = asins_list[:n-1]
    test_asin = asins_list[n-1] # test target used for recall, ndcg calculations
    test_asin_text = formatted_asins_list[n-1]
    
    train_records.append({
        "reviewer_id": reviewer_id,
        "text": train_text
    })
    test_records.append({
        "reviewer_id": reviewer_id,
        "ptext": test_ptext,
        "text": test_text,
        "seen_asins": test_seen_asins,
        "asin": test_asin,
        "asin_text": test_asin_text
    })
    
# Convert lists to Hugging Face datasets
train_dataset = Dataset.from_pandas(pd.DataFrame(train_records))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_records))
test_dataset = Dataset.from_pandas(pd.DataFrame(test_records))

# Create a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

# Save dataset locally
dataset_dict.save_to_disk(str(hfdatasetpath / "Amzn_scientific_2018"))


Saving the dataset (0/1 shards):   0%|          | 0/10970 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10970 [00:00<?, ? examples/s]

In [9]:
train_dataset.shape, val_dataset.shape,test_dataset.shape,

((10970, 2), (512, 5), (10970, 5))

In [28]:
print(val_dataset[0]['ptext'])
print("BREAK")
print(val_dataset[0]['text'])

Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). 
Each item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.
Based on this history, predict **only one** item the customer is most likely to purchase next in the same format.

### Purchase history:
Title: FiiO D3 (D03K) Digital to Analog Audio Converter With Micca 6ft Optical Toslink Cable - 192kHz/24bit Optical and Coaxial DAC. Brand: FiiO. Category: All Electronics. Price: $22.95
Title: Eagle U2-51-S Red Galvanized Steel Type II Gas Safety Can with 7/8&quot; Flex Spout, 5 gallon Capacity, 13.5&quot; Height, 12.5&quot; Diameter. Brand: Eagle. Category: Amazon Home. Price: $81.49
Title: Eagle F-15 HDPE 10&quot; Poly Funnel For Metal Type I Safety Cans, 4&quot; Height, 8&quot; Width, 9&quot; Length. Brand: Eagle. Category: Industrial & Scientific. Price: $4.59
Title: 3M Littmann Classic III Monitoring Stethosco

In [29]:
print(test_dataset[0]['ptext'])
print("BREAK")
print(test_dataset[0]['text'])

Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). 
Each item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.
Based on this history, predict **only one** item the customer is most likely to purchase next in the same format.

### Purchase history:
Title: Herbal Choice Mari Natural Toothgel, Cinnamon &amp; Baking Soda; 3.4floz Glass. Brand: Nature's Brands. Category: Health & Personal Care. Price: $14.47
Title: Pac-Kit by First Aid Only 25-450 Cotton Tipped Applicator with 6&quot; Wooden Shaft (Bag of 100). Brand: First Aid Only. Category: Industrial & Scientific. Price: Unknown
Title: Litmus pH Test Strips, Universal Application (pH 1-14), 2 Packs of 100 Strips. Brand: LabRat Supplies. Category: Industrial & Scientific. Price: Unknown

### Next item:

BREAK
Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). 


In [26]:
s = val_dataset[0]['ptext'][-10:]
s[-1] == '\n'

True

Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). 
Each item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.
Based on this history, predict **only one** item the customer is most likely to purchase next in the same format.

### Purchase history:
Title: FiiO D3 (D03K) Digital to Analog Audio Converter With Micca 6ft Optical Toslink Cable - 192kHz/24bit Optical and Coaxial DAC. Brand: FiiO. Category: All Electronics. Price: $22.95
Title: Eagle U2-51-S Red Galvanized Steel Type II Gas Safety Can with 7/8&quot; Flex Spout, 5 gallon Capacity, 13.5&quot; Height, 12.5&quot; Diameter. Brand: Eagle. Category: Amazon Home. Price: $81.49
Title: Eagle F-15 HDPE 10&quot; Poly Funnel For Metal Type I Safety Cans, 4&quot; Height, 8&quot; Width, 9&quot; Length. Brand: Eagle. Category: Industrial & Scientific. Price: $4.59
Title: 3M Littmann Classic III Monitoring Stethosco

In [8]:
print(test_dataset['test_text'][1] + test_dataset['test_asin_text'][1])

Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). 
Each item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.
Based on this history, predict **only one** item the customer is most likely to purchase next in the same format.

### Purchase history:
Title: 18-8 Stainless Steel Flat Washer, Plain Finish, Meets DIN 125, M3 Hole Size, 3.2mm ID, 7mm OD, 0.5mm Nominal Thickness (Pack of 100). Brand: Small Parts. Category: Industrial & Scientific. Price: $2.00
Title: Steel Lock Nut, Plain Finish, Gray (Pack of 100). Brand: Small Parts. Category: Industrial & Scientific. Price: $7.63
Title: Steel Socket Cap Screw, Black Oxide, Plain Finish, Internal Hex Drive, Meets DIN 912, 12mm Length, Fully Threaded, M3-0.5 Metric Coarse Threads (Pack of 100). Brand: Small Parts. Category: Industrial & Scientific. Price: $11.95
Title: SEOH 5 Pack Glass Borosilicate Graduated Beaker

In [20]:
import torch
import gc

# Delete model and tokenizer
# del model
# del tokenizer

# Clear CUDA memory
torch.cuda.empty_cache()

# Force garbage collection
gc.collect()

493

In [13]:
from unsloth import FastLanguageModel
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
print(tokenizer.padding_side, tokenizer.truncation_side, \
      tokenizer.pad_token, tokenizer.eos_token, tokenizer.bos_token, \
      tokenizer.max_len_single_sentence
    )

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.679 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
left right <|finetune_right_pad_id|> <|end_of_text|> <|begin_of_text|> 131071


In [22]:
type(val_dataset[:10]), type(val_dataset)

(dict, datasets.arrow_dataset.Dataset)

In [2]:
products = asins_compact['nlang'].to_list()

print(products)

product_toklens = [len(tokenizer.encode(text)) for text in products]

NameError: name 'asins_compact' is not defined

In [1]:
import matplotlib.pyplot as plt
import numpy as np


# Plot a histogram of the token lengths in the list product_toklens, also add vertical lines for mean and sd
plt.hist(product_toklens, bins=30, edgecolor='black')
plt.axvline(np.mean(product_toklens), color='red', linestyle='dashed', linewidth=1, label='Mean')
plt.axvline(np.mean(product_toklens) + np.std(product_toklens), color='blue', linestyle='dashed', linewidth=1, label='Mean + SD')
plt.axvline(np.mean(product_toklens) - np.std(product_toklens), color='blue', linestyle='dashed', linewidth=1, label='Mean - SD')
plt.xlabel('Token Length')
plt.ylabel('Frequency')
plt.title('Histogram of Token Lengths')
plt.legend()
plt.show()

NameError: name 'product_toklens' is not defined

In [16]:
# find the number of tokens after tokenizer in each of the test_text 
token_lengths = [len(tokenizer.encode(text)) for text in test_dataset['test_text']]


In [17]:
max(token_lengths)

4620

In [19]:
train_dataset['reviewer_id'][1], test_dataset['reviewer_id'][1], val_dataset['reviewer_id'][1]

('A0196552RI15HI7JB9PW', 'A0196552RI15HI7JB9PW', 'A100WO06OQR8BQ')

In [None]:
from datasets import Dataset, DatasetDict, load_dataset
test_records = [
    {"Name": "Alice", "Age": 25, "City": "New York"},
    {"Name": "Bob", "Age": 30, "City": "San Francisco"},
    {"Name": "Charlie", "Age": 28, "City": "Los Angeles"}
]

# Create DataFrame
df = pd.DataFrame(test_records)
d_t = Dataset.from_pandas(df)

d_t


Dataset({
    features: ['Name', 'Age', 'City'],
    num_rows: 3
})



Split strings:   0%|          | 0/4 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/4 [00:00<?, ?it/s]

Finding newlines for mmindex:   0%|          | 0.00/252 [00:00<?, ?B/s]

In [2]:
queries = ["What is a cat that doesn't purr?", "a dog is the human's best friend and flies"]

query_tokens_ids = bm25s.tokenize(queries, stemmer=stemmer)

print(query_tokens_ids)
results, scores = retriever.retrieve(query_tokens_ids, k=1) # both results and scores have shape #queries x k, indices have the topk indices of the docs in corpus
print(results, scores)
print(type(results), type(scores))


Split strings:   0%|          | 0/2 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenized(ids=[[3, 6, 2, 7], [4, 8, 5, 0, 1]], vocab={'friend': 0, 'fli': 1, 'doesn': 2, 'what': 3, 'dog': 4, 'best': 5, 'cat': 6, 'purr': 7, 'human': 8})


BM25S Retrieve:   0%|          | 0/2 [00:00<?, ?it/s]

[[{'id': 0, 'text': 'a cat is a feline and likes to purr'}]
 [{'id': 1, 'text': "a dog is the human's best friend and loves to play"}]] [[1.0584376]
 [1.7672995]]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [6]:
from collections import defaultdict
from datasets import load_from_disk
from src.utils.project_dirs import get_hfdata_dir

ds = load_from_disk(str(get_hfdata_dir() / "Amzn_scientific_2018"))


In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['reviewer_id', 'text'],
        num_rows: 10970
    })
    validation: Dataset({
        features: ['reviewer_id', 'text', 'asin', 'asin_text'],
        num_rows: 512
    })
    test: Dataset({
        features: ['reviewer_id', 'text', 'asin', 'asin_text'],
        num_rows: 10970
    })
})

In [8]:
ds['train'][0]

{'reviewer_id': 'A0096681Y127OL1H8W3U',
 'text': "Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). \nEach item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.\nBased on this history, predict **only one** item the customer is most likely to purchase next in the same format.\n\n### Purchase history:\nTitle: Herbal Choice Mari Natural Toothgel, Cinnamon &amp; Baking Soda; 3.4floz Glass. Brand: Nature's Brands. Category: Health & Personal Care. Price: $14.47\nTitle: Pac-Kit by First Aid Only 25-450 Cotton Tipped Applicator with 6&quot; Wooden Shaft (Bag of 100). Brand: First Aid Only. Category: Industrial & Scientific. Price: Unknown\n\n### Next item:\nTitle: Litmus pH Test Strips, Universal Application (pH 1-14), 2 Packs of 100 Strips. Brand: LabRat Supplies. Category: Industrial & Scientific. Price: Unknown"}

In [10]:
print(ds['train'][0]['text'])

Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). 
Each item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.
Based on this history, predict **only one** item the customer is most likely to purchase next in the same format.

### Purchase history:
Title: Herbal Choice Mari Natural Toothgel, Cinnamon &amp; Baking Soda; 3.4floz Glass. Brand: Nature's Brands. Category: Health & Personal Care. Price: $14.47
Title: Pac-Kit by First Aid Only 25-450 Cotton Tipped Applicator with 6&quot; Wooden Shaft (Bag of 100). Brand: First Aid Only. Category: Industrial & Scientific. Price: Unknown

### Next item:
Title: Litmus pH Test Strips, Universal Application (pH 1-14), 2 Packs of 100 Strips. Brand: LabRat Supplies. Category: Industrial & Scientific. Price: Unknown


In [9]:
ds['test'][0]

{'reviewer_id': 'A0096681Y127OL1H8W3U',
 'text': "Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). \nEach item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.\nBased on this history, predict **only one** item the customer is most likely to purchase next in the same format.\n\n### Purchase history:\nTitle: Herbal Choice Mari Natural Toothgel, Cinnamon &amp; Baking Soda; 3.4floz Glass. Brand: Nature's Brands. Category: Health & Personal Care. Price: $14.47\nTitle: Pac-Kit by First Aid Only 25-450 Cotton Tipped Applicator with 6&quot; Wooden Shaft (Bag of 100). Brand: First Aid Only. Category: Industrial & Scientific. Price: Unknown\nTitle: Litmus pH Test Strips, Universal Application (pH 1-14), 2 Packs of 100 Strips. Brand: LabRat Supplies. Category: Industrial & Scientific. Price: Unknown\n\n### Next item:\n",
 'asin': 'B0098MLBAO',
 'asin_text': "Title: He

In [11]:
print(ds['test'][0]['text'])

Below is a customer's purchase history on Amazon, listed in chronological order (earliest to latest). 
Each item is represented by the following format: Title: <item title> Category: <item category> Brand: <item brand> Price: <item price>.
Based on this history, predict **only one** item the customer is most likely to purchase next in the same format.

### Purchase history:
Title: Herbal Choice Mari Natural Toothgel, Cinnamon &amp; Baking Soda; 3.4floz Glass. Brand: Nature's Brands. Category: Health & Personal Care. Price: $14.47
Title: Pac-Kit by First Aid Only 25-450 Cotton Tipped Applicator with 6&quot; Wooden Shaft (Bag of 100). Brand: First Aid Only. Category: Industrial & Scientific. Price: Unknown
Title: Litmus pH Test Strips, Universal Application (pH 1-14), 2 Packs of 100 Strips. Brand: LabRat Supplies. Category: Industrial & Scientific. Price: Unknown

### Next item:

