In [2]:
from openai import AzureOpenAI
from datasets import load_dataset
from data_handling import concat_item_metadata, load_data
import os
import numpy as np
import multiprocessing
N_CPUS = multiprocessing.cpu_count()


reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Books", trust_remote_code=True)
reviews = reviews['full']

item_meta = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Books", trust_remote_code=True)
item_meta = item_meta['full']
item_meta = item_meta.map(concat_item_metadata, batched=True, num_proc=N_CPUS)

# https://github.com/hyp1231/AmazonReviews2023/blob/main/blair/sample_pretraining_data.py#L125


Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

Map (num_proc=8):   0%|          | 0/4448181 [00:00<?, ? examples/s]

TypeError: string indices must be integers, not 'str'

In [28]:
reviews = reviews.sort('helpful_vote', reverse=True).take(5000)
# list(reviews.take(100).select_columns(['parent_asin', 'title', 'helpful_vote']))

In [3]:
list(reviews.shuffle(seed=42).take(3).select_columns(['parent_asin', 'title', 'helpful_vote', 'text']))

[{'parent_asin': '1538714620',
  'title': 'A Different Approach to Healthy Diet',
  'helpful_vote': 623,
  'text': 'There are a large number of diet books around, espousing a variety of “healthy” lifestyles, and often these are unsustainable over the long term either for financial or health reasons. Many look at such factors as calories, fat, carbohydrates and more without really considering the foods themselves beyond that. This book isn’t a diet book, per se; instead, it looks at the individual foods and explains how the body uses them to heal and regenerate itself.<br /><br />To cut to the chase on one important point: the focus is very much on natural, unprocessed foods like fruits, vegetables, nuts and fish. There is a definite lack of coverage for land-based meats (beef, pork, lamb) though there is discussion of various types of seafood such as tuna, anchovies, clams and more, and chicken/turkey are also covered.<br /><br />This is not a formal diet book; rather, it is more of a 

In [28]:
all_meta = {}
candidate_asin = reviews.select_columns(["parent_asin"]).to_dict()["parent_asin"]
# efficiently filter item_meta to only include rows where parent_asin is in candidate_asin
item_meta_filtered = item_meta.filter(
    (lambda x: np.isin(x["parent_asin"], candidate_asin)),
    batched=True,
    num_proc=N_CPUS,
)
for row in item_meta_filtered:
    all_meta[row['parent_asin']] = row


In [1]:
from data_handling import load_data, clean_meta, clean_review
reviews, all_meta = load_data()

Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

In [12]:
def concat_item_metadata(dp):
    meta = ''
    flag = False
    if dp['title'] is not None:
        meta += dp['title']
        flag = True
    if len(dp['features']) > 0:
        if flag:
            meta += ' '
        meta += ' '.join(dp['features'])
        flag = True
    if len(dp['description']) > 0:
        if flag:
            meta += ' '
        meta += ' '.join(dp['description'])
    dp['cleaned_metadata'] = meta \
        .replace('\t', ' ') \
        .replace('\n', ' ') \
        .replace('\r', '') \
        .strip()
    return dp

concat_item_metadata(all_meta[reviews[0]["parent_asin"]])['cleaned_metadata']



'What Happened A TIME BEST BOOK OF THE YEAR AND NEW YORK TIMES NOTABLE BOOK “In the past, for reasons I try to explain, I’ve often felt I had to be careful in public, like I was up on a wire without a net. Now I’m letting my guard down.” —Hillary Rodham Clinton, from the introduction of What Happened For the first time, Hillary Rodham Clinton reveals what she was thinking and feeling during one of the most controversial and unpredictable presidential elections in history. Now free from the constraints of running, Hillary takes you inside the intense personal experience of becoming the first woman nominated for president by a major party in an election marked by rage, sexism, exhilarating highs and infuriating lows, stranger-than-fiction twists, Russian interference, and an opponent who broke all the rules. This is her most personal memoir yet. In these pages, she describes what it was like to run against Donald Trump, the mistakes she made, how she has coped with a shocking and devasta

In [None]:
reviews.add_column("item_description", all)

In [14]:
# add meta to reviews for 3 sampled reviews
selected_reviews = reviews.shuffle(seed=42).take(3)
selected_meta = [concat_item_metadata(all_meta[_])['cleaned_metadata'] for _ in selected_reviews.select_columns(["parent_asin"]).to_dict()["parent_asin"]]
selected_reviews = selected_reviews.add_column("item_description", selected_meta)



In [15]:
selected_reviews

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase', 'item_description'],
    num_rows: 3
})

In [9]:
str(selected_reviews[0])

'{\'rating\': 5.0, \'title\': \'A Different Approach to Healthy Diet\', \'text\': \'There are a large number of diet books around, espousing a variety of “healthy” lifestyles, and often these are unsustainable over the long term either for financial or health reasons. Many look at such factors as calories, fat, carbohydrates and more without really considering the foods themselves beyond that. This book isn’t a diet book, per se; instead, it looks at the individual foods and explains how the body uses them to heal and regenerate itself.<br /><br />To cut to the chase on one important point: the focus is very much on natural, unprocessed foods like fruits, vegetables, nuts and fish. There is a definite lack of coverage for land-based meats (beef, pork, lamb) though there is discussion of various types of seafood such as tuna, anchovies, clams and more, and chicken/turkey are also covered.<br /><br />This is not a formal diet book; rather, it is more of a field guide to foods that help y

In [50]:
# few-shot prompt openai to review based on
def example_prompts(reviews):
    """
    Generate a list of few-shot prompts based on the reviews. 
    Assumes that the reviews have already been cleaned and have an 'item_description' column.
    """
    prompts = []
    for review in reviews:
        prompts.extend(
            [
                {"role": "user", "content": f"Item Description: {review['item_description']}"},
                {"role": "assistant", "content": f"Title: {review['title']}\nReview: {review['text']}"},
            ]
        )
    return prompts


def get_client():
    client = AzureOpenAI(
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_version="2024-08-01-preview",
        api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    )
    return client


client = get_client()

system_prompt = {
    "role": "system",
    "content": "You are a helpful assistant that can review books based on the item metadata.",
}
user_prompt = {
    "role": "user",
    "content": item_meta.shuffle().take(1)['cleaned_metadata'][0]
}
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        system_prompt,
        *example_prompts(selected_reviews),
        user_prompt,
    ],
)


In [51]:
user_prompt['content']

'Production for Print (Portfolio Skills) This book gives graphic designers the confidence to do everything necessary to ensure trouble-free, high-qualityprintingto calibrate images; adjust trapping levels; and mix colors that wont print as something that is a complete surprise. It explains scanning and resolution, and discusses good and bad image formats, describing techniques to make images look good in print. There is advice on how to get accurate quotes from a printer, a checklist to use when sending a job to print, and a glossary of print production terms. About the Author Mark Gatter has worked in the commercial printing industry and as a freelance graphic designer for the last twenty years. He teaches software courses in Adobe Photoshop, Adobe Illustrator, Adobe InDesign, and Quark XPress as well as custom courses covering all aspects of digital repro and prepress.'

In [53]:
print(response.choices[0].message.content)


Title: Essential Guide for Graphic Designers
Review: "Production for Print" serves as an invaluable resource for graphic designers, especially those new to the print industry or seeking to enhance their understanding of print production processes. Mark Gatter's expertise shines throughout the book, as he covers a wide range of topics that are crucial for ensuring high-quality printed materials.<br /><br />One of the book's strengths is its straightforward approach to complex subjects. Gatter simplifies concepts such as image calibration, trapping levels, and color mixing, making them more accessible for readers who may feel overwhelmed by technical jargon. The discussions on scanning and resolution are particularly useful, as they help demystify the common challenges designers face when preparing files for print.<br /><br />Gatter also provides practical insights on working effectively with printers, including tips on how to obtain accurate quotes and what information to provide to avo

['Outcast A breathtaking tale of love in the West--by the author of Autumnfire. Olivia Baron scandalizes the prudish matrons and rough cowboys of Elkhorn by practicing medicine. When a diptheria epidemic breaks out, she can finally prove her mettle to the townspeople--and to the handsome widower Gabe Danaker.']