In [48]:
import os
import json

import numpy as np
import pandas as pd


from openai import OpenAI


from tqdm.auto import tqdm

# Path of the CSV file used to cache the sampled review/book rows between runs.
SAMPLE_DATASET_CSV_FILE = "sample_dataset.csv"

In [49]:
def processing_dataset(
    df_book_data: pd.DataFrame,
    df_book_ratings: pd.DataFrame) -> pd.DataFrame:
    """Join book reviews with book metadata on the book title.

    Args:
        df_book_data: Book metadata; must contain the columns ``Title``,
            ``description``, ``authors``, ``publisher`` and ``categories``.
        df_book_ratings: Reviews; must contain the columns ``Title``,
            ``review/score``, ``review/summary`` and ``review/text``.

    Returns:
        The inner join of both frames on ``title``: review columns first
        (renamed to snake_case), then the metadata columns, with every
        missing value replaced by an empty string.
    """
    rating_renames = {
        "Title": "title",
        "review/score": "review_score",
        "review/summary": "review_summary",
        "review/text": "review_text",
    }
    book_columns = ["Title", "description", "authors", "publisher", "categories"]

    # Keep only the review columns of interest and normalise their names.
    ratings = df_book_ratings[list(rating_renames)].rename(columns=rating_renames)
    # Same for the metadata; only the join key needs renaming.
    books = df_book_data[book_columns].rename(columns={"Title": "title"})

    return ratings.merge(books, on="title").fillna("")


In [50]:
import hashlib

def generate_document_id(doc):
    """Return a content-based identifier for ``doc`` as an MD5 hex digest.

    The document is serialised to JSON with sorted keys before hashing, so
    the order of the keys does not influence the id. MD5 is used purely as
    a fingerprint here (``usedforsecurity=False``).
    """
    canonical_json = json.dumps(doc, sort_keys=True)
    digest = hashlib.md5(canonical_json.encode(), usedforsecurity=False)
    return digest.hexdigest()


In [51]:
# Prompt sent to the chat model for every document. The ``{...}`` placeholders
# are filled from the matching keys of the document dict via ``str.format``.
# NOTE(review): the wording contains typos ("assitant", "as fewer words"); it is
# left untouched on purpose because this is the exact text the model receives.
prompt_template = """
You emulate a review book assitant.
Formulate 5 questions this review book assitant might ask based on a FAQ record. The record
should contain the title of book, the book description, the author and short review, 
and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

title: {title}
authors: {authors}
categories: {categories}
summary: {review_summary}
description: {description}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()



def generate_questions(client, doc, model='gpt-4o'):
    """Ask the chat model to write questions about one book document.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``
            (the notebook passes an ``OpenAI`` client).
        doc: Mapping that provides at least the keys ``title``, ``authors``,
            ``categories``, ``review_summary`` and ``description``; extra
            keys are ignored by the template.
        model: Name of the chat model to query. Defaults to ``'gpt-4o'``,
            the value that used to be hard-coded.

    Returns:
        The raw message content of the first choice. The prompt asks for a
        JSON list of questions, but the text is returned unparsed and
        unvalidated.

    Raises:
        KeyError: If ``doc`` lacks one of the keys used by the template.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [52]:
# Size and seed of the random sample. A fixed seed makes the sample (and
# therefore every document id derived from it) reproducible when the cached
# CSV has to be rebuilt.
SAMPLE_SIZE = 200
SAMPLE_SEED = 42

# Build the sample only when no cached copy exists yet.
if not os.path.isfile(SAMPLE_DATASET_CSV_FILE):

    csv_file_1 = './ratings.csv'
    csv_file_2 = './data.csv'

    df1 = pd.read_csv(csv_file_1)
    df2 = pd.read_csv(csv_file_2)

    # processing_dataset expects (book metadata, ratings) in that order.
    df = processing_dataset(df2, df1)

    # Cap the sample size so a merged frame with fewer rows than SAMPLE_SIZE
    # does not make DataFrame.sample raise ValueError.
    sample_df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
    sample_df.to_csv(SAMPLE_DATASET_CSV_FILE,index=False)

In [53]:
# read_csv parses empty fields as NaN, which would undo the fillna('') applied
# in processing_dataset: "nan" would be formatted into the prompts and NaN would
# be hashed into the document ids. Blank the missing values again after loading.
# NOTE: ids of rows that have empty fields change with this fix, so caches keyed
# by document_id (e.g. results.json.gz) have to be regenerated.
sample_df = pd.read_csv(SAMPLE_DATASET_CSV_FILE).fillna('')

In [54]:
# Preview the first rows of the sampled dataset.
sample_df.head()

Unnamed: 0,title,review_score,review_summary,review_text,description,authors,publisher,categories
0,The Case for Easter: Journalist Investigates t...,4.0,Good book for a quick read,This is a good book that's a quick read. The c...,Did Jesus of Nazareth really rise from the dea...,['Lee Strobel'],Zondervan,['Religion']
1,The Case for Faith: A Journalist Investigates ...,5.0,A Compelling Case,"In his original work ""The Case for Christ"", Le...",Investigates issues and doubts that can jeopar...,"['Lee Strobel', 'Jane Vogel']",,['Juvenile Nonfiction']
2,Espresso! Starting and Running Your Own Coffee...,3.0,"A good starter book, but lacks details",I agree with most of the other reviewers. If I...,Learn to start and run your own coffee bar wit...,"['Linda Formichelli', 'Melissa Villanueva']",Penguin,['Business & Economics']
3,Why Love Matters,5.0,Parenting Coach Welcomes Validation for Affect...,I'm recommending this book to all my clients. ...,Why Love Matters explains why loving relations...,['Sue Gerhardt'],Routledge,['Family & Relationships']
4,Aristophanes' Acharnians (Focus Classical Libr...,5.0,Two comedies by Aristophanes in Greek and English,The Loeb Classical Library features the origin...,English translation of Aristophanes' most popu...,"['Aristophanes', 'Jeffrey Henderson']",Focus,['Drama']


In [55]:
# Turn the sample into one dict per row and tag each dict with a content-based
# id. The id is computed before it is stored, so it only depends on the row data.
documents = sample_df.to_dict(orient='records')
for doc in documents:
    doc['document_id'] = generate_document_id(doc)

In [56]:
# Persist the id-tagged documents as a CSV file (without the index column).
DOCUMENTS_CSV_FILE = "df_documents.csv"

df_documents = pd.DataFrame.from_records(documents)
df_documents.to_csv(DOCUMENTS_CSV_FILE, index=False)

In [57]:
import json
import gzip

def save_jsongz(json_data, json_filepath):
    """Serialise ``json_data`` as JSON into a gzip file opened in text mode.

    Args:
        json_data: Any JSON-serialisable object.
        json_filepath: Destination path of the ``.json.gz`` file; an
            existing file is overwritten.
    """
    with gzip.open(json_filepath, mode="wt", encoding="utf-8") as stream:
        json.dump(json_data, stream)

def read_jsongz(filepath):
    """Load and return the JSON content of a gzip file opened in text mode.

    Args:
        filepath: Path of the ``.json.gz`` file to read (UTF-8 encoded).

    Returns:
        The decoded JSON value.
    """
    with gzip.open(filepath, mode="rt", encoding="utf-8") as stream:
        return json.load(stream)


def save_jsongz_bin(json_data, json_filepath):
    """Write ``json_data`` as UTF-8 encoded JSON bytes into a gzip file.

    Args:
        json_data: Any JSON-serialisable object.
        json_filepath: Destination path of the ``.json.gz`` file; an
            existing file is overwritten.
    """
    with gzip.open(json_filepath, mode="wb") as stream:
        payload = json.dumps(json_data).encode("utf-8")
        stream.write(payload)


def read_jsongz_bin(json_filepath):
    """Load JSON from a gzip file that stores UTF-8 encoded JSON bytes.

    Args:
        json_filepath: Path of the ``.json.gz`` file to read.

    Returns:
        The decoded JSON value.
    """
    with gzip.open(json_filepath, mode="rb") as stream:
        raw_bytes = stream.read()
    # Decode explicitly as UTF-8 before handing the text to the JSON parser.
    return json.loads(raw_bytes.decode("utf-8"))


In [58]:
from dotenv import load_dotenv

# Load variables from a .env file when one exists in the working directory,
# then build the OpenAI client from the OPENAI_API_KEY environment variable.
# The key is read from the environment only and never written into the notebook.
if os.path.isfile(".env"):
    load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [59]:
RESULTS_JSON_FILEPATH = "results.json.gz"

# Start from whatever an earlier (possibly interrupted) run already stored, so
# that no API call is repeated for a document that already has questions.
if os.path.isfile(RESULTS_JSON_FILEPATH):
    results = read_jsongz_bin(RESULTS_JSON_FILEPATH)
else:
    results = {}

# Only documents without cached questions are sent to the API. With a complete
# cache this list is empty and the cell makes no API call at all.
pending_docs = [doc for doc in documents if doc['document_id'] not in results]

if pending_docs or not os.path.isfile(RESULTS_JSON_FILEPATH):
    try:
        for doc in tqdm(pending_docs):
            doc_id = doc['document_id']
            # Rows with identical content share an id; ask only once.
            if doc_id in results:
                continue
            results[doc_id] = generate_questions(openai_client, doc)
    finally:
        # Persist even when a request fails midway: the answers collected so
        # far are kept and the next run resumes with the remaining documents.
        save_jsongz_bin(results, RESULTS_JSON_FILEPATH)


# Reload from disk so `results` always reflects the persisted cache.
results = read_jsongz_bin(RESULTS_JSON_FILEPATH)

100%|██████████| 200/200 [06:10<00:00,  1.85s/it]


In [68]:

# Build one ground-truth record per generated question, enriched with the
# title, authors and categories of the document the question was generated for.

# Map each document id to the fields copied into every record. When several
# rows share an id, the first one wins.
doc_fields_by_id = (
    df_documents
    .drop_duplicates(subset="document_id", keep="first")
    .set_index("document_id")[["title", "authors", "categories"]]
    .to_dict(orient="index")
)

ground_truth = []
unparsable_ids = []
for rst_key, rst_value in results.items():
    doc_fields = doc_fields_by_id.get(rst_key)
    if doc_fields is None:
        # Cached result for a document that is not part of the current sample.
        continue
    try:
        questions = json.loads(rst_value)
    except (json.JSONDecodeError, TypeError):
        # The model answer is free text (or None); one malformed answer must
        # not abort the whole cell. It is reported below instead.
        unparsable_ids.append(rst_key)
        continue
    for q in questions:
        item = {"document_id": rst_key, "question": q, **doc_fields}
        ground_truth.append(item)

if unparsable_ids:
    print(f"Skipped {len(unparsable_ids)} document(s) with unparsable questions: {unparsable_ids}")


In [69]:
# Persist the ground-truth records as a CSV file (without the index column).
GROUND_TRUTH_CSV_FILE = "df_ground_truth.csv"

df_ground_truth = pd.DataFrame.from_records(ground_truth)
df_ground_truth.to_csv(GROUND_TRUTH_CSV_FILE, index=False)

In [70]:
# Reload the ground-truth questions from df_ground_truth.csv.
df_ground_truth = pd.read_csv("df_ground_truth.csv")

In [71]:
# Preview the first rows of the ground-truth questions.
df_ground_truth.head()

Unnamed: 0,document_id,question,title,authors,categories
0,77612833d7fd891bbd5300974dd06ec6,What evidence does Lee Strobel present to supp...,The Case for Easter: Journalist Investigates t...,['Lee Strobel'],['Religion']
1,77612833d7fd891bbd5300974dd06ec6,How does the book address claims that Jesus ne...,The Case for Easter: Journalist Investigates t...,['Lee Strobel'],['Religion']
2,77612833d7fd891bbd5300974dd06ec6,What arguments are made concerning the empty t...,The Case for Easter: Journalist Investigates t...,['Lee Strobel'],['Religion']
3,77612833d7fd891bbd5300974dd06ec6,In what ways does the book explore the appeara...,The Case for Easter: Journalist Investigates t...,['Lee Strobel'],['Religion']
4,77612833d7fd891bbd5300974dd06ec6,How does the author use his background as a le...,The Case for Easter: Journalist Investigates t...,['Lee Strobel'],['Religion']
