In [20]:
import os
import sys
from dotenv import load_dotenv
# Load variables from a .env file into the process environment
# (presumably including the OPENAI_API_KEY read later in this notebook — confirm).
load_dotenv()

True

## Data

In [9]:
import pandas as pd

In [28]:
# Load the dummy review dataset; the first CSV column is used as the row index.
df_reviews = pd.read_csv(
    "../raw_data/dummy_data.csv",
    index_col=0,
)

In [30]:
# Inspect the frame loaded above. The original referenced `df`, which is never
# defined in this notebook and would raise NameError on a fresh kernel.
print(df_reviews.columns, df_reviews.shape)
df_reviews.head(3)

Index(['Product Name', 'Product Description', 'Review Text', 'Rating'], dtype='object') (109, 4)


Unnamed: 0,Product Name,Product Description,Review Text,Rating
0,iPhone 15,The Apple iPhone 15 redefines smartphone innov...,The iPhone 15 is a masterpiece! The sleek desi...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
1,MacBook Pro 2023,Experience the ultimate in computing power wit...,The MacBook Pro 2023 is a game-changer! The pe...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
2,Kindle Paperwhite,"Enjoy reading your favorite books anytime, any...",The Kindle Paperwhite is a must-have for book ...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."


In [36]:
# Draw one random product name to use as input for the criteria generation below.
# No random_state is set, so each run can return a different product.
df_reviews["Product Name"].sample(1).iloc[0]

'Maybelline Instant Age Rewind Eraser Dark Circles Treatment Concealer'

## Criteria generation

### A) LangChain - product name for context

In [31]:
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import Language
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.schema.document import Document

In [38]:
def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split a string into overlapping chunks wrapped as LangChain Documents.

    Args:
        text: The string to split.
        chunk_size: Maximum chunk size forwarded to
            RecursiveCharacterTextSplitter. Defaults to 500, the value that
            was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks forwarded to the
            splitter. Defaults to 100, the value that was previously
            hard-coded.

    Returns:
        A list of Document objects, one per chunk, in the order the splitter
        produced them.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]

def embed_texts(texts, openai_api_key):
    """Embed documents with OpenAI and index them in a Chroma vector store.

    Progress is reported on stdout: a status message before the work starts
    and a check mark on the same line once the store has been built.

    Args:
        texts: Documents to embed; passed to Chroma.from_documents.
        openai_api_key: API key handed to OpenAIEmbeddings.

    Returns:
        The Chroma vector store built from ``texts``.
    """
    print(f"Embedding {len(texts)} texts...", end=' ')

    # The embedding model is created inline and handed straight to Chroma,
    # which uses it to vectorise every document while building the store.
    vector_store = Chroma.from_documents(
        texts,
        OpenAIEmbeddings(openai_api_key=openai_api_key),
    )

    print("✅")
    return vector_store

def run_qa(doc_search, prompt, openai_api_key, model_name="gpt-3.5-turbo", k=1):
    """Answer a prompt with a RetrievalQA chain backed by a vector store.

    Progress is reported on stdout: a status message before the call and a
    check mark on the same line once the chain has returned.

    Args:
        doc_search: Vector store exposing ``as_retriever`` (e.g. the Chroma
            store returned by ``embed_texts``).
        prompt: The question/instruction passed to the chain.
        openai_api_key: API key handed to ChatOpenAI.
        model_name: Chat model to use. Defaults to "gpt-3.5-turbo", the value
            that was previously hard-coded.
        k: Maximum number of documents the retriever returns. Defaults to 1,
            the value that was previously hard-coded.

    Returns:
        The ``"result"`` entry of the chain's output.
    """
    # Plain string: the original f-string had no placeholders.
    print("Running QA...", end=' ')

    llm = ChatOpenAI(model_name=model_name, openai_api_key=openai_api_key)
    retriever = doc_search.as_retriever(search_kwargs={"k": k})

    # chain_type="stuff": all retrieved documents are placed into a single
    # prompt, which is fine while the retrieved context stays small.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )

    answer = qa.invoke(prompt)
    print("✅")
    return answer["result"]

In [41]:
# The API key is read from the environment; it is None when the variable is unset.
OPEN_API_KEY = os.getenv('OPENAI_API_KEY')
# Product title used as the text to chunk and embed in the cells below.
PRODUCT_INPUT = 'Maybelline Instant Age Rewind Eraser Dark Circles Treatment Concealer'


# Wrap the product title into Document chunks and display them.
chunks = get_text_chunks(text=PRODUCT_INPUT)
chunks

[Document(page_content='Maybelline Instant Age Rewind Eraser Dark Circles Treatment Concealer')]

In [44]:
# Embed the chunks and build the vector store queried by the QA chain.
doc_search = embed_texts(texts=chunks, openai_api_key=OPEN_API_KEY)
doc_search

Embedding 1 texts... ✅


<langchain_community.vectorstores.chroma.Chroma at 0x11e6c86d0>

In [50]:
# Ask the model for rating criteria; the retriever supplies the product title as context.
prompt = """
Given this product title, please select between 3 and 6 criteria to rate in order to compose a product review.
"""
answer = run_qa(
    doc_search=doc_search,
    prompt=prompt,
    openai_api_key=OPEN_API_KEY,
)

# Show the product next to the criteria the model proposed.
print(f"Product: {PRODUCT_INPUT}\n")
print(f"Some rating criteria:\n{answer}")

Product: Maybelline Instant Age Rewind Eraser Dark Circles Treatment Concealer

Some rating criteria:
1. Coverage
2. Longevity
3. Blendability
4. Shade Range
5. Packaging
6. Hydration properties


### B) LangChain - all products and reviews

In [None]:
# NOTE(review): disabled draft of a file loader for this section; the whole
# cell is commented out, so nothing below is executed.
# def load_file(file_path):
#     print(f"Loading {file_path}...", end=' ')
#     try:
#         loader = TextLoader(file_path)
#         documents = loader.load()
#     except FileNotFoundError:
#         print(f"File not found: {file_path}")
#         return

#     # A) Recursive splitter
#     splitter = (RecursiveCharacterTextSplitter
#                 .from_language(language=Language.PYTHON, chunk_size=2000, chunk_overlap=200))
#     # B) Text splitter
#     # splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

#     texts = splitter.split_documents(documents)
#     print("✅")
#     return texts