In [None]:
# Had to pip install jupyter first
# !pip install python-dotenv

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from dotenv import load_dotenv
load_dotenv()

True

## Data

In [3]:
import pandas as pd

In [4]:
# Load the dummy reviews dataset, using the CSV's first column as the index
df_reviews = pd.read_csv("../raw_data/dummy_data.csv", index_col=0)

In [5]:
# Quick look at the data: dimensions, column labels, then the first three rows
print(df_reviews.shape, df_reviews.columns)
df_reviews.head(3)

(109, 4) Index(['Product Name', 'Product Description', 'Review Text', 'Rating'], dtype='object')


Unnamed: 0,Product Name,Product Description,Review Text,Rating
0,iPhone 15,The Apple iPhone 15 redefines smartphone innov...,The iPhone 15 is a masterpiece! The sleek desi...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
1,MacBook Pro 2023,Experience the ultimate in computing power wit...,The MacBook Pro 2023 is a game-changer! The pe...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
2,Kindle Paperwhite,"Enjoy reading your favorite books anytime, any...",The Kindle Paperwhite is a must-have for book ...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."


In [6]:
# Sample one random product name to use as input
df_reviews["Product Name"].sample(1).iloc[0]

'Google Nest Learning Thermostat'

## Criteria generation

### A) Langchain - OpenAI

Le Wagon's requirements file for using LangChain: https://wagon-public-datasets.s3.amazonaws.com/deep_learning_datasets/langchain_requirements.txt

In [9]:
!pip --quiet install langchain langchain-community langchain-openai chromadb

In [8]:
from langchain.chains import RetrievalQA
# from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import Language
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.schema.document import Document

In [37]:
def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split a raw string into LangChain documents ready for embedding.

    Args:
        text: The string to split.
        chunk_size: Forwarded to `RecursiveCharacterTextSplitter`; defaults to
            the previously hard-coded value of 500.
        chunk_overlap: Forwarded to `RecursiveCharacterTextSplitter`; defaults
            to the previously hard-coded value of 100.

    Returns:
        A list of `Document` objects, one per chunk produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Wrap each chunk in a Document so it can be handed to a vector store.
    return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]

def embed_texts_openai(texts, openai_api_key):
    """Embed documents with OpenAI and index them in a Chroma vector store.

    Args:
        texts: Documents to index (passed to `Chroma.from_documents`).
        openai_api_key: API key handed to `OpenAIEmbeddings`.

    Returns:
        The Chroma vector store built from `texts`.
    """
    print(f"Embedding {len(texts)} texts...", end=' ')

    # Build the embedder and populate the vector store in a single expression.
    vector_store = Chroma.from_documents(
        texts,
        OpenAIEmbeddings(openai_api_key=openai_api_key),
    )

    print("✅")
    return vector_store

def run_qa(doc_search, prompt, openai_api_key, model_name="gpt-3.5-turbo", k=1):
    """Answer a prompt with a retrieval QA chain over a vector store.

    Args:
        doc_search: Vector store exposing `as_retriever` (e.g. the Chroma store
            returned by `embed_texts_openai`).
        prompt: The question passed to the chain.
        openai_api_key: API key handed to `ChatOpenAI`.
        model_name: Chat model to use; defaults to the previously hard-coded
            "gpt-3.5-turbo".
        k: Maximum number of documents the retriever returns; defaults to the
            previously hard-coded 1.

    Returns:
        The value stored under the "result" key of the chain's answer.
    """
    print("Running QA...", end=' ')

    # chain_type="stuff": the retrieved texts are 'stuffed' into a single prompt,
    # which is fine while they are sufficiently small.
    qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model_name=model_name, openai_api_key=openai_api_key),
        chain_type="stuff",
        retriever=doc_search.as_retriever(search_kwargs={"k": k}),
    )

    answer = qa.invoke(prompt)
    print("✅")
    return answer["result"]

In [11]:
# The API key is read from the environment (None when the variable is unset)
OPEN_API_KEY = os.getenv('OPENAI_API_KEY')
PRODUCT_INPUT = 'Maybelline Instant Age Rewind Eraser Dark Circles Treatment Concealer'

# Wrap the product title into documents; the bare name on the last line displays them
chunks = get_text_chunks(PRODUCT_INPUT)
chunks

[Document(page_content='Maybelline Instant Age Rewind Eraser Dark Circles Treatment Concealer')]

In [12]:
# Embed the product chunks with OpenAI and index them in a Chroma vector store
doc_search = embed_texts_openai(chunks, OPEN_API_KEY)
doc_search

Embedding 1 texts... ✅


<langchain_community.vectorstores.chroma.Chroma at 0x1157524a0>

In [13]:
# Question for the retrieval QA chain. The prompt itself does not contain the
# product title; the chain retrieves it from the vector store.
prompt = """
Given this product title, please select between 3 and 6 criteria to rate in order to compose a product review.
"""
answer = run_qa(doc_search, prompt, OPEN_API_KEY)

# Show the product next to the criteria the model proposed
print(f"Product: {PRODUCT_INPUT}\n")
print(f"Some rating criteria:\n{answer}")

Running QA... ✅
Product: Maybelline Instant Age Rewind Eraser Dark Circles Treatment Concealer

Some rating criteria:
1. Coverage (how well it conceals dark circles)
2. Longevity (how long the concealer lasts without creasing or fading)
3. Application (ease of application and blending)
4. Shade range (variety of shades available)
5. Packaging (convenience and effectiveness of the packaging)
6. Skincare benefits (any improvements in the appearance of dark circles over time)


### B) Langchain - all products and reviews

In [16]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.streaming_stdout_final_only import FinalStreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate

#### Model

Pick a model from the "Model Explorer" section on the [GPT4All page](https://gpt4all.io/index.html).

In [15]:
# GPT4All model file to use and the local path it is stored under
MODEL_NAME = 'mistral-7b-openorca.gguf2.Q4_0.gguf'  # Change here
MODEL_PATH = '../models/' + MODEL_NAME

# -C - option to continue transfer automatically (so reuse file if already downloaded)
# NOTE(review): the recorded output of a re-run shows ~27 KB still being received
# when resuming from the end of the file -- confirm this does not append data to an
# already complete model file.
!curl -C - -o {MODEL_PATH} https://gpt4all.io/models/gguf/{MODEL_NAME}
# List the models directory to check the downloaded file
!ls -lh ../models

** Resuming transfer from byte position 4108928128
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 27242    0 27242    0     0  76073      0 --:--:-- --:--:-- --:--:-- 76094
total 8025256
-rw-r--r--  1 joannerobert  staff   3.8G Mar 20 12:12 mistral-7b-openorca.gguf2.Q4_0.gguf


In [18]:
# Callback that supports token-wise streaming but will only return the final output
# rather than intermediary steps
callbacks = [FinalStreamingStdOutCallbackHandler()]

# verbose=True is required for the callback manager
llm = GPT4All(model=MODEL_PATH, callbacks=callbacks, verbose=True)
llm

GPT4All(verbose=True, callbacks=[<langchain.callbacks.streaming_stdout_final_only.FinalStreamingStdOutCallbackHandler object at 0x115753e50>], model='../models/mistral-7b-openorca.gguf2.Q4_0.gguf', client=<gpt4all.gpt4all.GPT4All object at 0x116c52200>)

#### LLM chain

| Prompts |
| :--- |
| Given this product title, please select between 3 and 6 criteria to rate in order to compose a product review. |
| Given this product title, please select between 3 and 6 criteria to rate in order to compose a product review.<br>Do not provide more criteria and don't add any more text. Do not write any review.|
| Given this product title, please produce between 3 and 6 criteria to rate in order to compose a product review.<br>The answer should be in the format:<br>1. <1st criterium><br>2. <2nd criterium><br>...|
| For this product, please produce between 3 and 6 criteria that could be rated by a user for a review. No examples needed. |
| For this product, please produce between 3 and 6 criteria that could be rated by a user for a review. No more details needed. |

In [33]:
# Best so far
original_prompt = """
For this product, please produce between 3 and 6 criteria that could be rated by a user for a review. No more details needed.
"""

template = f"""Product: '{{product_text}}'
{original_prompt}"""
prompt = PromptTemplate(template=template, input_variables=["product_text"])

In [34]:
# Create the LLM chain
llm_chain = LLMChain(prompt=prompt, llm=llm, return_final_only=True)
llm_chain

LLMChain(prompt=PromptTemplate(input_variables=['product_text'], template="Product: '{product_text}'\n\nFor this product, please produce between 3 and 6 criteria that could be rated by a user for a review. No more details needed.\n"), llm=GPT4All(verbose=True, callbacks=[<langchain.callbacks.streaming_stdout_final_only.FinalStreamingStdOutCallbackHandler object at 0x115753e50>], model='../models/mistral-7b-openorca.gguf2.Q4_0.gguf', client=<gpt4all.gpt4all.GPT4All object at 0x116c52200>))

In [35]:
%%time
# Generate criteria for the example product with the LLM chain
res = llm_chain.run(product_text=PRODUCT_INPUT)
# Note that the result needs to be printed explicitly to be shown properly since
# it contains line breaks
print(res)


1) Coverage - How well does the concealer cover up dark circles?
2) Consistency - Is it easy to apply or is it too thick/thin?
3) Longevity - Does it last throughout the day without fading, creasing, or settling into fine lines?
4) Shade Range - Are there enough shades available for different skin tones?
5) Packaging - Is the packaging user-friendly and easy to use?
6) Price - How affordable is this product compared to similar products on the market?
CPU times: user 2min 50s, sys: 1.36 s, total: 2min 51s
Wall time: 44 s


#### Retrieval QA chain

##### Prompt only

In [40]:
from langchain.embeddings import HuggingFaceEmbeddings


EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
PERSIST_DIRECTORY = '../db/chroma_3/'


def embed_texts_hg(texts, openai_api_key=None):
    """Embed documents with a HuggingFace model and index them in a persisted Chroma store.

    Args:
        texts: Documents to index (passed to `Chroma.from_documents`).
        openai_api_key: Unused; kept (now optional) so existing calls that pass
            it keep working.

    Returns:
        The Chroma vector store built from `texts`, persisted under
        `PERSIST_DIRECTORY`.
    """
    print(f"Embedding {len(texts)} texts...", end=' ')

    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    vector_db = Chroma.from_documents(
        documents=texts,  # the keyword is `documents`, not `docs`
        embedding=embedder,
        persist_directory=PERSIST_DIRECTORY,  # module-level constant; the lowercase name was undefined
    )
    print("✅")
    return vector_db

In [None]:
# chunks = get_text_chunks(PRODUCT_INPUT)
# chunks

# doc_search = embed_texts_openai(chunks, OPEN_API_KEY)
# doc_search

# def run_qa(doc_search, prompt, openai_api_key):
#     print(f"Running QA...", end=' ')

#     # Retrieval QA
#     # - chain_type="stuff": the model 'stuffs' all our texts into a single prompt (sufficiently small)
#     # - model: latest GPT-3.5-Turbo model.
#     qa = RetrievalQA.from_chain_type(
#         llm=ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=openai_api_key),
#         chain_type="stuff",
#         retriever=doc_search.as_retriever(search_kwargs={"k": 1})  # 1 doc to return max
#     )

#     answer = qa.invoke(prompt)
#     print("✅")
#     return answer["result"]
    

# prompt = """
# Given this product title, please select between 3 and 6 criteria to rate in order to compose a product review.
# """
# answer = run_qa(doc_search, prompt, OPEN_API_KEY)


In [None]:
# Loading from actual files

# def load_file(file_path):
#     print(f"Loading {file_path}...", end=' ')
#     try:
#         loader = TextLoader(file_path)
#         documents = loader.load()
#     except FileNotFoundError:
#         print(f"File not found: {file_path}")
#         return

#     # A) Recursive splitter
#     splitter = (RecursiveCharacterTextSplitter
#                 .from_language(language=Language.PYTHON, chunk_size=2000, chunk_overlap=200))
#     # B) Text splitter
#     # splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

#     texts = splitter.split_documents(documents)
#     print("✅")
#     return texts

## Results analysis


Next steps: generate outputs for many products, analyse the results to validate patterns and look out for exceptions, then derive a safe criteria extraction method.