# Load the OpenAI API key

In [1]:
import os
import openai
import sys

path = os.getcwd()
sys.path.append(path)

from dotenv import load_dotenv
# find local environment file with project config and API keys
env_file = '/environment.env'
dotenv_path = path+env_file
flag = os.path.isfile(dotenv_path)
_ = load_dotenv(dotenv_path)

openai.api_key = os.getenv('OPENAI_API_KEY')
print(openai.api_key)

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


# Parse and load the data

In [2]:
import json
import pandas as pd

In [3]:
data = []
with open ("./data/meta_Appliances.json", "r") as f:
    for l in f:
        data.append(json.loads(l.strip()))

# total length of list, this number equals total number of products
print(len(data))

30445


### convert list into pandas dataframe

In [4]:
data = pd.DataFrame.from_dict(data)

# Feature selection and Data split

In [5]:
selected_data = data[['title','description','brand','feature','main_cat','date', 'price']]
embedding_column = "Title: " + selected_data['title'].map(str) + "; Description: " + selected_data['description'].map(str)+ "; Brand: " + selected_data['brand'].map(str)+ \
"; Feature: "+ selected_data['feature'].map(str) + "; Main Category: "+ selected_data['main_cat'].map(str) + "; Date: "+ selected_data['date'].map(str) + \
"; Price: " + selected_data['price'].map(str)

In [6]:
selected_data.loc[:,'all'] = embedding_column
selected_data['all']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data.loc[:,'all'] = embedding_column


0        Title: Tupperware Freezer Square Round Contain...
1        Title: 2 X Tupperware Pure &amp; Fresh Unique ...
2        Title: The Cigar - Moments of Pleasure; Descri...
3        Title: Caraselle 2X 50G Appliance Descalene; D...
4        Title: EATON Wiring 39CH-SP-L Arrow Hart 1-Gan...
                               ...                        
30440    Title: Bosch 00175338 Switch; Description: ['T...
30441    Title: Bosch 00478807 Panel-Facia; Description...
30442    Title: Bosch 00649288 Ice Maker; Description: ...
30443    Title: Frigidaire 316543810 Knob; Description:...
30444    Title: Bosch 00674704 Pump-Drain; Description:...
Name: all, Length: 30445, dtype: object

# Embedding generation

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [8]:
from langchain.document_loaders import DataFrameLoader
loader = DataFrameLoader(selected_data,page_content_column="all")

In [9]:
docs = loader.load()

In [10]:
from langchain.vectorstores import FAISS
vectordb = FAISS.from_documents(docs[0:1000], embedding)

### save the database locally for fast access

In [None]:
#vectordb.save_local("faissdb_index")

# Load the chat model

### set temperature=0 to get factual answers based on information retrieval from database

In [11]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name='gpt-4-1106-preview',temperature=0.0)

# Build the prompt

In [14]:
from langchain.prompts import PromptTemplate
# Prompt with guardrails
template = """You are a chatbot having a conversation with a human. \
Use the following pieces of context (given inside triple square braces) to answer the user's question. \
User's question: {question}

context: [[[{context}]]].\

Instructions:
-> If you don't know the answer, just say that you don't know, don't try to make up an answer. \
-> Respond to the query in American English in a polite and respectful tone. \
-> Do not add anything else to the response. \
-> If there is no information available relevant to the query, just give the response mentioned in triple backticks. \
'''There is no information available for this query in the internal knowledge base. The answer is referred from Wikipedia page.'''\
-> Do not hallucinate
-> do not make up any information by your own.
-> Keep the sentiment of the answer as neutral.
"""
qa_chain_prompt = PromptTemplate.from_template(template)

# Context Retrieval

In [15]:
from langchain.chains import RetrievalQA

In [16]:
# using default chain_type = 'stuff' which stuffs all the relevant data into the prompt
# works well in this case as context window is much bigger than data size and we retain all relevant info
# Additionally stuff requires only ONE call to LLM which reduces overall token cost and inference is also relatively faster
qa_chain_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={'prompt':qa_chain_prompt}
)

# Ask your QUERY HERE !!!

In [17]:
query = "how many items are priced close to $50"

In [18]:
response = qa_chain_stuff({'query':query})

# ANSWER generated by LLM based on context from RAG

In [19]:
response['result']

'Based on the context provided, there is one item priced close to $50:\n\nTitle: 2 Pack of 1051 HUMIDIFIER AIR FILTER; Price: $44.98'

# Manual Evaluation

In [20]:
import langchain
langchain.debug = True

In [21]:
examples = [
    {
        "query": "What is the cost of The Cigar - Moments of Pleasure?",
        "answer": "$150.26"
    },
    {
        "query": "Which main category does ice maker fall under?",
        "answer": "Appliances"
    },
    {
        "query": "Describe Coleman Cable?",
        "answer": "Coleman Cable 09155 10/4-Gauge SRDT 30-Amp Dryer Power Supply Cord, 5-Feet, 4-Wire, 125/250V. \
        Three conductor flat cord. Thick vinyl insulation with right male plug. Strain relief clamp helps prevent cord damage. \
        Made in USA,UL Listed. 10 Gauge If you have immediate questions about application, installation, troubleshooting, or a damaged component, \
        please call CCI Consumer product hotline at 1-800-561-4321 or email questions to: CCI.ConsumerSupport@southwire.com. \
        The Coleman Cable (R) brand is a registered trademark of Coleman Cable Inc."
    },
    {    # edge case as many products from Rival with numerals in model name. algo needs to give more weightage to rice cooker
        "query": "What are the features of Rival RC165 rice cooker?",
        "answer": "Non-stick removable bowl for easy clean up, External steaming basket,Tempered glass lid,Measuring cup and rice ladle included"
    }
]

### Analyze the context and propmt manually in debug mode

In [22]:
qa_chain_stuff({'query':examples[1]["query"]})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Which main category does ice maker fall under?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Which main category does ice maker fall under?",
  "context": "Title: Franklin Chef Stainless Steel Ice Maker; Description: []; Brand: Franklin Chef; Feature: []; Main Category: Appliances; Date: June 26, 2008; Price: \n\nTitle: GE IM4A Icemaker; Description: ['Electronic sensor detects the level of water in the mold and provides consistent cube size. Water temperature is electronically monitored, resulting in faster ice production. - Manufacturer: GE - Country of Manufacture: United States - Manufacturer Part Number: IM4A.']; Brand: General Electri

{'query': 'Which main category does ice maker fall under?',
 'result': 'The main category that an ice maker falls under is Appliances.',
 'source_documents': [Document(page_content='Title: Franklin Chef Stainless Steel Ice Maker; Description: []; Brand: Franklin Chef; Feature: []; Main Category: Appliances; Date: June 26, 2008; Price: ', metadata={'title': 'Franklin Chef Stainless Steel Ice Maker', 'description': [], 'brand': 'Franklin Chef', 'feature': [], 'main_cat': 'Appliances', 'date': 'June 26, 2008', 'price': ''}),
  Document(page_content="Title: GE IM4A Icemaker; Description: ['Electronic sensor detects the level of water in the mold and provides consistent cube size. Water temperature is electronically monitored, resulting in faster ice production. - Manufacturer: GE - Country of Manufacture: United States - Manufacturer Part Number: IM4A.']; Brand: General Electric; Feature: ['Ge Icemaker', 'Manufacturer: Ge', 'Manufacturer Part Number: im4a']; Main Category: Amazon Home; Dat

# LLM assisted evaluation

### compare the 

In [23]:
predictions = {}
for i,_ in enumerate(examples):
    predictions[i] = qa_chain_stuff({'query':examples[i]["query"]})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is the cost of The Cigar - Moments of Pleasure?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is the cost of The Cigar - Moments of Pleasure?",
  "context": "Title: The Cigar - Moments of Pleasure; Description: []; Brand: The Cigar Book; Feature: []; Main Category: Amazon Home; Date: ; Price: $150.26\n\nTitle: Venta #6014035 3PK 1.76OZ Fragrances; Description: ['3 Pack, 1.76 OZ, Venta Combination Fragrance, Contains 1 Bottle Each Of Orange, Relaxing & Anti Cold Fragrances, Orange Fragrance Is Refreshing & Stimulating, Relaxing Fragrance Relieves The Tensions Of The Day & Anti Cold Fragrance Is Uplifting When There Are Colds About.

In [25]:
from langchain.evaluation.qa import QAEvalChain
eval_chain = QAEvalChain.from_llm(llm)

In [26]:
graded_outputs = eval_chain.evaluate(examples, predictions)

[32;1m[1;3m[chain/start][0m [1m[1:chain:QAEvalChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "query": "What is the cost of The Cigar - Moments of Pleasure?",
      "answer": "$150.26",
      "result": "The cost of \"The Cigar - Moments of Pleasure\" is $150.26."
    },
    {
      "query": "Which main category does ice maker fall under?",
      "answer": "Appliances",
      "result": "The main category that an ice maker falls under is Appliances."
    },
    {
      "query": "Describe Coleman Cable?",
      "answer": "Coleman Cable 09155 10/4-Gauge SRDT 30-Amp Dryer Power Supply Cord, 5-Feet, 4-Wire, 125/250V.         Three conductor flat cord. Thick vinyl insulation with right male plug. Strain relief clamp helps prevent cord damage.         Made in USA,UL Listed. 10 Gauge If you have immediate questions about application, installation, troubleshooting, or a damaged component,         please call CCI Consumer product hotline at 1-800-561-4321 or email qu

In [27]:
graded_outputs[0]

{'results': 'CORRECT'}

In [31]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + examples[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Predicted Grade: " + graded_outputs[i]['results'])
    print()

Example 0:
Question: What is the cost of The Cigar - Moments of Pleasure?
Real Answer: $150.26
Predicted Answer: The cost of "The Cigar - Moments of Pleasure" is $150.26.
Predicted Grade: CORRECT

Example 1:
Question: Which main category does ice maker fall under?
Real Answer: Appliances
Predicted Answer: The main category that an ice maker falls under is Appliances.
Predicted Grade: CORRECT

Example 2:
Question: Describe Coleman Cable?
Real Answer: Coleman Cable 09155 10/4-Gauge SRDT 30-Amp Dryer Power Supply Cord, 5-Feet, 4-Wire, 125/250V.         Three conductor flat cord. Thick vinyl insulation with right male plug. Strain relief clamp helps prevent cord damage.         Made in USA,UL Listed. 10 Gauge If you have immediate questions about application, installation, troubleshooting, or a damaged component,         please call CCI Consumer product hotline at 1-800-561-4321 or email questions to: CCI.ConsumerSupport@southwire.com.         The Coleman Cable (R) brand is a registered tr