# Load the OpenAI API key

In [2]:
import os
import openai
import sys

path = os.getcwd()
sys.path.append(path)

from dotenv import load_dotenv
# find local environment file with project config and API keys
env_file = '/environment.env'
dotenv_path = path+env_file
flag = os.path.isfile(dotenv_path)
_ = load_dotenv(dotenv_path)

openai.api_key = os.getenv('OPENAI_API_KEY')
print(openai.api_key)

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


# Parse and load the data

In [3]:
import json
import pandas as pd

In [4]:
data = []
with open ("./data/meta_Appliances.json", "r") as f:
    for l in f:
        data.append(json.loads(l.strip()))

# total length of list, this number equals total number of products
print(len(data))

30445


### convert list into pandas dataframe

In [5]:
data = pd.DataFrame.from_dict(data)

# Feature selection and Data split

In [6]:
selected_data = data[['title','description','brand','feature','main_cat','date', 'price']]
embedding_column = selected_data['title'].map(str)+selected_data['description'].map(str)+selected_data['brand'].map(str)+selected_data['feature'].map(str) \
                    +selected_data['main_cat'].map(str)+selected_data['date'].map(str)+selected_data['price'].map(str)
#type(embedding_column)
#embedding_column

In [7]:
selected_data.loc[:,'all'] = embedding_column
selected_data['all']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data.loc[:,'all'] = embedding_column


0        Tupperware Freezer Square Round Container Set ...
1        2 X Tupperware Pure &amp; Fresh Unique Covered...
2        The Cigar - Moments of Pleasure[]The Cigar Boo...
3        Caraselle 2X 50G Appliance Descalene['Multi pu...
4        EATON Wiring 39CH-SP-L Arrow Hart 1-Gang Chrom...
                               ...                        
30440    Bosch 00175338 Switch['This is an authorized a...
30441    Bosch 00478807 Panel-Facia['This is an authori...
30442    Bosch 00649288 Ice Maker['This is an authorize...
30443    Frigidaire 316543810 Knob['This is an O.E.M. A...
30444    Bosch 00674704 Pump-Drain['This is an authoriz...
Name: all, Length: 30445, dtype: object

# Embedding generation

In [8]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [9]:
from langchain.document_loaders import DataFrameLoader
loader = DataFrameLoader(selected_data,page_content_column="all")

In [10]:
docs = loader.load()

In [11]:
from langchain.vectorstores import FAISS
vectordb = FAISS.from_documents(docs[0:20000], embedding)

### save the database locally for fast access

In [12]:
vectordb.save_local("faissdb_index")

# Load the chat model

### set temperature=0 to get factual answers based on information retrieval from database

In [None]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name='gpt-4-1106-preview',temperature=0.0)

# Build the prompt

In [None]:
from langchain.prompts import PromptTemplate
#response_tone = '''American English \
#in a polite and respectful tone'''
# Prompt
template = """For the input user query, extract the following information from retrieved data: \
title, description, brand, feature, main category, date and price. \
Check if the asked information is available in this data. \
If there is no information available for the query, start the response with the text mentioned in triple backticks \
'''There is no information available for this query in the internal knowledge base. \
The answer is referred from Wikipedia page: {url}''' \
Respond to the query in American English in a polite and respectful tone. \
"""
qa_chain_prompt = PromptTemplate.from_template(template)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# using default chain_type = 'stuff' which stuffs all the relevant data into the prompt
# works well in this case as context window is much bigger than data size and we retain all relevant info
# Additionally stuff requires only ONE call to LLM which reduces overall token cost and inference is also relatively faster
qa_chain_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={'prompt':qa_chain_prompt}
)

In [None]:
query = "how many items priced close to $50"
#docs = vectordb.similarity_search_with_score(query)

In [None]:
format_query = prompt_template.format_messages(style=response_tone, text=query)

In [None]:
response = qa_chain_stuff({'query':query})

In [None]:
response['result']