In [188]:
# Install the notebook's dependencies with the %pip magic.
# NOTE(review): only azure-search-documents is version-pinned here; the other
# packages resolve to whatever release is current at install time.
%pip install openai pandas tiktoken langchain azure-identity azure-search-documents==11.4.0b8

250629.59s - pydevd: Sending message related to process being replaced timed-out after 5 seconds

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [189]:
import openai
import os
import pandas as pd
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.document_loaders import DataFrameLoader
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import AzureChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

In [190]:
# Load the accident records and keep a small sample
#
# Only the three columns used downstream are read, and only the last 2000 rows
# of the file are kept.
# NOTE(review): the original note justified 2000 rows with "the ROM for ada is
# 2100" -- presumably a quota/rate limit of the embedding deployment; confirm.
csv_path = os.path.join(os.getcwd(), 'US_Accidents_March23.csv')
accident_columns = ['Start_Time', 'Description', 'City']
df = pd.read_csv(csv_path, usecols=accident_columns).tail(2000)
df

Unnamed: 0,Start_Time,Description,City
7726394,2019-08-20 15:16:12,At Hoover Ave/Exit 150 - Accident.,Bloomfield
7726395,2019-08-20 16:09:38,At Allisonville Rd/Exit 35 - Accident. Lane bl...,Indianapolis
7726396,2019-08-20 14:54:26,At I-495/Midtown Tunnel/Exit 8 - Accident. Lan...,New York
7726397,2019-08-20 15:31:59,Between Palisades Pky/Exit 13 and RT-59/Exit 1...,Nanuet
7726398,2019-08-20 18:51:40,Closed at Dublin - Road closed due to accident.,Milton
...,...,...,...
7728389,2019-08-23 18:03:25,At Market St - Accident.,Riverside
7728390,2019-08-23 19:11:30,At Camino Del Rio/Mission Center Rd - Accident.,San Diego
7728391,2019-08-23 19:00:21,At Glassell St/Grand Ave - Accident. in the ri...,Orange
7728392,2019-08-23 19:00:21,At CA-90/Marina Fwy/Jefferson Blvd - Accident.,Culver City


In [191]:
# Normalise the raw columns
#
# Start_Time is parsed (the timestamps are parsed with format='mixed') and
# reduced to a plain YYYY-MM-DD string; trailing periods are stripped from
# Description.
parsed_start = pd.to_datetime(df['Start_Time'], format='mixed')
df['Start_Time'] = parsed_start.dt.strftime('%Y-%m-%d')

trimmed_description = df['Description'].str.rstrip('.')
df['Description'] = trimmed_description
df

Unnamed: 0,Start_Time,Description,City
7726394,2019-08-20,At Hoover Ave/Exit 150 - Accident,Bloomfield
7726395,2019-08-20,At Allisonville Rd/Exit 35 - Accident. Lane bl...,Indianapolis
7726396,2019-08-20,At I-495/Midtown Tunnel/Exit 8 - Accident. Lan...,New York
7726397,2019-08-20,Between Palisades Pky/Exit 13 and RT-59/Exit 1...,Nanuet
7726398,2019-08-20,Closed at Dublin - Road closed due to accident,Milton
...,...,...,...
7728389,2019-08-23,At Market St - Accident,Riverside
7728390,2019-08-23,At Camino Del Rio/Mission Center Rd - Accident,San Diego
7728391,2019-08-23,At Glassell St/Grand Ave - Accident. in the ri...,Orange
7728392,2019-08-23,At CA-90/Marina Fwy/Jefferson Blvd - Accident,Culver City


In [192]:
# Merge data
#
# Build one sentence per accident, "<Description> on <Start_Time> in the city
# of <City>", which becomes the text that is embedded later. The concatenation
# is done column-wise (vectorised) instead of with a row-wise `apply`, which
# converted every row to strings three times; the resulting strings are the
# same, and missing values still render as 'nan' via astype(str).
df['merged'] = (
    df['Description'].astype(str)
    + ' on '
    + df['Start_Time'].astype(str)
    + ' in the city of '
    + df['City'].astype(str)
)
# The source columns are no longer needed once the merged text exists.
df = df.drop(columns=['Description', 'City', 'Start_Time'])
df


Unnamed: 0,merged
7726394,At Hoover Ave/Exit 150 - Accident on 2019-08-2...
7726395,At Allisonville Rd/Exit 35 - Accident. Lane bl...
7726396,At I-495/Midtown Tunnel/Exit 8 - Accident. Lan...
7726397,Between Palisades Pky/Exit 13 and RT-59/Exit 1...
7726398,Closed at Dublin - Road closed due to accident...
...,...
7728389,At Market St - Accident on 2019-08-23 in the c...
7728390,At Camino Del Rio/Mission Center Rd - Accident...
7728391,At Glassell St/Grand Ave - Accident. in the ri...
7728392,At CA-90/Marina Fwy/Jefferson Blvd - Accident ...


In [193]:
# Count the tokens needed to encode every merged accident text

tokenizer = tiktoken.get_encoding("cl100k_base")


def _count_tokens(text):
    """Return the number of cl100k_base tokens in `text`."""
    return len(tokenizer.encode(text))


df['n_tokens'] = df["merged"].map(_count_tokens)

# Max tokens supported by ada v2 is 8191; longer rows are dropped.
ADA_V2_MAX_TOKENS = 8191
fits_in_model = df['n_tokens'].le(ADA_V2_MAX_TOKENS)
df = df[fits_in_model]

print(f'Number of accidents: {len(df)}')  # accidents remaining in the dataset
print(f"Number of tokens required:{df['n_tokens'].sum()}")  # total tokens to embed

Number of accidents: 2000
Number of tokens required:53153


In [207]:
# Configure Azure OpenAI
#
# Endpoint, API type and API version are not secret and are set here.
# The API key IS a secret and must never be stored in the notebook: it is
# expected to already be present in the OPENAI_API_KEY environment variable
# when the kernel starts.

os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_BASE"] = "https://us-accidents-data-demo.openai.azure.com/"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

if not os.environ.get("OPENAI_API_KEY"):
    # Fail here with an actionable message instead of a late, opaque
    # authentication error from the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export the Azure OpenAI key in the "
        "environment before starting the kernel; do not hardcode it."
    )

# Name of the embedding model.
# NOTE(review): `model` is rebound to an AzureChatOpenAI instance in a later
# cell, so this string is not what the chain ends up using.
model: str = "text-embedding-ada-002"

In [211]:
# Create vector store instance
#
# `embeddings` targets the Azure OpenAI embedding deployment; `vector_store`
# wraps the Azure Search index and embeds queries with `embeddings.embed_query`.
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(deployment="us-accidents-embedding-demo")
search_endpoint: str = "https://us-accidents-demo.search.windows.net"

# The Azure Search API key is a secret and must never be stored in the
# notebook: it is read from the AZURE_SEARCH_KEY environment variable.
search_key: str = os.environ.get("AZURE_SEARCH_KEY", "")
if not search_key:
    # Fail early with an actionable message rather than on the first request.
    raise RuntimeError(
        "AZURE_SEARCH_KEY is not set. Export the Azure Search key in the "
        "environment before starting the kernel; do not hardcode it."
    )

idx_name: str = "us-accidents-demo-idx"

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_key,
    index_name=idx_name,
    embedding_function=embeddings.embed_query,
    semantic_configuration_name="us-accidents-demo",
)

In [209]:
# Turn the DataFrame into LangChain documents
#
# The "merged" column is used as each document's page content. Nothing is
# written to the vector store in this cell; it only builds `documents`.
loader = DataFrameLoader(data_frame=df, page_content_column="merged")
documents = loader.load()

In [210]:
# Setup model and retriever
#
# `model` is the Azure OpenAI chat deployment used to answer questions.
model = AzureChatOpenAI(deployment_name="us-accidents-chat-demo", temperature=0.5)

# Embed and upload the documents through the already-configured `vector_store`.
# The previous code called the `from_documents` classmethod on the instance,
# which constructs a second, separately configured store (without the semantic
# configuration) instead of reusing this one.
vector_store.add_documents(documents)

# `db` is kept as an alias so existing references to it keep working.
db = vector_store
retriever = db.as_retriever()

InvalidRequestError: Resource not found

In [None]:
# Build the retrieval-augmented chain
#
# The question is passed through unchanged and also sent to the retriever; the
# retrieved context and the question fill the prompt, the prompt goes to the
# chat model, and the model output is parsed into a plain string.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = chain_inputs | prompt | model | StrOutputParser()

In [None]:
# Ask the chain a question; the answer string is the cell's output

question = "What happened in Boca Raton in the year 2021?"
chain.invoke(question)

'The document does not provide information on what happened in Boca Raton in the year 2021.'

In [None]:
# Run a comparable query directly against the vector store
#
# Fetches the 3 closest documents and shows the text of the best match.
search_query = "What happened in Boca Raton in the year 2021"
docs = vector_store.similarity_search(query=search_query, k=3, search_type="similarity")
docs[0].page_content

To convert this notebook to a Python script, run `jupyter nbconvert --to python rag-demo.ipynb`.