# Using Langchain, OpenAI and Chroma to build a Question Answering System based on FastAPI docs

In [3]:
## Imports
import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import UnstructuredURLLoader

from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [4]:
def get_links(url):
    """Return every page URL listed in a website's ``sitemap.xml``.

    Args:
        url: Base URL of the website, with or without a trailing slash.

    Returns:
        A list with the text of each ``<loc>`` element found in the sitemap,
        or ``None`` (after printing the HTTP status code) when the response
        status is not 200.
    """
    # Strip trailing slashes so a base URL such as 'https://example.com/'
    # does not become 'https://example.com//sitemap.xml'.
    sitemap_url = f"{url.rstrip('/')}/sitemap.xml"
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    response = requests.get(sitemap_url, timeout=30)
    if response.status_code != 200:
        print(f'Error: {response.status_code}')
        return None
    soup = BeautifulSoup(response.text, 'lxml')
    # Pretty-printed sitemaps may pad <loc> text with whitespace; strip it so
    # the values are usable as URLs.
    return [link.text.strip() for link in soup.find_all('loc')]
    
def download_pages(urls: list[str]):
    """Download the contents of the web pages at the given URLs.

    Args:
        urls: URLs of the pages to fetch.

    Returns:
        Whatever ``UnstructuredURLLoader(urls=urls).load()`` returns for
        those URLs.
    """
    return UnstructuredURLLoader(urls=urls).load()

### Load the environment variables, such as the OpenAI API key

In [6]:

# Load environment variables (e.g. the OpenAI API key) using python-dotenv.
load_dotenv()

True


### URL of the FastAPI docs; change this to any other website you want.


In [8]:

# Root of the site to index; get_links() appends '/sitemap.xml' to this value.
base_url = 'https://fastapi.tiangolo.com/'



### Get all the links from the FastAPI website and download the content of a small sample of pages (the second and third links)


In [9]:

links = get_links(base_url)
# get_links() returns None when the sitemap request fails; stop here with a
# clear message instead of a TypeError from slicing None below.
if links is None:
    raise RuntimeError(f'Could not fetch the sitemap for {base_url}')
# Only the second and third sitemap entries are downloaded; widen the slice
# (or pass `links` itself) to index more of the site.
docs = download_pages(links[1:3])




### Split all the documents into chunks of up to 1000 characters, convert them into embeddings using OpenAI embeddings, and store them in a Chroma vector store


In [11]:


# Break the downloaded pages into chunks before embedding them.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
split_docs = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI and store the vectors in the 'webdata'
# Chroma collection, kept under the 'index_data' directory.
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    collection_name='webdata',
    persist_directory='index_data',
)
db.persist()


### Load the persisted vector store and run a similarity search

In [13]:


# Open the 'webdata' collection from the persist directory, using the same
# embedding function that was used to build it.
db = Chroma(collection_name='webdata',
            embedding_function=embeddings,
            persist_directory='index_data/')
query = "What is async code?"

# Fetch the 3 chunks most similar to the query. The result gets its own name
# so that `docs` (the downloaded pages) is not overwritten and the splitting
# cell stays safe to re-run.
similar_docs = db.similarity_search(query, k=3)


### Use a language model along with the FastAPI docs to explore more.

In [None]:

# Chat model that writes the answers.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.0)
# Question-answering chain that uses the Chroma store as its retriever.
qa = RetrievalQA.from_chain_type(llm=llm,
                                 retriever=db.as_retriever(),
                                 chain_type='stuff')
# The indexed pages are the FastAPI docs, so ask about FastAPI (the original
# query said "fastai", which is a different library).
qa.run('How is async used in FastAPI?')


Hope you found it useful.