## Packages and Environment

In [None]:
%pip install langchain
%pip install --upgrade --quiet  langchain-openai
%pip install langchainhub
%pip install BeautifulSoup4
%pip install faiss-cpu

## Data Loader

In [2]:
# from langchain_community.document_loaders import WebBaseLoader
# loader = WebBaseLoader("https://www.foxsports.com/nba/lebron-james-player-stats")
# data = loader.load()

from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter

# Load every *.txt file under data/qna/ through TextLoader, with
# autodetect_encoding switched on for each file.
loader = DirectoryLoader(
    'data/qna/',
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'autodetect_encoding': True},
)
documents = loader.load()

# Token-based chunking: chunk_size=1000 with no overlap between chunks.
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

## Convert to Vector DB

In [9]:
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings

In [10]:
# Option 2: use an Azure OpenAI account with a deployment of an embedding model.
# Settings consumed when the embeddings client is constructed: the resource
# endpoint, the name of the embedding deployment, and the API version to call.
azure_endpoint: str = "https://aimco-wus-openai-01.openai.azure.com/"
azure_deployment: str = "text-embedding-ada-002"
azure_openai_api_version: str = "2023-05-15"

In [11]:
import os
from getpass import getpass

# `azure_openai_api_key` is not assigned in any visible cell, so on a fresh
# kernel the constructor call below would fail with NameError. Resolve it here
# when it is missing: prefer the AZURE_OPENAI_API_KEY environment variable and
# fall back to an interactive prompt so the secret never lives in the notebook.
if "azure_openai_api_key" not in globals():
    azure_openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY") or getpass(
        "Azure OpenAI API key: "
    )

# Embeddings client bound to the Azure OpenAI deployment configured above.
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_deployment,
    openai_api_version=azure_openai_api_version,
    azure_endpoint=azure_endpoint,
    api_key=azure_openai_api_key,
)

In [None]:
import os

# Azure AI Search connection settings. Values are taken from the environment
# (AZURE_SEARCH_ENDPOINT / AZURE_SEARCH_ADMIN_KEY) so real credentials do not
# have to be pasted into the notebook; when a variable is unset the original
# placeholder string is kept, which preserves the previous behaviour.
vector_store_address: str = os.environ.get(
    "AZURE_SEARCH_ENDPOINT", "YOUR_AZURE_SEARCH_ENDPOINT"
)
vector_store_password: str = os.environ.get(
    "AZURE_SEARCH_ADMIN_KEY", "YOUR_AZURE_SEARCH_ADMIN_KEY"
)

In [None]:
# Name of the Azure AI Search index this notebook targets.
index_name: str = "langchain-vector-demo"

# Vector store client: queries are embedded with `embeddings.embed_query`,
# and the endpoint/key pair comes from the connection settings cell.
vector_store: AzureSearch = AzureSearch(
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
)

In [None]:
# Imported here because no other cell brings these two names into scope;
# without them this cell fails with NameError on a fresh kernel.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

# Re-split the documents loaded in the "Data Loader" section with overlapping
# character-based chunks. The original code referenced `data`, which only
# exists in the commented-out WebBaseLoader snippet; `documents` is what the
# active DirectoryLoader actually produces.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embed the chunks with the Azure OpenAI embeddings client and persist the
# resulting Chroma collection under ./chroma_db for the RAG section to reopen.
Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory="./chroma_db")

## RAG - phi-3

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA
from langchain import hub
from langchain_community.llms import Ollama

In [None]:
# Chroma is not imported by the RAG import cell, so bring it into scope here.
from langchain_community.vectorstores import Chroma

# Local LLM served through Ollama, plus the RAG prompt pulled from the hub.
model_name = "phi3:mini"
llm = Ollama(model=model_name)
prompt = hub.pull("rlm/rag-prompt")

# Reopen the persisted collection with the SAME embedding client that built it
# (`embeddings`). The original passed GPT4AllEmbeddings(), which is never
# imported and is a different embedding model from the one used to populate
# ./chroma_db, so stored vectors and query vectors would not be comparable.
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

In [None]:
# Wire the LLM to the persisted vector store: the store is exposed as a
# retriever and the hub prompt is handed to the underlying chain.
retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

In [None]:
# Ask the chain a question. `invoke` replaces calling the chain object
# directly, which LangChain has deprecated; the chain input is unchanged and
# the returned mapping is read via its "result" key in the next cell.
question = "Tell me what the stats are saying"
result = qa_chain.invoke({"query": question})

In [None]:
import pprint
# Pretty-print only the "result" entry of the chain output, using a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)
answer = result["result"]
pp.pprint(answer)