## Step 1: Set Up the Environment

In [None]:
# Force-reinstall the AWS SDK stack (boto3, AWS CLI, botocore) at or above the
# minimum versions listed below.
%pip install --no-build-isolation --force-reinstall \
    "boto3>=1.28.57" \
    "awscli>=1.29.57" \
    "botocore>=1.31.57"

In [None]:
# Install LangChain (pinned to 0.0.309) together with the vector-store, PDF,
# tokenizer and widget packages used by this notebook.
# NOTE: `-U` is a command-wide pip flag; it upgrades every package on this
# command line, not only sqlalchemy.
%pip install --quiet \
    langchain==0.0.309 \
    "transformers>=4.24,<5" \
    sqlalchemy -U \
    "faiss-cpu>=1.7,<2" \
    "pypdf>=3.8,<4" \
    pinecone-client \
    apache-beam \
    datasets \
    tiktoken \
    "ipywidgets>=7,<8" \
    matplotlib

In [None]:
# Install additional (unpinned) packages: web search, market data, LangChain
# experimental modules, pysqlite3 and the SerpAPI client.
# NOTE(review): none of these are imported in the cells of this notebook —
# confirm whether they are still required.
%pip install --quiet \
    duckduckgo-search  \
    yfinance  \
    pandas_datareader  \
    langchain_experimental \
    pysqlite3 \
    google-search-results

In [None]:
%%bash
# Refresh the apt package index and install the g++ compiler.
# Presumably needed to build a native dependency of the nemoguardrails install
# in the next cell — TODO confirm.
apt-get update && apt-get install g++ -y

In [None]:
# Install NeMo Guardrails at the pinned 0.5.0 release, bypassing the pip cache.
%pip install -qU --no-cache-dir nemoguardrails==0.5.0

# Re-apply the notebook's pins for faiss-cpu, langchain, pypdf and ipywidgets.
# Presumably this restores versions the nemoguardrails install may have
# changed — TODO confirm.
%pip install -qU "faiss-cpu>=1.7,<2" \
                      "langchain==0.0.309" \
                      "pypdf>=3.8,<4" \
                      "ipywidgets>=7,<8"

In [None]:
# Restart the kernel so that the packages installed above are the ones picked
# up by the imports that follow.
# NOTE: the `Jupyter` JavaScript object only exists in the classic Notebook UI.
# In JupyterLab / SageMaker Studio this cell does nothing; restart the kernel
# from the Kernel menu instead.
from IPython.display import HTML  # public location; IPython.core.display is deprecated for this import
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
! pip install PyMuPDF==1.22.5

In [None]:
import json
import os
import sys
import boto3
from langchain.embeddings import BedrockEmbeddings

# Bedrock runtime client in us-east-1; reused by the embeddings model here and
# by the LLM created later in the notebook.
bedrock_client = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
)

# Titan text-embeddings model, invoked through the client above.
bedrock_embeddings = BedrockEmbeddings(
    client=bedrock_client,
    model_id="amazon.titan-embed-text-v1",
)

## Step 2: Create Index with Embeddings

Create an index of the training documents' text embeddings

#### Using LangChain

In [None]:
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.vectorstores.pgvector import PGVector
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.indexes import VectorstoreIndexCreator
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import os
from tqdm import tqdm
import glob
from multiprocessing import Pool
import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector, DistanceStrategy
from typing import List, Tuple
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

In [None]:
# Maps a file extension (including the leading dot) to a
# (loader class, loader keyword arguments) pair.
# load_single_document() instantiates the entry as
# loader_class(file_path, **loader_args); load_documents() globs the source
# directory once per extension listed here.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    # Text files are always decoded as UTF-8.
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}


def load_single_document(file_path: str) -> List[Document]:
    """
    Load one file with the loader registered for its extension.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents produced by the matching loader's ``load()`` call.

    Raises:
        ValueError: If the file's extension has no entry in LOADER_MAPPING.
    """
    # splitext only inspects the final path component, so a dot in a directory
    # name is never mistaken for an extension. Lower-casing lets "REPORT.PDF"
    # resolve to the ".pdf" loader.
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in LOADER_MAPPING:
        raise ValueError(f"Unsupported file extension '{ext}'")

    loader_class, loader_args = LOADER_MAPPING[ext]
    loader = loader_class(file_path, **loader_args)
    return loader.load()

def load_documents(source_dir: str, ignored_files: List[str] = None) -> List[Document]:
    """
    Loads all documents from the source documents directory, ignoring specified files

    Args:
        source_dir: Directory searched recursively for files whose extension
            appears in LOADER_MAPPING.
        ignored_files: Paths to skip, compared against the paths returned by
            ``glob``. ``None`` (the default) or an empty list skips nothing.

    Returns:
        A flat list with the documents of every loaded file. Files are loaded
        in a process pool with ``imap_unordered``, so the order of the result
        is not guaranteed to follow the order of the files on disk.
    """
    # A set gives O(1) membership checks; `or ()` also covers an explicit None.
    ignored = set(ignored_files or ())

    filtered_files = [
        file_path
        for ext in LOADER_MAPPING
        for file_path in glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
        if file_path not in ignored
    ]

    results = []
    with Pool(processes=os.cpu_count()) as pool:
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            # Each item yielded is the list of documents for one file.
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results


In [None]:
# Read every supported file found under source_documents/.
docs = load_documents("source_documents/")

##!!! PLAY AROUND WITH CHUNKING TECHNIQUE
# See https://www.pinecone.io/learn/chunking-strategies/

# Split the loaded documents into overlapping chunks (size 1000, overlap 100).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(docs)

In [None]:
##!!! PLAY AROUND WITH EMBEDDINGS MODEL

# Embed all chunks with the Bedrock embeddings model, build a FAISS index from
# them and save it to the local "bedrock_index" folder.
db = FAISS.from_documents(
    chunks,
    bedrock_embeddings,
)
db.save_local("bedrock_index")

In [None]:
### To use a different embeddings model! ###
# NOTE: this cell rebinds `db` to the Hugging Face index.

# FOR PUBLIC MODELS
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# FOR PRIVATE MODELS
#### from langchain.embeddings import HuggingFaceHubEmbeddings
#### embeddings = HuggingFaceHubEmbeddings(model_name = 'model name', huggingfacehub_api_token='API_TOKEN')

#### from langchain.embeddings import VoyageEmbeddings
#### embeddings = VoyageEmbeddings(model='voyage-01', voyage_api_key="my-api-key")

# Build a second FAISS index from the same chunks and save it next to the
# Bedrock one.
db = FAISS.from_documents(
    chunks,
    embeddings,
)
db.save_local("huggingface_index")

In [None]:
# The embeddings do not have to be recomputed on every run: a previously saved
# index can be loaded back from disk like this.
db = FAISS.load_local(
    "bedrock_index",
    bedrock_embeddings,
)

## Step 3: LLM

In [None]:
##!!! PLAY AROUND WITH LLM

# Claude v2 served through the shared Bedrock client, with
# max_tokens_to_sample set to 200.
llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=bedrock_client,
    model_kwargs={"max_tokens_to_sample": 200},
)

In [None]:
# To use OpenAI
# from langchain.llms import OpenAI
# llm = OpenAI(model_name = "gpt-3.5-turbo", openai_api_key="YOUR_API_KEY")


In [None]:
# Sample question that the next cell sends through the retrieval QA chain.
query = "What is the square root of 16?"

In [None]:
##!!! PLAY AROUND WITH PROMPT

# Human/Assistant prompt: the retrieved chunks are substituted for {context}
# inside the <context> tags and the user's question for {question}.
prompt_template = """

H: Use the following pieces of context to provide a detailed response to the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

Question: {question}

A:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

##!!! PLAY AROUND WITH SEARCH TYPE AND HOW MANY CHUNKS (SEARCH KWARGS) TO SEND AS CONTEXT TO MODEL
# "stuff" places all retrieved chunks (k=10 here) into a single prompt.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(
        search_type="similarity", search_kwargs={"k": 10}
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)
answer = qa({"query": query})
print(answer['result'])

# Show the chunks that were passed to the model as context.
answer['source_documents']

## Step 4: Deploy

In [None]:
! pip install --upgrade gradio

In [None]:
import gradio as gr

In [None]:
##!!! PLAY AROUND WITH PROMPT

def ask_q(query, h='gradio var'):
    """
    Answer a question with the retrieval QA chain (chat callback for Gradio).

    Args:
        query: The user's question.
        h: Unused. Placeholder for the second positional argument that
            gr.ChatInterface passes to its callback (the chat history).

    Returns:
        The answer text produced by the chain (``answer['result']``).
    """
    # Assembled line by line so the function's indentation does not leak into
    # the text sent to the model.
    prompt_template = (
        "\n\n"
        "Human: Use the following pieces of context to provide a detailed response to the question at the end. "
        "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
        "<context>\n"
        "{context}\n"
        "</context>\n"
        "\n"
        "Question: {question}\n"
        "\n"
        "Assistant:"
    )

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    ##!!! PLAY AROUND WITH SEARCH TYPE AND HOW MANY CHUNKS (SEARCH KWARGS) TO SEND AS CONTEXT TO MODEL
    # The chain is rebuilt on every call, so edits to the notebook-level
    # `llm` and `db` take effect without redefining this function.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(
            search_type="similarity", search_kwargs={"k": 10}
        ),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
    answer = qa({"query": query})
    return answer['result']


In [None]:
# Example questions handed to the chat UI.
example_questions = [
    "What was the launch vehicle for the Mars GLobal Surveyor Mission?",
    "How many times did Atlantis orbit the Earth?",
    "Who piloted the first launch of the Space Shuttle Columbia?",
]

# Chat front end: every submitted message is answered by ask_q.
demo = gr.ChatInterface(
    ask_q,
    title="Amazon Bedrock and Claude",
    theme="soft",
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(
        placeholder="Ask Me a Question About NASA",
        container=False,
        scale=7,
    ),
    examples=example_questions,
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)
demo.launch()