<a href="https://colab.research.google.com/github/kutyadog/ai_notebooks/blob/main/chatbot_gp_simple_10_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Start it all up!

In [None]:
# Install the packages this notebook uses. Each `!` line runs as a shell command.
# Only InstructorEmbedding is pinned to a version; the other packages are unpinned.
!pip install openai langchain chromadb
!pip install sentence-transformers
!pip install -qqq InstructorEmbedding==1.0.1 --progress-bar off
!pip install -q gradio
# Disabled alternative: install chromadb pinned to 0.4.5.
# !pip install -qqq chromadb==0.4.5 --progress-bar off

In [None]:
# from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders.csv_loader import CSVLoader
import os
import json
import torch

# Setup
from google.colab import userdata

# Copy the OpenAI key from Colab's user secrets into the process environment.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Directory used as the Chroma persist directory by later cells.
# exist_ok=True keeps this cell idempotent: re-running it no longer errors
# with "File exists" the way a bare `mkdir` does.
os.makedirs("chroma_db", exist_ok=True)

In [None]:
# large originally processed here:
# https://colab.research.google.com/drive/1ti0oVUwEsecwBFTARLD5I3xPf-f2O9dn#scrollTo=eQlQ4ZWgcXDW

# Colab form fields read by the embedding-selection cell:
#   Embedding_Model      - model name passed to the embedding class
#   Embedding_Func_Type  - which embedding class to instantiate
# NOTE(review): presumably the model must be loadable by the chosen class — confirm the pairing.
Embedding_Model = 'hkunlp/instructor-large' # @param ["hkunlp/instructor-large", "all-MiniLM-L6-v2", "3rd option"] {allow-input: true}
Embedding_Func_Type = 'HuggingFaceInstructEmbeddings' # @param ["HuggingFaceInstructEmbeddings", "SentenceTransformerEmbeddings", "3rd option"]


In [None]:
# Instantiate the embedding function named by Embedding_Func_Type, using
# Embedding_Model as the model name.
if Embedding_Func_Type == 'HuggingFaceInstructEmbeddings':
    embedding_function = HuggingFaceInstructEmbeddings(
        model_name=Embedding_Model, model_kwargs={"device": DEVICE}
    )
elif Embedding_Func_Type == 'SentenceTransformerEmbeddings':
    embedding_function = SentenceTransformerEmbeddings(model_name=Embedding_Model)
else:
    # Fail fast on an unrecognised selection. Without this branch the cell
    # would finish silently and leave `embedding_function` undefined (or still
    # bound to a value from an earlier run), which only surfaces later as a
    # confusing error or a mismatched-embedding search.
    raise ValueError(
        f"Unsupported Embedding_Func_Type: {Embedding_Func_Type!r}. "
        "Expected 'HuggingFaceInstructEmbeddings' or 'SentenceTransformerEmbeddings'."
    )



# Load 'formatted_articles.csv' so it can be split and embedded into the DB

In [None]:
# Download the articles file by its id with gdown and save it as formatted_articles.csv.
!gdown 1aK7p7ZlrX-QD-WWguBPUHfPX5WP-HBy1 -O formatted_articles.csv

In [None]:
# Read the downloaded CSV with LangChain's CSVLoader; `documents` is the input
# to the text splitter in the next cell.
loader = CSVLoader(file_path="formatted_articles.csv")
documents = loader.load()

In [None]:
# split it into chunks
# Target chunk size is 1500 characters with a 200-character overlap between
# neighbouring chunks.
# NOTE(review): CharacterTextSplitter only splits on its configured separator
# (none is passed here, so the library default applies) — confirm the CSV text
# contains that separator, otherwise chunks may exceed chunk_size.
text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
docs = text_splitter.split_documents(documents)



In [None]:
# load docs into Chroma DB
# No persist_directory is passed here, so this call does not target ./chroma_db.
db = Chroma.from_documents(docs, embedding_function)

In [None]:
# save to disk
# This calls Chroma.from_documents a second time on the same `docs` and
# `embedding_function`, this time with ./chroma_db as the persist directory.
# NOTE(review): presumably this re-embeds every chunk — consider building the
# store once if the duplicate work is not intended.
db2 = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")


# Load already processed db_data into DB
For testing, there are two pre-processed DB files that can be imported. Because each one was split and embedded with a different model, each one requires its own matching embedding function.

Run only one of the two cells below: either the smaller DB or the larger DB, not both.

In [None]:
# @title load smaller db file from google drive & set embedding func
# Disabled by default. To use the smaller DB instead of the larger one,
# uncomment both lines below: the download writes to chroma_db/chroma.sqlite3
# and the second line sets the embedding function to all-MiniLM-L6-v2.
# !gdown 1_Uiv4BFK1v10sLYrYXqfAskOOATzmnXL -O chroma_db/chroma.sqlite3 #smaller db file
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")



In [None]:
# @title load largest db file from google drive & set embedding func
# Download the pre-built larger database file into chroma_db/chroma.sqlite3.

!gdown 1swt3Wt5l6_2cGdrWvjSnUKYw7OwmpAT3 -O chroma_db/chroma.sqlite3 #larger db file

# Embedding function for the larger db file. It is commented out here, so
# `embedding_function` must already be defined by an earlier cell.
# NOTE(review): presumably this DB was embedded with hkunlp/instructor-large —
# confirm the active embedding_function matches before querying.
# embedding_function = HuggingFaceInstructEmbeddings(
#     model_name="hkunlp/instructor-large", model_kwargs={"device": DEVICE}
# )

Downloading...
From: https://drive.google.com/uc?id=1swt3Wt5l6_2cGdrWvjSnUKYw7OwmpAT3
To: /content/chroma_db/chroma.sqlite3
  0% 0.00/20.9M [00:00<?, ?B/s] 88% 18.4M/20.9M [00:00<00:00, 182MB/s]100% 20.9M/20.9M [00:00<00:00, 191MB/s]


In [None]:
# @title then run this
# load from disk
# Open the Chroma store persisted under ./chroma_db, using the embedding
# function currently bound in the notebook for query-time embedding.
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)

# Show a small sample as a sanity check that the store loaded. A bare
# db.get() returns every stored record and floods the cell output.
db.get(limit=3)


# Query DB for similar embeddings

In [None]:

# c845f345-e095-4469-8eca-19a1e1d34bcb <--name of dir in chroma_db dir

# query the DB
# Note: this rebinds `docs`, which the splitting cell also assigns; after this
# cell runs, `docs` holds the search results rather than the split chunks.
query = "How can i change my 401k?"
docs = db.similarity_search(query)





In [None]:
# print results
# Prints the text of the third search hit (index 2), not the top-ranked one.
print(docs[2].page_content)

context: The 401(k) plan can help you invest for a comfortable retirement. The plan has the following principal features: If eligible, you may participate in the plan immediately upon hire and, through convenient payroll deductions, you can invest a portion of your pay on a pre-tax basis before federal, state and local income taxes are imposed. You also have the option to make Roth 401(k) contributions and to convert your pre-tax and traditional after-tax assets to Roth assets. Roth 401(k) contributions do not reduce your taxes when you make the contribution, but they accumulate tax-free, and if you hold your Roth contributions in your account until maturity, the investment earnings are never taxed. You also have the option to make “traditional” after-tax contributions to the plan. Earnings on such contributions accumulate on a tax-free basis until withdrawn. (This is not the same as Roth contributions.) If you are age 50 or older, you may make additional pre-tax or Roth 401(k) contrib

# Interface

In [None]:
import gradio as gr

def ask_question(question):
  """Return the text of the stored chunk most similar to `question`.

  Args:
    question: Free-text query submitted through the Gradio textbox.

  Returns:
    The `page_content` of the first similarity-search hit, or a short
    fallback message when the search returns no hits.
  """
  # Search with the caller's question. The previous code searched the
  # notebook-level `query` variable, so every input produced the same answer.
  matches = db.similarity_search(question)
  if not matches:
    # Avoid an IndexError on an empty result list.
    return "No matching documents found."
  return matches[0].page_content

# One text input is passed to ask_question and its return value is rendered
# through the 'html' output component.
# Other component specs tried earlier:
#   inputs=["text", "checkbox", gr.Slider(0, 100)], outputs=["text", "number"]
demo = gr.Interface(fn=ask_question, inputs=["text"], outputs=["html"])

# Launch the UI with sharing enabled.
demo.launch(share=True)