<a href="https://colab.research.google.com/github/kutyadog/ai_notebooks/blob/main/LangChainChatGPT_7_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Start it up

In [None]:
!pip install openai langchain youtube_transcript_api chromadb unstructured tiktoken

In [None]:
import openai
import os

from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import YoutubeLoader

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

from langchain.document_loaders import GoogleDriveLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
from google.colab import userdata

# Read the 'OPENAI_API_KEY' value from Colab's userdata store and export it as an
# environment variable, so the key itself is never written into the notebook.
os.environ.update(OPENAI_API_KEY=userdata.get("OPENAI_API_KEY"))

In [None]:
# Download the Google Drive file with this id and save it as formatted_articles.csv.
# NOTE(review): no cell visible in this notebook reads formatted_articles.csv —
# confirm the download is still needed.
!gdown 1aK7p7ZlrX-QD-WWguBPUHfPX5WP-HBy1 -O formatted_articles.csv

Downloading...
From: https://drive.google.com/uc?id=1aK7p7ZlrX-QD-WWguBPUHfPX5WP-HBy1
To: /content/formatted_articles.csv
  0% 0.00/1.93M [00:00<?, ?B/s]100% 1.93M/1.93M [00:00<00:00, 186MB/s]


# Option 1: Load documents with DirectoryLoader

In [None]:
# NOTE(review): UnstructuredFileLoader is imported but not referenced in this cell.
from langchain.document_loaders import UnstructuredFileLoader

# Select mydata/data.txt via the glob (the base path is left empty, so the pattern
# is presumably resolved against the working directory), then load the file and
# split it into chunks.
loader = DirectoryLoader(path="", glob="mydata/data.txt")
txt_docs = loader.load_and_split()



In [None]:
# Option 1 (txt files): embed the chunks, index them, and build the QA chain.

# Embedding model used to vectorise each chunk.
embeddings = OpenAIEmbeddings()

# Index the chunks in Chroma under a dedicated collection. Without an explicit
# collection_name every Chroma.from_documents call in this notebook writes to the
# same default collection; depending on the chromadb version, in-memory stores
# created in one session can share it, which would mix the options' documents.
txt_docsearch = Chroma.from_documents(
    txt_docs,
    embeddings,
    collection_name="option1_txt",
)

# Chat model that writes the answer.
llm = ChatOpenAI(model_name="gpt-4", temperature=0.2)

# Question-answering chain on top of the vector store's retriever.
# If answers are cut off or a token-limit error appears, try a different chain_type.
qa_txt = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=txt_docsearch.as_retriever(),
)

In [None]:
# Ask the current qa_txt chain a question; as the cell's last expression, the
# answer is displayed below the cell.
query = "What is my daughters name?"
qa_txt.run(query)



"Your daughter's name is Fenna."

# Option 2: Load documents (or transcribe) with YoutubeLoader

In [None]:
# @title Option 2: Load documents (or transcribe) with YoutubeLoader
# NOTE(review): "XZY" looks like a placeholder — replace it with the id of the
# video whose English ("en") transcript should be loaded.
loader = YoutubeLoader(video_id="XZY", language="en")
# Load the transcript and split it into chunks for embedding.
yt_docs = loader.load_and_split()

In [None]:
# Option 2 (YouTube transcript): embed the chunks, index them, and build the QA chain.

embeddings = OpenAIEmbeddings()

# Use a dedicated collection so the transcript chunks are not written to the
# default collection that the other options' Chroma.from_documents calls would
# otherwise also use (in-memory stores can share it, depending on the chromadb version).
yt_docsearch = Chroma.from_documents(
    yt_docs,
    embeddings,
    collection_name="option2_youtube",
)

# Chat model that writes the answer.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

# Question-answering chain on top of the transcript store's retriever.
qa_yt = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=yt_docsearch.as_retriever(),
)

In [None]:
# Ask the YouTube-transcript chain a question; the answer is displayed as the
# cell's last expression.
query = "What is the key message of this document?"
qa_yt.run(query)

# Option 3: Load every file in a directory as plain text (DirectoryLoader + TextLoader)

In [None]:
# NOTE(review): UnstructuredFileLoader is imported but not referenced in this cell.
from langchain.document_loaders import (
    DirectoryLoader,
    TextLoader,
    UnstructuredFileLoader,
)

# Collect the files under /content/mydata that match '**/*.*', read each one with
# TextLoader while showing a progress bar, then split the documents into chunks.
loader = DirectoryLoader(
    path='/content/mydata',
    glob='**/*.*',
    loader_cls=TextLoader,
    show_progress=True,
)
txt_docs = loader.load_and_split()

# DirectoryLoader(DRIVE_FOLDER, glob='**/*.json', show_progress=True, loader_cls=TextLoader)


100%|██████████| 1/1 [00:00<00:00, 43.21it/s]


In [None]:
# Option 3 (directory of text files): embed the chunks, index them, and build the
# QA chain. This rebinds txt_docsearch and qa_txt.

# Embedding model used to vectorise each chunk.
embeddings = OpenAIEmbeddings()

# Index the chunks in Chroma under a dedicated collection. Without an explicit
# collection_name every Chroma.from_documents call in this notebook writes to the
# same default collection; depending on the chromadb version, in-memory stores
# created in one session can share it, which would mix the options' documents.
txt_docsearch = Chroma.from_documents(
    txt_docs,
    embeddings,
    collection_name="option3_directory",
)

# Chat model that writes the answer.
llm = ChatOpenAI(model_name="gpt-4", temperature=0.2)

# Question-answering chain on top of the vector store's retriever.
# If answers are cut off or a token-limit error appears, try a different chain_type.
qa_txt = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=txt_docsearch.as_retriever(),
)

In [None]:
# Question for the most recently built qa_txt chain. The commented lines are
# alternative questions; uncomment one (and comment out the active one) to try it.
# query = "What is my daughters name?"
# query = "My LH is 2 times what my FSH is. What could that mean?"
query = "How can I change my 401k?"
qa_txt.run(query)

"You can make changes to your 401(k) at any time. Here are some of the changes you can make:\n\n1. Changing your savings rate: Go to GuidePost > Okta apps > Vanguard. Once you are logged in to your account in the plan, select MANAGE MY MONEY > Change My Paycheck Deduction. \n\n2. Adjusting your investments: To change how your future contributions are invested, go to GuidePost > Okta apps > Vanguard. Once you're logged in select MANAGE MY MONEY > Change my investments > Change paycheck investment mix.\n\n3. Designating beneficiaries: To designate or edit your beneficiaries, go to GuidePost > Okta apps > Vanguard. Once you’re logged in to your account in the plan, select MENU > My Profile > Beneficiaries.\n\n4. Rolling over money from a previous employer’s plan or an IRA into your 401(k): First, contact the current holder of your account and have them send you a rollover check made payable to Vanguard Fiduciary Trust Co., for the benefit of [your name]. Then, go to GuidePost > Okta apps 

# Gradio interface

In [None]:
!pip install gradio

In [None]:
import gradio as gr

# from openai.embeddings_utils import cosine_similarity

def interface_ask_question(question):
  """Answer a question coming from the Gradio UI.

  Args:
    question: The question text passed in by the interface.

  Returns:
    The result of running the notebook-level ``qa_txt`` chain on ``question``.
  """
  answer = qa_txt.run(question)
  return answer


  # '<a href="'+ urls[0] +'" target="_blank">'+urls[0]+'</a>'

# Minimal UI: one text input wired to interface_ask_question, one text output.
demo = gr.Interface(
    fn=interface_ask_question,
    inputs=["text"],
    outputs=["text"],
)

# share=True serves the app on a public URL as well.
# NOTE(review): anyone with that link can query the indexed documents — confirm
# that is intended before sharing.
demo.launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://663021d8506beb9a6a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


dataframe

In [None]:
import gradio as gr
def flip_text(x):
    """Return ``x`` in reverse order (e.g. ``"abc"`` -> ``"cba"``)."""
    backwards = slice(None, None, -1)
    return x[backwards]


def flip_image(x):
    """Mirror an image left-to-right.

    Args:
        x: Array-like image with at least two dimensions, or ``None``
            (presumably what the UI passes when no image was provided).

    Returns:
        The horizontally flipped array, or ``None`` when ``x`` is ``None``.
    """
    # numpy is not imported in any visible cell of this notebook, so import it
    # here; otherwise the call below fails with NameError when the button is clicked.
    import numpy as np

    # Nothing to flip: hand None back instead of letting np.fliplr raise.
    if x is None:
        return None
    return np.fliplr(x)

# demo = gr.Interface(
#     fn=interface_ask_question,
#     # inputs=["text", "checkbox", gr.Slider(0, 100)],
#     inputs=["text"],
#     # outputs=["text", "number"],
#     outputs = ['text', 'html']
#     # , 'dataframe'
# )

# Tabbed UI: a question/answer tab backed by interface_ask_question and an
# image tab backed by flip_image.
with gr.Blocks() as demo:
    gr.Markdown("Ask GuidePost chatbot.")

    # Question box and answer box side by side, trigger button underneath.
    with gr.Tab("Question / Answer"):
        with gr.Row():
            text_input = [gr.Textbox()]
            answer_text_output = [gr.Textbox()]
        ask_button = gr.Button("Ask")

    # Input image and flipped image side by side, trigger button underneath.
    with gr.Tab("Data"):
        with gr.Row():
            image_input = gr.Image()
            image_output = gr.Image()
        image_button = gr.Button("Flip")

    with gr.Accordion("Open for More!"):
        gr.Markdown("Look at me...")

    # Wire each button to its handler.
    ask_button.click(
        fn=interface_ask_question,
        inputs=text_input,
        outputs=answer_text_output,
    )
    image_button.click(
        fn=flip_image,
        inputs=image_input,
        outputs=image_output,
    )

demo.launch()

# Load documents from Google Drive

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import GoogleDriveLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# https://www.haihai.ai/gpt-gdrive/

# Id of the Google Drive folder to read from.
folder_id = "1v8GN3i_Qsh2pyBgNeQx_f5pK0X8xfPUi"

# Restrict loading to the "document" and "sheet" file types and do not recurse
# into sub-folders.
# NOTE(review): the saved output of this cell shows a RefreshError — presumably
# the Google credentials were missing or expired; confirm the auth setup.
loader = GoogleDriveLoader(folder_id=folder_id, file_types=["document", "sheet"], recursive=False)
docs = loader.load()

RefreshError: ignored