In [4]:
import streamlit as st
import os

# Use the wide page layout for the app.
st.set_page_config(layout="wide")

with st.sidebar:
    # Directory where uploaded knowledge-base documents are stored.
    DOCS_DIR = os.path.abspath("./uploaded_docs")
    # exist_ok=True replaces the check-then-create sequence, which could fail
    # if the directory appeared between the check and the makedirs call.
    os.makedirs(DOCS_DIR, exist_ok=True)
    st.subheader("Add to the Knowledge Base")
    with st.form("my-form", clear_on_submit=True):
        uploaded_files = st.file_uploader("Upload a file to the Knowledge Base:", accept_multiple_files=True)
        submitted = st.form_submit_button("Upload!")

    if uploaded_files and submitted:
        for uploaded_file in uploaded_files:
            # The file name comes from the client: keep only its final path
            # component so it cannot point outside DOCS_DIR.
            safe_name = os.path.basename(uploaded_file.name)
            with open(os.path.join(DOCS_DIR, safe_name), "wb") as f:
                f.write(uploaded_file.read())
            # Report success only after the file has actually been written.
            st.success(f"File {uploaded_file.name} uploaded successfully!")

2024-06-02 08:28:06.760 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py [ARGUMENTS]


In [6]:
# Component #2 - Embedding model and LLM
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

# The NVIDIA AI Playground key has to be exported as NVIDIA_API_KEY before this cell runs.
llm = ChatNVIDIA(model="mixtral_8x7b")

# Two embedders on the same embedding model: the "passage" one is created
# first, then the "query" one.
document_embedder, query_embedder = (
    NVIDIAEmbeddings(model="nvolveqa_40k", model_type=embed_kind)
    for embed_kind in ("passage", "query")
)

In [7]:
# Component #3 - Vector database store

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import FAISS
import pickle

with st.sidebar:
    # Let the user choose between reusing a cached vector store and rebuilding it.
    use_existing_vector_store = st.radio("Use existing vector store if available", ["Yes", "No"], horizontal=True)

# Path to the pickled vector store cache
vector_store_path = "vectorstore.pkl"

# Check for an existing vector store file
vector_store_exists = os.path.exists(vector_store_path)
vectorstore = None
# Documents are loaded only when the store has to be (re)built, so reusing the
# cache does not re-read every file in DOCS_DIR; the list stays empty otherwise.
raw_documents = []
if use_existing_vector_store == "Yes" and vector_store_exists:
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load a cache file that this app wrote itself.
    # NOTE(review): LangChain's FAISS wrapper also provides save_local/load_local;
    # consider switching if pickling the store proves brittle - confirm against
    # the installed version.
    with open(vector_store_path, "rb") as f:
        vectorstore = pickle.load(f)
    with st.sidebar:
        st.success("Existing vector store loaded successfully.")
else:
    # Load raw documents from the directory
    raw_documents = DirectoryLoader(DOCS_DIR).load()
    with st.sidebar:
        if raw_documents:
            with st.spinner("Splitting documents into chunks..."):
                text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
                documents = text_splitter.split_documents(raw_documents)

            with st.spinner("Adding document chunks to vector database..."):
                vectorstore = FAISS.from_documents(documents, document_embedder)

            # Persist the freshly built store so later runs can reuse it.
            with st.spinner("Saving vector store"):
                with open(vector_store_path, "wb") as f:
                    pickle.dump(vectorstore, f)
            st.success("Vector store created and saved.")
        else:
            st.warning("No documents available to process!", icon="⚠️")


In [8]:
# コンポーネント#3 - ベクターデータベースストア
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import FAISS
import pickle

# NOTE(review): this cell repeats the vector-store setup of the preceding cell;
# one of the two copies can probably be removed.
with st.sidebar:
    # Option for using an existing vector store.
    # The explicit key keeps this radio distinct from the identical radio
    # created by the preceding cell; Streamlit rejects two widgets with the
    # same label and options unless they have different keys.
    use_existing_vector_store = st.radio(
        "Use existing vector store if available",
        ["Yes", "No"],
        horizontal=True,
        key="use_existing_vector_store_component3",
    )

# Path to the vector store file
vector_store_path = "vectorstore.pkl"

# Check for existing vector store file
vector_store_exists = os.path.exists(vector_store_path)
vectorstore = None
# Filled only when the store is rebuilt; reusing the cache skips the document
# load entirely, so the list stays empty in that case.
raw_documents = []
if use_existing_vector_store == "Yes" and vector_store_exists:
    # SECURITY: unpickling runs arbitrary code from the file - only load a
    # cache that this app produced.
    with open(vector_store_path, "rb") as f:
        vectorstore = pickle.load(f)
    with st.sidebar:
        st.success("Existing vector store loaded successfully.")
else:
    # Load raw documents from the directory (needed only for a rebuild)
    raw_documents = DirectoryLoader(DOCS_DIR).load()
    with st.sidebar:
        if raw_documents:
            with st.spinner("Splitting documents into chunks..."):
                text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
                documents = text_splitter.split_documents(raw_documents)

            with st.spinner("Adding document chunks to vector database..."):
                vectorstore = FAISS.from_documents(documents, document_embedder)

            # Write the new store to disk so it can be reused next time.
            with st.spinner("Saving vector store"):
                with open(vector_store_path, "wb") as f:
                    pickle.dump(vectorstore, f)
            st.success("Vector store created and saved.")
        else:
            st.warning("No documents available to process!", icon="⚠️")

In [9]:
############################################
# Component #4 - LLM Response Generation and Chat
############################################
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

st.subheader("Chat with your AI Assistant, Envie!")

# The conversation is kept in st.session_state under "messages".
if "messages" not in st.session_state:
    st.session_state.messages = []

# Re-render every message stored so far.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

prompt_template = ChatPromptTemplate.from_messages(
    [("system", "You are a helpful AI assistant named Envie. You will reply to questions only based on the context that you are provided. If something is out of context, you will refrain from replying and politely decline to respond to the user."), ("user", "{input}")]
)
user_input = st.chat_input("Can you tell me what NVIDIA is known for?")
llm = ChatNVIDIA(model="mixtral_8x7b")

# Prompt -> model -> plain-string output.
chain = prompt_template | llm | StrOutputParser()

if user_input and vectorstore is not None:
    st.session_state.messages.append({"role": "user", "content": user_input})
    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(user_input)
    with st.chat_message("user"):
        st.markdown(user_input)

    # Concatenate the retrieved chunks, each followed by a blank line.
    context = "".join(doc.page_content + "\n\n" for doc in docs)

    augmented_user_input = "Context: " + context + "\n\nQuestion: " + user_input + "\n"

    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""

        # Update the placeholder as chunks arrive; the trailing block
        # character is shown only while the answer is still streaming.
        for response in chain.stream({"input": augmented_user_input}):
            full_response += response
            message_placeholder.markdown(full_response + "▌")
        message_placeholder.markdown(full_response)
    st.session_state.messages.append({"role": "assistant", "content": full_response})
elif user_input:
    # Without a vector store there is no context to answer from; tell the
    # user instead of silently dropping the question.
    st.warning("No vector store is available yet. Upload documents to the Knowledge Base first.", icon="⚠️")


2024-06-02 08:31:43.575 Session state does not function when running a script without `streamlit run`


AttributeError: st.session_state has no attribute "messages". Did you forget to initialize it? More info: https://docs.streamlit.io/library/advanced-features/session-state#initialization

In [18]:
# 必要なライブラリのインポート
import os
import pickle
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Configuration
DOCS_DIR = os.path.abspath("./uploaded_docs")
# exist_ok=True replaces the check-then-create sequence and its race.
os.makedirs(DOCS_DIR, exist_ok=True)

# Document upload: in Jupyter, place the files into DOCS_DIR manually
# (or use an appropriate Jupyter notebook upload widget).

# Embedding model and LLM
llm = ChatNVIDIA(model="mixtral_8x7b")
document_embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type="passage")
query_embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type="query")

# Vector store cache file
vector_store_path = "livedoor.pkl"

# Whether to reuse an existing vector store
use_existing_vector_store = "Yes"  # set this to "Yes" or "No"

vector_store_exists = os.path.exists(vector_store_path)
vectorstore = None
# Documents are loaded only when the store has to be (re)built; the list stays
# empty when a valid cache is reused.
raw_documents = []
if use_existing_vector_store == "Yes" and vector_store_exists:
    # SECURITY: pickle.load can execute arbitrary code from the file; only
    # load files produced by this notebook.
    with open(vector_store_path, "rb") as f:
        cached_store = pickle.load(f)
    # The same file name is also written by the JSONL-to-pickle conversion cell
    # (a plain list), so make sure the object can actually act as a vector
    # store before using it.
    if hasattr(cached_store, "as_retriever"):
        vectorstore = cached_store
        print("既存のベクトルストアを正常にロードしました。")
    else:
        print("既存のファイルはベクトルストアではありません。再構築します。")

if vectorstore is None:
    # Load the documents.
    # NOTE(review): the recorded run shows DirectoryLoader raising ValueError
    # on a .pkl file inside DOCS_DIR - keep only parseable documents there.
    raw_documents = DirectoryLoader(DOCS_DIR).load()
    if raw_documents:
        print("ドキュメントをチャンクに分割しています...")
        text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
        documents = text_splitter.split_documents(raw_documents)

        print("ドキュメントチャンクをベクトルデータベースに追加しています...")
        vectorstore = FAISS.from_documents(documents, document_embedder)

        print("ベクトルストアを保存しています...")
        with open(vector_store_path, "wb") as f:
            pickle.dump(vectorstore, f)
        print("ベクトルストアが作成され保存されました。")
    else:
        print("処理可能なドキュメントがありません！")

# LLM response generation and chat
def chat_with_envie(user_input):
    """Answer ``user_input`` using context retrieved from the vector store.

    Reads the module-level ``vectorstore`` and ``llm``.

    Args:
        user_input: The user's question.

    Returns:
        The complete response text, or ``None`` when ``user_input`` is empty
        or no vector store is loaded.
    """
    # Check first, so no prompt or chain is built when there is nothing to answer.
    if not user_input or vectorstore is None:
        return None

    prompt_template = ChatPromptTemplate.from_messages(
        [("system", "You are a helpful AI assistant named Envie. You will reply to questions only based on the context that you are provided. If something is out of context, you will refrain from replying and politely decline to respond to the user."), ("user", "{input}")]
    )

    # Prompt -> model -> plain-string output.
    chain = prompt_template | llm | StrOutputParser()

    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(user_input)

    # Concatenate the retrieved chunks, each followed by a blank line.
    context = "".join(doc.page_content + "\n\n" for doc in docs)

    augmented_user_input = "Context: " + context + "\n\nQuestion: " + user_input + "\n"

    # Collect the streamed chunks into one string.
    return "".join(chain.stream({"input": augmented_user_input}))

# Question to send to the assistant
user_input = "Can you tell me what NVIDIA is known for?"

# Run the chat and print the answer
response = chat_with_envie(user_input)
print("Envie:", response)


Error loading file /kqi/git/uploaded_docs/livedoor.pkl


ValueError: Invalid file /kqi/git/uploaded_docs/livedoor.pkl. The FileType.UNK file type is not supported in partition.

In [17]:
import json
import pickle

# Path to the JSONL file
jsonl_file_path = './uploaded_docs/livedoor.jsonl'

# Path to the pickle file
# NOTE(review): another cell uses "livedoor.pkl" as its vector-store cache path,
# while this cell writes a plain list of records there - confirm that sharing
# the file name is intended.
pickle_file_path = 'livedoor.pkl'

# Read the JSONL file: one JSON document per non-blank line.
# The encoding is given explicitly so decoding does not depend on the platform
# default, and blank lines are skipped because json.loads rejects them.
with open(jsonl_file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Save to the pickle file
with open(pickle_file_path, 'wb') as f:
    pickle.dump(data, f)

print("jsonlファイルをpickleファイルに変換しました。")


jsonlファイルをpickleファイルに変換しました。
