In [None]:
# Install necessary Python libraries
!python3 -m pip install --upgrade langchain chromadb openai gitpython tiktoken python-magic ipywidgets

In [None]:
# Import necessary libraries
import os
import magic
import base64
import datetime
from getpass import getpass
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output
from langchain.document_loaders import TextLoader 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.router.multi_retrieval_qa import MultiRetrievalQAChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chat_models import ChatOpenAI




# Set up API keys.
# Use `or` instead of a `.get` default: a default argument is evaluated eagerly,
# which would prompt for the token even when the environment variable is already set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAi Token:")

# Nothing in this notebook passes OPENAI_API_KEY to a client explicitly, so export it
# for the OpenAI clients to pick up from the environment.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Local directories to load, as (human-readable description, path) pairs.
directories = [
    (
        "LangChain (framework for developing applications powered by language models) source code and docs",
        "/home/lg/Lab/langchain",
    ),
    (
        "My project the File Chat (jupyter notebook that let you talk with your files) source code and docs",
        "/home/lg/Lab/jupyter_notebooks/file_chat",
    ),
]

# Path fragments that exclude a location from loading.
ignored_paths = [
    ".git",
    ".github",
    "venv",
    "node_modules",
    ".lock",
    ".chroma",
    ".chroma_db",
    ".ipynb_checkpoints",
    # Add more paths to ignore here
]




In [None]:

def is_text_file(filepath):
    """Return True when the type description from `magic.from_file` contains 'text'.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        bool: whether the substring 'text' appears in the reported description.
    """
    return 'text' in magic.from_file(filepath)


In [None]:
def clean_collection_name(input_string, max_length=63):
    """Derive a collection name from a filesystem path.

    The path is lower-cased and split on "/", components that are plain stop
    words are dropped, the rest are joined with underscores, and every
    character other than ASCII letters, digits and underscores is removed.
    The result is prefixed with "db_multi_source".

    When the name would exceed ``max_length``, the *end* of the path is kept
    (it is the most specific part) and the beginning is cut. Trailing
    underscores are stripped so the name always ends with an alphanumeric
    character.

    Args:
        input_string: Path (or any "/"-separated string) to derive the name from.
        max_length: Maximum total length of the returned name, prefix included.
            Should be at least ``len("db_multi_source")``.

    Returns:
        str: the sanitized collection name.
    """
    prefix = "db_multi_source"
    stop_words = {
        "a", "an", "the", "is", "are", "am", "was", "were", "be", "being", "been",
        "to", "of", "in", "on", "at", "for", "with", "from", "by", "and", "or", "not", "but",
    }

    # Path components (not words): the input is split on "/".
    components = [part for part in input_string.lower().split("/") if part not in stop_words]

    # Sanitize before truncating so removed punctuation does not eat into the length budget.
    sanitized = "".join(
        ch for ch in "_".join(components) if ch == "_" or (ch.isascii() and ch.isalnum())
    )

    # Characters left for the path-derived part once the prefix is accounted for.
    budget = max(max_length - len(prefix), 0)
    suffix = sanitized[-budget:] if budget else ""

    # A path ending in "/" (or in punctuation) would otherwise leave a trailing underscore.
    return (prefix + suffix).rstrip("_")

In [None]:
# Chat model settings

model = "gpt-4-0314"  # name of the OpenAI chat model
timeout = 6_000  # timeout value handed to the chat client

In [None]:
def initialize_chat_openai(model, timeout):
    """Build the streaming `ChatOpenAI` client used by the QA chain.

    Args:
        model: Model name forwarded to `ChatOpenAI`.
        timeout: Timeout value forwarded to `ChatOpenAI`.

    Returns:
        A `ChatOpenAI` instance with streaming and verbose output enabled and a
        `StreamingStdOutCallbackHandler` attached.
    """
    chat_options = dict(
        model=model,
        timeout=timeout,
        streaming=True,
        callbacks=[StreamingStdOutCallbackHandler()],
        verbose=True,
    )
    return ChatOpenAI(**chat_options)

# `model` is rebound below from the configured name (a string) to the chat client.
# Remember the name separately so re-running this cell does not pass the client
# object back in as the model name.
if isinstance(model, str):
    model_name = model
model = initialize_chat_openai(model_name, timeout)


In [None]:
def load_documents(dir_path, ignored_paths):
    """Walk `dir_path` and load every text file into a list of documents.

    A directory or file is skipped when any entry of `ignored_paths` occurs as
    a substring of its path. Ignored directories are pruned from the walk, so
    their contents are never visited. Files that fail type detection or
    loading are reported on stdout and skipped; they do not abort the walk.

    Args:
        dir_path: Root directory to walk.
        ignored_paths: Iterable of path fragments; any path containing one of
            them is excluded.

    Returns:
        list: the chunks returned by `TextLoader.load_and_split()` for every
        loaded file, visited in sorted directory/file order.
    """
    def _is_ignored(path):
        # Substring match against the path, as written in `ignored_paths`.
        return any(ignored_path in path for ignored_path in ignored_paths)

    docs = []
    for dirpath, dirnames, filenames in os.walk(dir_path):
        if _is_ignored(dirpath):
            # Only the root can get here (children are pruned below); everything
            # beneath it would match too, so stop descending.
            dirnames[:] = []
            continue

        # Prune in place so os.walk never descends into ignored directories;
        # sort for a reproducible document order.
        dirnames[:] = sorted(
            d for d in dirnames if not _is_ignored(os.path.join(dirpath, d))
        )

        for file in sorted(filenames):
            filepath = os.path.join(dirpath, file)
            # Apply the ignore list to files as well (e.g. the '.lock' entry).
            if _is_ignored(filepath):
                continue
            try:
                # Type detection can fail too (broken symlink, permissions),
                # so it lives inside the same best-effort guard as loading.
                if not is_text_file(filepath):
                    continue
                loader = TextLoader(filepath, encoding='utf-8')
                docs.extend(loader.load_and_split())
            except Exception as e:
                print(f"Error loading file: {file}, error: {e}")
    return docs


In [None]:
def create_retrievers(docs, embeddings, collection_name, k=4):
    """Index `docs` in a Chroma collection and return a retriever over it.

    Args:
        docs: Documents to split and index.
        embeddings: Embedding function used to embed the chunks.
        collection_name: Name of the Chroma collection that receives the
            chunks, so each source directory gets its own collection.
        k: Number of chunks the retriever returns per query.

    Returns:
        The retriever produced by `Chroma.as_retriever`.
    """
    text_splitter = CharacterTextSplitter(chunk_size=8191, chunk_overlap=0)
    texts = text_splitter.split_documents(docs)

    # The keyword is `embedding` (singular); the collection name must be given
    # here, otherwise every directory ends up in the same default collection.
    db = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory="chroma_db",
    )

    # Retrieval options go through `search_kwargs`.
    # NOTE(review): the original passed k=20 as a top-level kwarg to as_retriever, where it
    # does not act as a search option. The default here presumably matches the value
    # that was effectively in use; 20 chunks of this size would likely exceed the
    # chat model's context window. Confirm the desired k.
    return db.as_retriever(search_kwargs={"k": k})


In [None]:
# Build one retriever per configured directory, then route between them.
retriever_tuples = []

# A single embeddings client is shared by every directory.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Unpack the (description, path) pairs directly; this also avoids shadowing the builtin `dir`.
for name, dir_path in directories:
    collection_name = clean_collection_name(dir_path)

    print(f"Loading documents from {dir_path} into collection {collection_name}")

    docs = load_documents(dir_path, ignored_paths)
    print(f"Loaded {len(docs)} documents")

    retriever = create_retrievers(docs, embeddings, collection_name)
    retriever_tuples.append((collection_name, f"Good for answering questions about {name}", retriever))

# Transpose [(name, description, retriever), ...] into three parallel sequences.
retriever_names, retriever_descriptions, retrievers = zip(*retriever_tuples)

qa = MultiRetrievalQAChain.from_retrievers(
    llm=model,
    retriever_names=retriever_names,
    retriever_descriptions=retriever_descriptions,
    retrievers=retrievers,
    verbose=True
)


In [None]:



def create_markdown_download_link(chat_history):
    """Save the chat history as a Markdown file and display a download link.

    The transcript is written to ``chat_history_<YYYYmmdd_HHMMSS>.md`` in the
    current working directory, and the same bytes are embedded base64-encoded
    in a ``data:`` link that is displayed in the notebook.

    Args:
        chat_history: Iterable of ``(question, answer)`` pairs.
    """
    # The file imports the `datetime` module, so the class is `datetime.datetime`.
    filename = f"chat_history_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.md"

    transcript = "".join(f"**Q:** {q}\n\n**A:** {a}\n\n" for q, a in chat_history)

    # Encode explicitly as UTF-8 so non-ASCII answers are written the same way on
    # every platform, and the link carries exactly the bytes stored in the file.
    payload = transcript.encode("utf-8")
    with open(filename, "wb") as f:
        f.write(payload)

    b64 = base64.b64encode(payload).decode("utf-8")

    download_link = f'<a href="data:text/markdown;base64,{b64}" download="{filename}">Download chat history as Markdown</a>'
    display(Markdown(download_link))


def on_ask_question_button_click(button):
    """Handle a click on the Ask button.

    Clears the cell output and redraws the chat interface. An empty question
    or 'exit' (any casing) produces the chat-history download link instead of
    a query. Otherwise the question is sent to the QA chain, the formatted
    answer is appended to `chat_history`, and the whole history is re-rendered.

    Args:
        button: The widget that fired the event (unused).
    """
    # Start from a clean output area with the input row on top.
    clear_output()
    display(chat_interface)

    prompt = question_input.value.strip()
    wants_to_quit = prompt.lower() in ('exit', '')
    if wants_to_quit:
        create_markdown_download_link(chat_history)
        return

    reply = format_answer(qa.run(prompt))
    chat_history.append((prompt, reply))

    # Re-render the full conversation, oldest exchange first.
    for asked, replied in chat_history:
        for line in (f"**Q:** {asked}", f"**A:** {replied}"):
            display(Markdown(line))

def format_answer(answer_data):
    """Return the part of `answer_data` that precedes its first newline.

    Args:
        answer_data: Answer text returned by the QA chain.

    Returns:
        str: the first line of the answer (the whole string if it has no newline).
    """
    # NOTE(review): everything after the first line break is discarded, so multi-line
    # answers are truncated in the displayed/saved history. Confirm this is intended.
    first_line, _, _ = answer_data.partition("\n")
    return first_line

# Chat UI: a question box and an "Ask" button side by side.

# (question, answer) pairs appended by the click handler
chat_history = []

question_input = widgets.Text(
    value="",
    placeholder="Enter your question",
    description="Question:",
    layout=widgets.Layout(width="90%"),
)

ask_question_button = widgets.Button(description="Ask", layout=widgets.Layout(width="8%"))
ask_question_button.on_click(on_ask_question_button_click)

chat_interface = widgets.HBox([question_input, ask_question_button])
display(chat_interface)


In [None]:
# Use this cell to write while waiting for the model to finish loading the complete answer
