In [34]:
import os
import subprocess

from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader,TextLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import LlamaCpp
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.vectorstores import Qdrant
from langchain_groq import ChatGroq
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Gradio Interface

In [35]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA 
from langchain_community.vectorstores import Qdrant

In [36]:
# Sets the Hugging Face Hub symlink-warning switch for this process.
# NOTE(review): huggingface_hub presumably only treats 1/ON/YES/TRUE as "true", so the
# value 'False' likely leaves the warning enabled -- confirm whether 'True' was intended.
# NOTE(review): this runs after `transformers` has been imported in an earlier cell; if
# the library reads the variable at import time the assignment may have no effect -- confirm.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"]='False'

In [37]:
import gradio as gr

In [38]:
def convert_to_text(src_dir, dst_dir, skip_extensions=('.jpg',)):
    """Mirror ``src_dir`` into ``dst_dir`` as UTF-8 ``.txt`` copies of every file.

    Each file ``<src_dir>/<rel>/<name>`` is written to ``<dst_dir>/<rel>/<name>.txt``.
    The ``.git`` directory is never descended into, and files whose name ends with
    one of ``skip_extensions`` are ignored.

    Args:
        src_dir: Root directory to read from.
        dst_dir: Root directory to write the ``.txt`` copies to (created if missing).
        skip_extensions: File-name suffixes to skip (case-sensitive). A single string
            is accepted as well. Defaults to ``('.jpg',)``.

    Files are decoded as UTF-8 first and as Latin-1 as a fallback. A file that cannot
    be read at all is reported on stdout and skipped, so no output file is produced
    for it.
    """
    if isinstance(skip_extensions, str):
        skip_extensions = (skip_extensions,)
    skip_extensions = tuple(skip_extensions)

    os.makedirs(dst_dir, exist_ok=True)

    for root, dirs, files in os.walk(src_dir):
        # Pruning `dirs` in place stops os.walk from descending into the git metadata.
        if '.git' in dirs:
            dirs.remove('.git')

        for file in files:
            if file.endswith(skip_extensions):
                continue

            file_path = os.path.join(root, file)

            # Reset per file: a failed read must never reuse the previous file's text.
            data = None
            for encoding in ('utf-8', 'latin-1'):
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        data = f.read()
                    break
                except (UnicodeDecodeError, OSError):
                    continue

            if data is None:
                print(f"Failed to decode the file: {file_path}")
                continue

            rel_path = os.path.relpath(file_path, src_dir)
            new_root = os.path.join(dst_dir, os.path.dirname(rel_path))
            os.makedirs(new_root, exist_ok=True)

            new_file_path = os.path.join(new_root, file + ".txt")
            with open(new_file_path, 'w', encoding='utf-8') as f:
                f.write(data)

In [39]:
def initialize(documents):
    """Build the chat model and retriever used to answer questions.

    Embeds ``documents`` with the ``BAAI/bge-small-en-v1.5`` model on CPU (normalized
    embeddings), stores them in the local Qdrant collection ``my_documents`` and
    exposes that collection as a retriever returning the 2 closest chunks.

    Args:
        documents: Document chunks to index; passed straight to ``Qdrant.from_documents``.

    Returns:
        A ``(chat_model, retriever)`` tuple.

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    # The API key must come from the environment, never from the notebook source.
    # Checked first so a missing key fails before the expensive embedding step.
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set. Export it in the environment before launching the app."
        )

    model = "BAAI/bge-small-en-v1.5"  # TODO: try nomic embeddings here (see their docs)
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    embed_model = HuggingFaceBgeEmbeddings(
        model_name=model,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

    # NOTE(review): "/local_qdrantdb" is an absolute path at the filesystem root;
    # presumably "./local_qdrantdb" was intended -- confirm before changing.
    vectorstore = Qdrant.from_documents(
        documents,
        embed_model,
        path="/local_qdrantdb",
        collection_name="my_documents",
        # This function runs on every question; without recreating the collection the
        # same chunks would be appended again each time and returned as duplicates.
        force_recreate=True,
    )
    retriever = vectorstore.as_retriever(search_kwargs={'k': 2})

    chat_model = ChatGroq(
        temperature=0,
        model_name="gemma-7b-it",
        api_key=groq_api_key,
    )
    return chat_model, retriever
    

In [40]:
# Prompt for the retrieval QA chain: `{context}` receives the retrieved document text
# and `{question}` the user's query. Built from adjacent literals, one per prompt line.
custom_prompt_template = (
    "\n"
    "DOCUMENT:\n"
    "{context}\n"
    "\n"
    "QUESTION:\n"
    "{question}\n"
    "\n"
    "INSTRUCTIONS:\n"
    "Answer the users QUESTION using the DOCUMENT text above.\n"
    "Keep your answer ground in the facts of the DOCUMENT.\n"
    "If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE"
)

In [41]:
def set_custom_prompt():
    """Return a PromptTemplate built from ``custom_prompt_template``.

    The template declares two input variables: ``context`` and ``question``.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )

In [42]:
def get_response(query):
    """Answer ``query`` from the text copies of the most recently loaded repository.

    Loads every ``.txt`` file under ``./converted_<path>`` (``path`` is the module-level
    name set by ``load_documents``), splits the files into 1500-character chunks with
    150 characters of overlap, indexes them via ``initialize`` and runs a "stuff"
    RetrievalQA chain with the custom prompt.

    Args:
        query: The user's question.

    Returns:
        The chain's ``result`` text, or a short explanatory message when there is no
        question, no repository has been loaded, or no text could be loaded.

    Side effects:
        Rebinds the module-level ``documents`` to the freshly split chunks.
    """
    global documents

    if not query or not str(query).strip():
        return "Please enter a question."

    # `path` only exists once load_documents() has run; look it up defensively
    # instead of failing with a NameError.
    repo_name = globals().get("path")
    if not repo_name:
        return "No repository loaded yet. Load a repository in the 'Load Documents' tab first."

    converted_dir = f"./converted_{repo_name}"
    text_loader_kwargs = {'autodetect_encoding': True}
    loader = DirectoryLoader(
        converted_dir,
        glob="**/*.txt",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    repo_files = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    documents = text_splitter.split_documents(repo_files)
    if not documents:
        return f"No text files could be loaded from {converted_dir}."

    txt_suffix = ".txt"
    for doc in documents:
        source = doc.metadata["source"]
        # Drop only the trailing ".txt" appended during conversion; a blanket
        # replace() would also mangle originals such as "notes.txt" or "a.txt.md".
        if source.endswith(txt_suffix):
            doc.metadata.update({"source": source[:-len(txt_suffix)]})

    chat_model, retriever = initialize(documents)
    prompt = set_custom_prompt()
    qa = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    response = qa.invoke(query)
    return response['result']

In [43]:
def load_documents(folder_path):
    """Clone a Git repository and convert its files to ``.txt`` copies.

    The repository is cloned into ``./<name>`` and converted into ``./converted_<name>``,
    where ``<name>`` is the last URL segment without a trailing ``/`` or ``.git``
    (the directory name ``git clone`` itself uses for such URLs).

    Args:
        folder_path: URL of the repository to clone.

    Returns:
        ``"Documents Loaded Successfully"`` on success, otherwise a message describing
        why the repository could not be loaded.

    Side effects:
        Sets the module-level ``path`` to the repository name on success.
    """
    global path

    repo_url = (folder_path or "").strip()
    if not repo_url:
        return "Please enter a repository link."

    repo_name = repo_url.rstrip("/").split("/")[-1]
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-len(".git")]
    if not repo_name:
        return f"Could not determine a repository name from: {repo_url}"

    repo_dir = f"./{repo_name}"
    # Re-loading an already cloned repository reuses the existing checkout.
    if not os.path.isdir(repo_dir):
        try:
            # Argument list (no shell) so the textbox content cannot inject commands;
            # "--" stops git from interpreting the URL as an option.
            completed = subprocess.run(["git", "clone", "--", repo_url], check=False)
        except OSError as exc:
            return f"Failed to run git: {exc}"
        if completed.returncode != 0:
            return f"Failed to clone repository (git exit code {completed.returncode})"

    path = repo_name
    convert_to_text(repo_dir, f"./converted_{repo_name}")
    return "Documents Loaded Successfully"
    

In [44]:
# Tab 1: takes a repository link and runs load_documents on it.
app1 = gr.Interface(
    fn=load_documents,
    inputs=gr.Textbox(label="Enter the link for Github repository for reference "),
    outputs="text",
)

# Tab 2: takes a question and shows the text returned by get_response.
app2 = gr.Interface(
    fn=get_response,
    inputs=gr.Textbox(label="Enter your question "),
    outputs="textbox",
)

In [45]:
# Combine both interfaces into one app, one tab per interface (same order as the names).
demo = gr.TabbedInterface(
    [app1, app2],
    ["Load Documents", "Ask Question"],
)

In [46]:
# Start the Gradio server for the tabbed app with default launch settings.
demo.launch()

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




--------
