# Indexing a GitHub repo

In [1]:
# %pip install GitPython
# %pip install langchain openai
# %pip install -U langchain-community
# %pip install tiktoken
# %pip install faiss-cpu

In [3]:
from langchain.document_loaders import GitLoader

loader = GitLoader(
    clone_url="https://github.com/langchain-ai/langchain",
    repo_path="./data/repo/",
    file_filter=lambda file_path: file_path.endswith(".py"),
    branch="master",
)

documents = loader.load()

In [4]:
print(documents[0].page_content)

"""
python scripts/release_branch.py anthropic bagatur
"""

import glob
import tomllib
import toml
import subprocess
import sys


def main(*args):
    pkg = args[1]
    if len(args) >= 2:
        user = args[2]
    else:
        user = "auto"
    for path in glob.glob("./libs/**/pyproject.toml", recursive=True):
        if pkg in path:
            break

    with open(path, "rb") as f:
        pyproject = tomllib.load(f)
    major, minor, patch = pyproject["tool"]["poetry"]["version"].split(".")
    patch = str(int(patch) + 1)
    bumped = ".".join((major, minor, patch))
    pyproject["tool"]["poetry"]["version"] = bumped
    with open(path, "w") as f:
        toml.dump(pyproject, f)

    branch = f"{user}/{pkg}_{bumped.replace('.', '_')}"
    print(
        subprocess.run(
            f"git checkout -b {branch}; git commit -am '{pkg}[patch]: Release {bumped}'; git push -u origin {branch}",
            shell=True,
            capture_output=True,
            text=True,
        )
    )


In [5]:
len(documents)

4931

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import Language

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, 
    chunk_size=1000, 
    chunk_overlap=200
)

documents = python_splitter.split_documents(documents)

In [7]:
documents[0]

Document(metadata={'source': 'scripts\\release_branch.py', 'file_path': 'scripts\\release_branch.py', 'file_name': 'release_branch.py', 'file_type': '.py'}, page_content='"""\npython scripts/release_branch.py anthropic bagatur\n"""\n\nimport glob\nimport tomllib\nimport toml\nimport subprocess\nimport sys')

In [8]:
len(documents)

27820

In [16]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.callbacks import StdOutCallbackHandler


embeddings = OpenAIEmbeddings(show_progress_bar=True, disallowed_special=())
llm = ChatOpenAI()
handler = StdOutCallbackHandler()

In [17]:

from langchain.vectorstores import FAISS
index = FAISS.from_documents(documents, embeddings)
retriever = index.as_retriever()


qa = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,
)

query = "What is a stuff chain?"

qa.run(query, callbacks=[handler])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [03:17<00:00,  7.07s/it]
  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.26it/s]




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
chain = prompt | fake_llm

def _load_stuff_chain(
    llm: BaseLanguageModel,
    prompt: BasePromptTemplate = stuff_prompt.PROMPT,
    document_prompt: BasePromptTemplate = stuff_prompt.EXAMPLE_PROMPT,
    document_variable_name: str = "summaries",
    verbose: Optional[bool] = None,
    **kwargs: Any,
) -> StuffDocumentsChain:
    llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)  # type: ignore[arg-type]
    return StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name=document_variable_name,
        document_prompt=document_prompt,
        verbose=verbose,  # type: ignore[arg-type]
        **kwargs,
    )

return StuffDocumentsCha

'A stuff chain is a concept within the code that involves creating a chain of processes related to handling and processing documents or information. It likely involves using a language model to generate or interact with text in some way. The specifics of what a stuff chain does would depend on the implementation details within the codebase.'

In [18]:
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs['fetch_k'] = 200
retriever.search_kwargs['maximal_marginal_relevance'] = True
retriever.search_kwargs['k'] = 20

qa = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,
)

query = "When should I use a map reduce chain?"

qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.95it/s]




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
class MapReduceChain(Chain):
    """Map-reduce chain."""

    combine_documents_chain: BaseCombineDocumentsChain
    """Chain to use to combine documents."""
    text_splitter: TextSplitter
    """Text splitter to use."""
    input_key: str = "input_text"  #: :meta private:
    output_key: str = "output_text"  #: :meta private:

class MapReduceDocumentsChain(BaseCombineDocumentsChain):
    """Combining documents by mapping a chain over them, then combining results.

    We first call `llm_chain` on each document individually, passing in the
    `page_content` and any other kwargs. This is the `map` step.

    We then process the results of that `map` step in a `reduce` st

'You should use a map reduce chain when you want to combine documents by mapping a chain over them and then combining the results. This is useful when you need to process a large number of documents individually, map a chain over them to get intermediate results, and then combine those results using a reduce step.'

In [19]:
query = "When should I use a map rank chain?"

qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.39it/s]



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
chain = prompt | fake_llm

def _load_map_rerank_chain(
    llm: BaseLanguageModel,
    prompt: BasePromptTemplate = MAP_RERANK_PROMPT,
    verbose: bool = False,
    document_variable_name: str = "context",
    rank_key: str = "score",
    answer_key: str = "answer",
    callback_manager: Optional[BaseCallbackManager] = None,
    callbacks: Callbacks = None,
    **kwargs: Any,
) -> MapRerankDocumentsChain:
    llm_chain = LLMChain(
        llm=llm,
        prompt=prompt,
        verbose=verbose,
        callback_manager=callback_manager,
        callbacks=callbacks,
    )
    return MapRerankDocumentsChain(
        llm_chain=llm_chain,
        rank_key=rank_key,
        a





[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


'You should use a Map Rank Chain when you want to combine documents by mapping a chain over them and then reranking the results. This algorithm involves calling an LLMChain on each input document, where the LLMChain is expected to parse the result into an answer and a score. The answer with the highest score is then returned.'