-
Notifications
You must be signed in to change notification settings - Fork 13.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add GitLoader #2851
Add GitLoader #2851
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Git\n", | ||
"\n", | ||
"This notebook shows how to load text files from a Git repository." | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"First, let's clone the repository to the local disk." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from git import Repo\n", | ||
"\n", | ||
"repo = Repo.clone_from(\n", | ||
" \"https://github.com/hwchase17/langchain\", to_path=\"example_data/test_repo\"\n", | ||
")\n", | ||
"branch = repo.head.reference" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders.git import GitLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 23, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = GitLoader(path=\"./example_data/test_repo/\", branch=branch)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 24, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 26, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"1060" | ||
] | ||
}, | ||
"execution_count": 26, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 27, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"page_content='.venv\\n.github\\n.git\\n.mypy_cache\\n.pytest_cache\\nDockerfile' metadata={'file_path': '.dockerignore', 'file_name': '.dockerignore', 'file_type': ''}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(data[0])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "ai", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.6" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import os | ||
from typing import List, Optional | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
|
||
|
||
class GitLoader(BaseLoader):
    """Load the text files of a local Git repository as documents.

    Each document represents one file in the repository. ``path`` points to
    the local Git repository and ``branch`` names the branch to load files
    from. By default, the ``main`` branch is used. Only text files are
    supported: a file whose bytes are not valid UTF-8 is skipped.
    """

    def __init__(
        self,
        path: str,
        branch: Optional[str] = "main",
    ):
        """Store the repository location and the branch to load.

        Args:
            path: Path to the local Git repository.
            branch: Branch to check out before loading. When ``None``, no
                checkout is performed and the current checkout is read.
        """
        self.path = path
        self.branch = branch

    def load(self) -> List[Document]:
        """Read every text file of the repository tree into a ``Document``.

        The configured branch is checked out first, which changes the working
        tree of the repository at ``self.path``. Each blob of the tree is then
        read from disk; files that cannot be read are reported on stdout and
        skipped, and files that are not text are skipped silently.

        Returns:
            One ``Document`` per text file, with ``file_path`` (relative to
            the repository root), ``file_name`` and ``file_type`` (the file
            extension, possibly empty) in its metadata.

        Raises:
            ImportError: If the GitPython package is not installed.
        """
        try:
            from git import Blob, Repo
        except ImportError as ex:
            raise ImportError(
                "Could not import git python package. "
                "Please install it with `pip install GitPython`."
            ) from ex

        repo = Repo(self.path)
        # NOTE(review): reviewers asked that loading from the local repository
        # stays the default, without updating it from a remote unless the
        # caller explicitly requests that.
        if self.branch is not None:
            repo.git.checkout(self.branch)

        docs: List[Document] = []

        for item in repo.tree().traverse():
            # Trees (directories) and other non-file entries carry no content.
            if not isinstance(item, Blob):
                continue

            file_path = os.path.join(self.path, item.path)
            rel_file_path = os.path.relpath(file_path, self.path)

            # Only the read itself is guarded, so that programming errors in
            # the rest of the loop are not silently swallowed.
            try:
                with open(file_path, "rb") as f:
                    content = f.read()
            except OSError as e:
                print(f"Error reading file {file_path}: {e}")
                continue

            # loads only text files
            if not self.is_text_content(content):
                continue

            metadata = {
                "file_path": rel_file_path,
                "file_name": item.name,
                "file_type": os.path.splitext(item.name)[1],
            }
            text_content = content.decode("utf-8", errors="ignore")
            docs.append(Document(page_content=text_content, metadata=metadata))

        return docs

    @staticmethod
    def is_text_content(content: bytes) -> bool:
        """Determine whether ``content`` is text, based on its bytes.

        Args:
            content: Raw bytes of a file.

        Returns:
            ``True`` if the bytes decode as UTF-8, ``False`` otherwise.
        """
        try:
            content.decode("utf-8")
        except UnicodeDecodeError:
            return False
        return True
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe return the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed in 3a16d35 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we should add this to the deps file instead of this try/catch
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a common pattern in this project, which makes sense given the number of dependencies that are not required for every use case.