Skip to content

Commit

Permalink
feat: Parallelize the GithubRepoLoader
Browse files Browse the repository at this point in the history
  • Loading branch information
yroc92 committed Aug 9, 2023
1 parent 5166ba6 commit c8caf3d
Show file tree
Hide file tree
Showing 3 changed files with 227 additions and 30 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import { GithubFile } from "../../web/github.js";

/**
 * Builds one entry of a mocked GitHub directory listing.
 * Only the name, path and type differ between fixtures; every other field is a
 * fixed placeholder value.
 */
const makeGithubEntry = (
  name: string,
  path: string,
  type: "file" | "dir"
) => ({
  name,
  path,
  type,
  size: 50,
  url: "https://githubfilecontent.com",
  html_url: "",
  sha: "",
  git_url: "",
  download_url: "",
  _links: {
    self: "",
    git: "",
    html: "",
  },
});

/**
 * Canned responses that stand in for the GitHub API in the GithubRepoLoader
 * unit test.
 */
export const GithubLoaderApis = {
  // Directory listings keyed "0", "1", "2": the repository root, then dir1,
  // then dir1/dir1_1.
  getRepoFiles: {
    0: [
      makeGithubEntry("foo.txt", "foo.txt", "file"),
      makeGithubEntry("dir1", "dir1", "dir"),
    ],
    1: [makeGithubEntry("dir1_1", "dir1/dir1_1", "dir")],
    2: [
      makeGithubEntry("nested_file.txt", "dir1/dir1_1/nested_file.txt", "file"),
      makeGithubEntry("EXAMPLE.md", "dir1/dir1_1/EXAMPLE.md", "file"),
    ],
  } as Record<string, GithubFile[]>,
  // Body returned for every file-content request.
  getFileContents: "this is a file full of stuff",
};
50 changes: 50 additions & 0 deletions langchain/src/document_loaders/tests/github.unit.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import { jest, test } from "@jest/globals";
import { GithubFile, GithubRepoLoader } from "../web/github.js";
import { GithubLoaderApis } from "./example_data/github_api_responses.js";

/**
 * Unit test for the recursive traversal performed by GithubRepoLoader.
 *
 * `fetch` is replaced by a mock that serves the fixture directory listings one
 * after another (one per request to the GitHub API) and a fixed string as the
 * body of every file.
 */
describe("GithubRepoLoader recursion", () => {
  // Number of directory listings served so far; selects the next fixture entry.
  let callCount = 0;
  // Captured so the real implementation can be put back after the suite:
  // jest.clearAllMocks() does not undo a direct assignment to global.fetch.
  const originalFetch = global.fetch;

  beforeAll(() => {
    callCount = 0;
    global.fetch = jest.fn().mockImplementation((url) => {
      const requestUrl = url as string;
      // Default payload: the listing for the current position in the sequence.
      let responseData: GithubFile[] | string =
        GithubLoaderApis.getRepoFiles[callCount.toString()];

      if (requestUrl.includes("https://api.github.com/repos")) {
        // A directory listing was consumed; the next API request gets the next one.
        callCount += 1;
      } else if (requestUrl.includes("githubfilecontent.com")) {
        responseData = GithubLoaderApis.getFileContents;
      }

      return Promise.resolve({
        ok: true,
        json: () => Promise.resolve(responseData),
        text: () => Promise.resolve(GithubLoaderApis.getFileContents),
      });
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    }) as any;
  });

  afterAll(() => {
    jest.clearAllMocks();
    global.fetch = originalFetch;
    callCount = 0;
  });

  test("Test recursion with GithubRepoLoader", async () => {
    const loader = new GithubRepoLoader(
      "https://github.com/hwchase17/langchainjs",
      {
        branch: "main",
        recursive: true,
        unknown: "warn",
        ignorePaths: ["*.md"],
      }
    );

    const documents = await loader.load();

    // EXAMPLE.md matches the "*.md" ignore pattern, so only the two .txt
    // files are expected to be loaded.
    expect(documents.length).toBe(2);
    expect(documents.map((doc) => doc.metadata.source)).toEqual([
      "foo.txt",
      "dir1/dir1_1/nested_file.txt",
    ]);
  });
});
114 changes: 84 additions & 30 deletions langchain/src/document_loaders/web/github.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ function isBinaryPath(name: string) {
return extensions.has(extname(name).slice(1).toLowerCase());
}

interface GithubFile {
export interface GithubFile {
name: string;
path: string;
sha: string;
Expand All @@ -29,6 +29,11 @@ interface GithubFile {
};
}

/**
 * Result of fetching a single repository file, before `load` wraps it in a
 * Document.
 */
interface GetContentResponse {
// Text of the file; fetchFileContentWrapper substitutes "" when the fetch yields nothing.
contents: string;
// `source` is the file's path as reported in the directory listing.
metadata: { source: string };
}

export interface GithubRepoLoaderParams {
branch?: string;
recursive?: boolean;
Expand Down Expand Up @@ -110,15 +115,16 @@ export class GithubRepoLoader
}

/**
 * Fetches the repository contents and turns every fetched file into a Document.
 *
 * @returns One Document per entry produced by `processRepo`, with the file
 * text as `pageContent` and the entry's metadata attached unchanged.
 */
public async load(): Promise<Document[]> {
  const fileResponses = await this.processRepo();
  return fileResponses.map(
    (fileResponse) =>
      new Document({
        pageContent: fileResponse.contents,
        metadata: fileResponse.metadata,
      })
  );
}

protected async shouldIgnore(
path: string,
fileType: string
): Promise<boolean> {
protected shouldIgnore(path: string, fileType: string): boolean {
if (fileType !== "dir" && isBinaryPath(path)) {
return true;
}
Expand All @@ -141,34 +147,82 @@ export class GithubRepoLoader
);
}

private async processDirectory(
path: string,
documents: Document[]
): Promise<void> {
try {
const files = await this.fetchRepoFiles(path);
/**
 * Fetches the content of one file and pairs it with its metadata.
 *
 * A failed fetch is reported through `handleError`; if that call returns
 * instead of throwing, the entry is still produced, with empty contents.
 *
 * @param file Listing entry of the file to fetch.
 * @returns The file text (or "" when none was obtained) and `{ source: file.path }`.
 */
private async fetchFileContentWrapper(
  file: GithubFile
): Promise<GetContentResponse> {
  let fileContent: string | undefined;
  try {
    fileContent = await this.fetchFileContent(file);
  } catch (error) {
    // Interpolate the path, not the object: `${file}` would print "[object Object]".
    this.handleError(`Failed to fetch file content: ${file.path}, ${error}`);
  }
  return {
    contents: fileContent || "",
    metadata: { source: file.path },
  };
}

/**
 * Maps a directory listing to promises for the contents of every file at or
 * below that directory.
 *
 * Entries rejected by `shouldIgnore` are skipped. File fetches are started
 * immediately and are not awaited here. Sub-directories are only descended
 * into when `this.recursive` is set; their listings are awaited so that the
 * nested file promises can be flattened into the result.
 *
 * @param files Entries of the directory being processed.
 * @returns Pending content promises: this directory's files first, followed
 * by those of its sub-directories.
 */
private async getCurrDirFilesPromises(
  files: GithubFile[]
): Promise<Promise<GetContentResponse>[]> {
  const currDirFilePromises: Promise<GetContentResponse>[] = [];
  // Each directory resolves to its own list of file promises, hence the nesting.
  const currDirDirPromises: Promise<Promise<GetContentResponse>[]>[] = [];

  for (const file of files) {
    if (this.shouldIgnore(file.path, file.type)) {
      continue;
    }
    if (file.type !== "dir") {
      // No try/catch here: fetchFileContentWrapper is async, so any failure
      // surfaces through the returned promise, never as a synchronous throw.
      currDirFilePromises.push(this.fetchFileContentWrapper(file));
    } else if (this.recursive) {
      currDirDirPromises.push(this.processDirectory(file.path));
    }
  }

  // NOTE(review): the file promises have no rejection handler while the
  // directory listings are awaited; confirm that is acceptable if
  // handleError can throw.
  const curDirDirectories: Promise<GetContentResponse>[][] =
    await Promise.all(currDirDirPromises);

  return [...currDirFilePromises, ...curDirDirectories.flat()];
}

/**
 * Entry point of the traversal: lists `this.initialPath`, collects the content
 * promises for everything beneath it and waits for all of them.
 *
 * Only failures while listing directories are reported here; the final
 * `Promise.all` is returned without being awaited inside the `try`, so a
 * rejected file promise propagates to the caller directly.
 *
 * @returns The fetched contents and metadata of every collected file.
 */
private async processRepo(): Promise<GetContentResponse[]> {
  let pendingContents: Promise<GetContentResponse>[];
  try {
    // Entries (files and directories) found at the starting path.
    const rootEntries = await this.fetchRepoFiles(this.initialPath);
    // One pending promise per file, nested directories included.
    pendingContents = await this.getCurrDirFilesPromises(rootEntries);
  } catch (error) {
    this.handleError(
      `Failed to process directory: ${this.initialPath}, ${error}`
    );
    return Promise.reject(error);
  }
  return Promise.all(pendingContents);
}

/**
 * Lists one directory and maps its entries to content promises.
 *
 * A failure while fetching the listing is reported through `handleError` and
 * then re-raised as a rejection carrying the original error.
 *
 * @param path Directory path passed to `fetchRepoFiles`.
 * @returns Pending content promises for the files at or below `path`.
 */
private async processDirectory(
  path: string
): Promise<Promise<GetContentResponse>[]> {
  let entries: GithubFile[];
  try {
    entries = await this.fetchRepoFiles(path);
  } catch (error) {
    this.handleError(`Failed to process directory: ${path}, ${error}`);
    return Promise.reject(error);
  }
  // Returned without awaiting, so a rejection from the nested traversal is
  // not routed through the catch above.
  return this.getCurrDirFilesPromises(entries);
}

Expand Down

0 comments on commit c8caf3d

Please sign in to comment.