/**
 * Resolve the cache directory from the legacy `HUGGINGFACE_HUB_CACHE`
 * environment variable, falling back to the default cache path.
 */
function getHuggingFaceHubCache(): string {
	return process.env["HUGGINGFACE_HUB_CACHE"] ?? getDefaultCachePath();
}

/**
 * Resolve the root directory of the hub cache.
 *
 * `HF_HUB_CACHE` takes precedence; when it is not set the value of
 * {@link getHuggingFaceHubCache} is used.
 *
 * @returns the path of the hub cache directory
 */
export function getHFHubCachePath(): string {
	return process.env["HF_HUB_CACHE"] ?? getHuggingFaceHubCache();
}

const FILES_TO_IGNORE: string[] = [".DS_Store"];

/** Separator placed between the repo type and each segment of the repo name in a cache folder name. */
export const REPO_ID_SEPARATOR: string = "--";

/**
 * Build the name of the cache folder that holds a repository.
 *
 * The repo type is pluralised with a trailing `s`, the repo name is split on
 * `/`, and every part is joined with {@link REPO_ID_SEPARATOR}
 * (e.g. `{ type: "model", name: "user/repo" }` gives `models--user--repo`).
 *
 * @param repoId the repository identifier (only `name` and `type` are read)
 * @returns the folder name, relative to the cache directory
 */
export function getRepoFolderName({ name, type }: RepoId): string {
	const parts = [`${type}s`, ...name.split("/")];
	return parts.join(REPO_ID_SEPARATOR);
}
import { expect, test, describe, vi, beforeEach } from "vitest";
import type { RepoDesignation, RepoId } from "../types/public";
import { dirname, join } from "node:path";
import { lstat, mkdir, stat, symlink, writeFile, rename } from "node:fs/promises";
import { pathsInfo } from "./paths-info";
import type { Stats } from "node:fs";
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
import { toRepoId } from "../utils/toRepoId";
import { downloadFileToCacheDir } from "./download-file-to-cache-dir";

// Replace every filesystem primitive used by the implementation so the tests never touch the disk.
vi.mock("node:fs/promises", () => ({
	writeFile: vi.fn(),
	rename: vi.fn(),
	symlink: vi.fn(),
	lstat: vi.fn(),
	mkdir: vi.fn(),
	stat: vi.fn(),
}));

vi.mock("./paths-info", () => ({
	pathsInfo: vi.fn(),
}));

const DUMMY_REPO: RepoId = {
	name: "hello-world",
	type: "model",
};

const DUMMY_ETAG = "dummy-etag";

/** Test helper: expected blob path, i.e. `<cacheDir>/<repoFolder>/blobs/<etag>`. */
function _getBlobFile(params: {
	repo: RepoDesignation;
	etag: string;
	cacheDir?: string; // defaults to {@link getHFHubCachePath}
}) {
	return join(params.cacheDir ?? getHFHubCachePath(), getRepoFolderName(toRepoId(params.repo)), "blobs", params.etag);
}

/** Test helper: expected pointer path, i.e. `<cacheDir>/<repoFolder>/snapshots/<revision>/<path>`. */
function _getSnapshotFile(params: {
	repo: RepoDesignation;
	path: string;
	revision: string;
	cacheDir?: string; // defaults to {@link getHFHubCachePath}
}) {
	return join(
		params.cacheDir ?? getHFHubCachePath(),
		getRepoFolderName(toRepoId(params.repo)),
		"snapshots",
		params.revision,
		params.path
	);
}

/** Test helper: make `pathsInfo` resolve a single README entry whose oid is {@link DUMMY_ETAG}. */
function mockReadmePathsInfo(): void {
	vi.mocked(pathsInfo).mockResolvedValue([
		{
			oid: DUMMY_ETAG,
			size: 55,
			path: "README.md",
			type: "file",
			lastCommit: {
				date: new Date(),
				id: "dummy-commit-hash",
				title: "Commit msg",
			},
		},
	]);
}

describe("downloadFileToCacheDir", () => {
	const fetchMock: typeof fetch = vi.fn();
	beforeEach(() => {
		vi.resetAllMocks();
		// mock a 200 response
		vi.mocked(fetchMock).mockResolvedValue({
			status: 200,
			ok: true,
			body: "dummy-body",
		} as unknown as Response);

		// by default nothing exists on disk, so no cache shortcut is taken
		vi.mocked(stat).mockRejectedValue(new Error("Do not exists"));
		vi.mocked(lstat).mockRejectedValue(new Error("Do not exists"));
	});

	test("should throw an error if fileDownloadInfo return nothing", async () => {
		await expect(async () => {
			await downloadFileToCacheDir({
				repo: DUMMY_REPO,
				path: "/README.md",
				fetch: fetchMock,
			});
		}).rejects.toThrowError("cannot get path info for /README.md");

		expect(pathsInfo).toHaveBeenCalledWith(
			expect.objectContaining({
				repo: DUMMY_REPO,
				paths: ["/README.md"],
				fetch: fetchMock,
			})
		);
	});

	test("existing symlinked and blob should not re-download it", async () => {
		// <cacheDir>/<repoFolder>/snapshots/<revision>/README.md
		const expectPointer = _getSnapshotFile({
			repo: DUMMY_REPO,
			path: "/README.md",
			revision: "dd4bc8b21efa05ec961e3efc4ee5e3832a3679c7",
		});
		// stat succeeding means the symlink and the file it points to both exist
		vi.mocked(stat).mockResolvedValue({} as Stats); // override the default mocked rejection

		const output = await downloadFileToCacheDir({
			repo: DUMMY_REPO,
			path: "/README.md",
			fetch: fetchMock,
			revision: "dd4bc8b21efa05ec961e3efc4ee5e3832a3679c7",
		});

		expect(stat).toHaveBeenCalledOnce();
		// first argument of the stat call
		const statArg = vi.mocked(stat).mock.calls[0][0];

		expect(statArg).toBe(expectPointer);
		// `not.toHaveBeenCalledWith()` would only fail for a zero-argument call;
		// what must hold here is that no request was made at all
		expect(fetchMock).not.toHaveBeenCalled();

		expect(output).toBe(expectPointer);
	});

	test("existing blob should only create the symlink", async () => {
		// <cacheDir>/<repoFolder>/snapshots/<revision>/README.md
		const expectPointer = _getSnapshotFile({
			repo: DUMMY_REPO,
			path: "/README.md",
			revision: "dummy-commit-hash",
		});
		// <cacheDir>/<repoFolder>/blobs/<etag>
		const expectedBlob = _getBlobFile({
			repo: DUMMY_REPO,
			etag: DUMMY_ETAG,
		});

		// the blob exists, the symlink does not
		vi.mocked(lstat).mockResolvedValue({} as Stats);
		mockReadmePathsInfo();

		const output = await downloadFileToCacheDir({
			repo: DUMMY_REPO,
			path: "/README.md",
			fetch: fetchMock,
		});

		expect(stat).not.toHaveBeenCalled();
		// the blob must have been checked
		expect(lstat).toHaveBeenCalled();
		expect(vi.mocked(lstat).mock.calls[0][0]).toBe(expectedBlob);

		// symlink should have been created
		expect(symlink).toHaveBeenCalledOnce();
		// no download done
		expect(fetchMock).not.toHaveBeenCalled();

		expect(output).toBe(expectPointer);
	});

	test("expect resolve value to be the pointer path of downloaded file", async () => {
		// <cacheDir>/<repoFolder>/snapshots/<revision>/README.md
		const expectPointer = _getSnapshotFile({
			repo: DUMMY_REPO,
			path: "/README.md",
			revision: "dummy-commit-hash",
		});
		// <cacheDir>/<repoFolder>/blobs/<etag>
		const expectedBlob = _getBlobFile({
			repo: DUMMY_REPO,
			etag: DUMMY_ETAG,
		});

		mockReadmePathsInfo();

		const output = await downloadFileToCacheDir({
			repo: DUMMY_REPO,
			path: "/README.md",
			fetch: fetchMock,
		});

		// the blobs and snapshots folders must have been created, in that order
		expect(vi.mocked(mkdir).mock.calls[0][0]).toBe(dirname(expectedBlob));
		expect(vi.mocked(mkdir).mock.calls[1][0]).toBe(dirname(expectPointer));

		expect(output).toBe(expectPointer);
	});

	test("should write fetch response to blob", async () => {
		// <cacheDir>/<repoFolder>/snapshots/<revision>/README.md
		const expectPointer = _getSnapshotFile({
			repo: DUMMY_REPO,
			path: "/README.md",
			revision: "dummy-commit-hash",
		});
		// <cacheDir>/<repoFolder>/blobs/<etag>
		const expectedBlob = _getBlobFile({
			repo: DUMMY_REPO,
			etag: DUMMY_ETAG,
		});

		mockReadmePathsInfo();

		await downloadFileToCacheDir({
			repo: DUMMY_REPO,
			path: "/README.md",
			fetch: fetchMock,
		});

		const incomplete = `${expectedBlob}.incomplete`;
		// 1. the response body is written to the incomplete file
		expect(writeFile).toHaveBeenCalledWith(incomplete, "dummy-body");
		// 2. the incomplete file is renamed to the final blob name
		expect(rename).toHaveBeenCalledWith(incomplete, expectedBlob);
		// 3. a symlink pointing to the blob is created
		expect(symlink).toHaveBeenCalledWith(expectedBlob, expectPointer);
	});
});
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
import { dirname, join } from "node:path";
import { writeFile, rename, symlink, lstat, mkdir, stat } from "node:fs/promises";
import type { CommitInfo, PathInfo } from "./paths-info";
import { pathsInfo } from "./paths-info";
import type { CredentialsParams, RepoDesignation } from "../types/public";
import { toRepoId } from "../utils/toRepoId";
import { downloadFile } from "./download-file";

/** Matches a string made of exactly 40 lowercase hexadecimal characters (a full commit hash). */
export const REGEX_COMMIT_HASH: RegExp = new RegExp("^[0-9a-f]{40}$");

/**
 * Build the path of the pointer (symlink) for a file inside the `snapshots` folder.
 * @param storageFolder folder of the repository inside the cache directory
 * @param revision revision used as the snapshot sub-folder name
 * @param relativeFilename path of the file inside the repository
 */
function getFilePointer(storageFolder: string, revision: string, relativeFilename: string): string {
	return join(storageFolder, "snapshots", revision, relativeFilename);
}

/**
 * Check whether a path exists.
 * @param path the path to check
 * @param followSymlinks when true uses `stat` (the target of a symlink must exist);
 *   otherwise uses `lstat` (the entry itself must exist)
 * @returns true when the call succeeds, false on any error
 */
async function exists(path: string, followSymlinks?: boolean): Promise<boolean> {
	try {
		await (followSymlinks ? stat(path) : lstat(path));
		return true;
	} catch {
		// any failure is reported as "does not exist"
		return false;
	}
}

/** Tell whether an unknown thrown value carries the `EEXIST` error code. */
function isAlreadyExistsError(err: unknown): boolean {
	return typeof err === "object" && err !== null && (err as { code?: unknown }).code === "EEXIST";
}

/**
 * Create the snapshot pointer to a blob, tolerating a pointer that is already there.
 *
 * Without this, a second download of the same file for a branch revision
 * (blob and pointer both already present) would reject with `EEXIST`.
 */
async function linkPointerToBlob(blobPath: string, pointerPath: string): Promise<void> {
	try {
		await symlink(blobPath, pointerPath);
	} catch (err: unknown) {
		if (!isAlreadyExistsError(err)) throw err;
	}
}

/**
 * Download a given file if it's not already present in the local cache.
 *
 * Steps:
 * 1. when `revision` is a commit hash and the pointer resolves to an existing file, return it immediately;
 * 2. otherwise query `pathsInfo` to learn the blob id (LFS oid when present, else the file oid)
 *    and the last commit id of the file;
 * 3. when the blob is already on disk only the pointer is created;
 * 4. otherwise the file is downloaded to `<blob>.incomplete`, renamed to the blob path,
 *    and the pointer is created.
 *
 * @param params download parameters (see property documentation)
 * @returns the path of the symlink to the blob object
 * @throws Error when `pathsInfo` does not return exactly one entry, or when the download response is unusable
 */
export async function downloadFileToCacheDir(
	params: {
		repo: RepoDesignation;
		path: string;
		/**
		 * If true, will download the raw git file.
		 *
		 * For example, when calling on a file stored with Git LFS, the pointer file will be downloaded instead.
		 */
		raw?: boolean;
		/**
		 * An optional Git revision id which can be a branch name, a tag, or a commit hash.
		 *
		 * @default "main"
		 */
		revision?: string;
		hubUrl?: string;
		cacheDir?: string;
		/**
		 * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
		 */
		fetch?: typeof fetch;
	} & Partial<CredentialsParams>
): Promise<string> {
	// get revision provided or default to main
	const revision = params.revision ?? "main";
	const cacheDir = params.cacheDir ?? getHFHubCachePath();
	// get repo id
	const repoId = toRepoId(params.repo);
	// get storage folder
	const storageFolder = join(cacheDir, getRepoFolderName(repoId));

	let commitHash: string | undefined;

	// if user provides a commitHash as revision, and they already have the file on disk, shortcut everything.
	if (REGEX_COMMIT_HASH.test(revision)) {
		commitHash = revision;
		const cachedPointer = getFilePointer(storageFolder, revision, params.path);
		if (await exists(cachedPointer, true)) return cachedPointer;
	}

	const pathsInformation: (PathInfo & { lastCommit: CommitInfo })[] = await pathsInfo({
		...params,
		paths: [params.path],
		revision: revision,
		expand: true,
	});
	if (!pathsInformation || pathsInformation.length !== 1) throw new Error(`cannot get path info for ${params.path}`);

	const fileInfo = pathsInformation[0];
	// use the LFS pointed file oid when the entry is an LFS pointer, the repo file oid otherwise
	const etag: string = fileInfo.lfs ? fileInfo.lfs.oid : fileInfo.oid;

	const pointerPath = getFilePointer(storageFolder, commitHash ?? fileInfo.lastCommit.id, params.path);
	const blobPath = join(storageFolder, "blobs", etag);

	// mkdir blob and pointer path parent directory
	await mkdir(dirname(blobPath), { recursive: true });
	await mkdir(dirname(pointerPath), { recursive: true });

	// We might already have the blob but not the pointer
	// shortcut the download if needed
	if (await exists(blobPath)) {
		// create symlinks in snapshot folder to blob object
		await linkPointerToBlob(blobPath, pointerPath);
		return pointerPath;
	}

	const incomplete = `${blobPath}.incomplete`;
	console.debug(`Downloading ${params.path} to ${incomplete}`);

	const response: Response | null = await downloadFile({
		...params,
		// keep the caller's branch/tag when no commit hash was given, instead of
		// overriding it with `undefined` and letting downloadFile pick its default
		revision: commitHash ?? revision,
	});

	if (!response || !response.ok || !response.body) throw new Error(`invalid response for file ${params.path}`);

	// @ts-expect-error resp.body is a Stream, but Stream is internal to node
	await writeFile(incomplete, response.body);

	// rename .incomplete file to expect blob
	await rename(incomplete, blobPath);
	// create symlinks in snapshot folder to blob object
	await linkPointerToBlob(blobPath, pointerPath);
	return pointerPath;
}
import { expect, test, describe, vi } from "vitest";
import { downloadFile } from "./download-file";
import type { RepoId } from "../types/public";

const DUMMY_REPO: RepoId = {
	name: "hello-world",
	type: "model",
};

/** Build a mocked `fetch` that always resolves with the given response-like object. */
function fetchResolving(response: unknown): typeof fetch {
	const mockedFetch: typeof fetch = vi.fn();
	vi.mocked(mockedFetch).mockResolvedValue(response as Response);
	return mockedFetch;
}

describe("downloadFile", () => {
	test("hubUrl params should overwrite HUB_URL", async () => {
		const fetchMock = fetchResolving({ status: 200, ok: true });

		await downloadFile({
			repo: DUMMY_REPO,
			path: "/README.md",
			hubUrl: "http://dummy-hub",
			fetch: fetchMock,
		});

		// the custom hub url must be the prefix of the requested url
		expect(fetchMock).toHaveBeenCalledWith("http://dummy-hub/hello-world/resolve/main//README.md", expect.anything());
	});

	test("raw params should use raw url", async () => {
		const fetchMock = fetchResolving({ status: 200, ok: true });

		await downloadFile({
			repo: DUMMY_REPO,
			path: "README.md",
			raw: true,
			fetch: fetchMock,
		});

		// `raw: true` switches the url segment from `resolve` to `raw`
		expect(fetchMock).toHaveBeenCalledWith("https://huggingface.co/hello-world/raw/main/README.md", expect.anything());
	});

	test("internal server error should propagate the error", async () => {
		const fetchMock = fetchResolving({
			status: 500,
			ok: false,
			headers: new Map([["Content-Type", "application/json"]]),
			json: () => ({
				error: "Dummy internal error",
			}),
		});

		const download = async () => {
			await downloadFile({
				repo: DUMMY_REPO,
				path: "README.md",
				raw: true,
				fetch: fetchMock,
			});
		};

		// the error message carried by the JSON payload must surface to the caller
		await expect(download).rejects.toThrowError("Dummy internal error");
	});
});
b79385dc5f..c2a2fbe06c 100644 --- a/packages/hub/src/lib/index.ts +++ b/packages/hub/src/lib/index.ts @@ -8,6 +8,7 @@ export * from "./delete-file"; export * from "./delete-files"; export * from "./delete-repo"; export * from "./download-file"; +export * from "./download-file-to-cache-dir"; export * from "./file-download-info"; export * from "./file-exists"; export * from "./list-commits"; diff --git a/packages/hub/vitest-browser.config.mts b/packages/hub/vitest-browser.config.mts index e106a2fbaa..60fcbfbfcf 100644 --- a/packages/hub/vitest-browser.config.mts +++ b/packages/hub/vitest-browser.config.mts @@ -2,6 +2,6 @@ import { configDefaults, defineConfig } from "vitest/config"; export default defineConfig({ test: { - exclude: [...configDefaults.exclude, "src/utils/FileBlob.spec.ts", "src/lib/cache-management.spec.ts"], + exclude: [...configDefaults.exclude, "src/utils/FileBlob.spec.ts", "src/lib/cache-management.spec.ts", "src/lib/download-file-to-cache-dir.spec.ts"], }, });