diff --git a/.gitignore b/.gitignore index dd63103..2fe8824 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ node_modules dist dist-ssr *.local +types # Editor directories and files .vscode/* diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..3397da9 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,10 @@ +{ + "overrides": [ + { + "files": ["tests/**/*.ts"], + "options": { + "printWidth": 10000000 + } + } + ] +} diff --git a/package.json b/package.json index be5ba15..e42d9ce 100644 --- a/package.json +++ b/package.json @@ -1,30 +1,33 @@ { "name": "@huggingface/tokenizers", "version": "0.0.1", - "description": "", - "main": "dist/tokenizers.cjs", + "description": "🤗 Tokenizers.js: A pure JS/TS implementation of today's most used tokenizers", "type": "module", - "module": "dist/tokenizers.mjs", - "types": "dist/index.d.ts", + "main": "dist/tokenizers.min.mjs", + "browser": "dist/tokenizers.min.mjs", + "module": "dist/tokenizers.min.mjs", + "types": "types/index.d.ts", "exports": { ".": { - "types": "./dist/index.d.ts", + "types": "./types/index.d.ts", "node": { - "require": "./dist/tokenizers.cjs", - "import": "./dist/tokenizers.mjs" + "require": "./dist/tokenizers.min.cjs", + "import": "./dist/tokenizers.min.mjs" }, "browser": { - "import": "./dist/tokenizers.mjs" + "import": "./dist/tokenizers.min.mjs" }, - "default": "./dist/tokenizers.mjs" + "default": "./dist/tokenizers.min.mjs" } }, "files": [ "dist", - "README.md" + "types", + "README.md", + "LICENSE" ], "scripts": { - "clean": "rimraf dist", + "clean": "rimraf dist types", "build": "npm run clean && node scripts/build.mjs", "dev": "npm run clean && node scripts/dev.mjs", "lint": "eslint src --ext .ts,.tsx", diff --git a/scripts/build.mjs b/scripts/build.mjs index 756efa1..1467f35 100644 --- a/scripts/build.mjs +++ b/scripts/build.mjs @@ -1,4 +1,4 @@ -import { build } from "esbuild"; +import { build as esbuild } from "esbuild"; import { execSync } from "node:child_process"; import { readFileSync } from "node:fs"; import { gzipSync } from "node:zlib"; @@ -6,17 +6,6 @@ import { gzipSync } from "node:zlib"; console.log("Generating TypeScript declarations..."); execSync("tsc -p tsconfig.build.json", { stdio: "inherit" }); -const config = { - bundle: true, - minify: true, - minifySyntax: true, - treeShaking: true, - logLevel: "silent", - entryPoints: ["src/index.ts"], - platform: "neutral", - metafile: true, -}; - const formatSize = (bytes) => { if (bytes < 1024) return `${bytes}b`; if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}kb`; @@ -34,16 +23,27 @@ const reportSize = (outfile) => { console.log(`⚡ Done\n`); }; -await build({ - ...config, - format: "esm", - outfile: "dist/tokenizers.mjs", -}); -reportSize("dist/tokenizers.mjs"); +const build = async (outfile) => { + const format = outfile.endsWith(".mjs") ? "esm" : "cjs"; + const minifyOptions = /\.min\.[cm]js$/.test(outfile) + ? { minify: true, minifySyntax: true } + : {}; + + await esbuild({ + bundle: true, + treeShaking: true, + logLevel: "silent", + entryPoints: ["src/index.ts"], + platform: "neutral", + metafile: true, + format, + outfile, + ...minifyOptions, + }); + reportSize(outfile); +} -await build({ - ...config, - format: "cjs", - outfile: "dist/tokenizers.cjs", -}); -reportSize("dist/tokenizers.cjs"); +await build("dist/tokenizers.mjs"); +await build("dist/tokenizers.cjs"); +await build("dist/tokenizers.min.mjs"); +await build("dist/tokenizers.min.cjs"); diff --git a/tests/bundle.test.ts b/tests/bundle.test.ts new file mode 100644 index 0000000..2b2e30c --- /dev/null +++ b/tests/bundle.test.ts @@ -0,0 +1,49 @@ +import { spawnSync } from "child_process"; + +const IMPORT = `{ Tokenizer }`; +const MODULE_NAME = "@huggingface/tokenizers"; + +const CODE_BODY = ` +const modelId = "hf-internal-testing/tiny-random-LlamaForCausalLM"; +const tokenizerJson = await fetch(\`https://huggingface.co/\${modelId}/resolve/main/tokenizer.json\`).then(res => res.json()); +const tokenizerConfig = await fetch(\`https://huggingface.co/\${modelId}/resolve/main/tokenizer_config.json\`).then(res => res.json()); + +// Create tokenizer +const tokenizer = new Tokenizer(tokenizerJson, tokenizerConfig); + +// Tokenize text +const tokens = tokenizer.tokenize('Hello World'); +const encoded = tokenizer.encode('Hello World'); +const decoded = tokenizer.decode(encoded); + +console.log(tokens); +console.log(encoded); +console.log(decoded); +`; + +const TARGET_OUTPUT = "[ '▁Hello', '▁World' ]\n[ 1, 15043, 2787 ]\n Hello World\n"; + +const wrap_async_iife = (code: string) => `(async function() { ${code} })();`; + +const check = (code: string, module = false) => { + const args = ["-e", code]; + if (module) args.push("--input-type=module"); + const { status, stdout, stderr } = spawnSync("node", args); + expect(stderr.toString()).toEqual(""); // No warnings or errors are printed + expect(stdout.toString()).toEqual(TARGET_OUTPUT); // The output should match + expect(status).toEqual(0); // The process should exit cleanly +}; + +describe("Testing the bundle", () => { + it("ECMAScript Module (ESM)", () => { + check(`import ${IMPORT} from "${MODULE_NAME}";${CODE_BODY}`, true); + }); + + it("CommonJS (CJS) with require", () => { + check(`const ${IMPORT} = require("${MODULE_NAME}");${wrap_async_iife(CODE_BODY)}`); + }); + + it("CommonJS (CJS) with dynamic import", () => { + check(`${wrap_async_iife(`const ${IMPORT} = await import("${MODULE_NAME}");${CODE_BODY}`)}`); + }); +}); diff --git a/tsconfig.build.json b/tsconfig.build.json index 5e5e21b..b5f81a3 100644 --- a/tsconfig.build.json +++ b/tsconfig.build.json @@ -3,10 +3,10 @@ "extends": "./tsconfig.json", "compilerOptions": { "declaration": true, - "declarationDir": "dist", + "declarationDir": "types", "emitDeclarationOnly": true, - "outDir": "dist", + "outDir": "types", "noEmit": false }, - "include": ["src/index.ts", "types"] + "include": ["src/index.ts"] } diff --git a/tsconfig.json b/tsconfig.json index e2b5a10..4bdf9fd 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -8,8 +8,8 @@ "esModuleInterop": true, "skipLibCheck": true, "declaration": true, - "declarationDir": "dist", - "outDir": "dist", + "declarationDir": "types", + "outDir": "types", "strict": true, "sourceMap": true, "strictNullChecks": false, @@ -21,5 +21,5 @@ "@static/*": ["src/static/*"] } }, - "include": ["src/**/*", "types"] + "include": ["src/**/*"] }