Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ node_modules
dist
dist-ssr
*.local
types

# Editor directories and files
.vscode/*
Expand Down
10 changes: 10 additions & 0 deletions .prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"overrides": [
{
"files": ["tests/**/*.ts"],
"options": {
"printWidth": 10000000
}
}
]
}
25 changes: 14 additions & 11 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
{
"name": "@huggingface/tokenizers",
"version": "0.0.1",
"description": "",
"main": "dist/tokenizers.cjs",
"description": "🤗 Tokenizers.js: A pure JS/TS implementation of today's most used tokenizers",
"type": "module",
"module": "dist/tokenizers.mjs",
"types": "dist/index.d.ts",
"main": "dist/tokenizers.min.mjs",
"browser": "dist/tokenizers.min.mjs",
"module": "dist/tokenizers.min.mjs",
"types": "types/index.d.ts",
"exports": {
".": {
"types": "./dist/index.d.ts",
"types": "./types/index.d.ts",
"node": {
"require": "./dist/tokenizers.cjs",
"import": "./dist/tokenizers.mjs"
"require": "./dist/tokenizers.min.cjs",
"import": "./dist/tokenizers.min.mjs"
},
"browser": {
"import": "./dist/tokenizers.mjs"
"import": "./dist/tokenizers.min.mjs"
},
"default": "./dist/tokenizers.mjs"
"default": "./dist/tokenizers.min.mjs"
}
},
"files": [
"dist",
"README.md"
"types",
"README.md",
"LICENSE"
],
"scripts": {
"clean": "rimraf dist",
"clean": "rimraf dist types",
"build": "npm run clean && node scripts/build.mjs",
"dev": "npm run clean && node scripts/dev.mjs",
"lint": "eslint src --ext .ts,.tsx",
Expand Down
48 changes: 24 additions & 24 deletions scripts/build.mjs
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
import { build } from "esbuild";
import { build as esbuild } from "esbuild";
import { execSync } from "node:child_process";
import { readFileSync } from "node:fs";
import { gzipSync } from "node:zlib";

console.log("Generating TypeScript declarations...");
execSync("tsc -p tsconfig.build.json", { stdio: "inherit" });

const config = {
bundle: true,
minify: true,
minifySyntax: true,
treeShaking: true,
logLevel: "silent",
entryPoints: ["src/index.ts"],
platform: "neutral",
metafile: true,
};

const formatSize = (bytes) => {
if (bytes < 1024) return `${bytes}b`;
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}kb`;
Expand All @@ -34,16 +23,27 @@ const reportSize = (outfile) => {
console.log(`⚡ Done\n`);
};

await build({
...config,
format: "esm",
outfile: "dist/tokenizers.mjs",
});
reportSize("dist/tokenizers.mjs");
/**
 * Bundles `src/index.ts` into `outfile` with esbuild and then reports its size.
 *
 * The output format and minification are derived from the file name:
 * a `.mjs` suffix selects ESM (anything else selects CJS), and a
 * `.min.mjs` / `.min.cjs` suffix turns minification on.
 *
 * @param {string} outfile Path of the bundle to write.
 */
const build = async (outfile) => {
  const isMinified = /\.min\.[cm]js$/.test(outfile);

  const options = {
    bundle: true,
    treeShaking: true,
    logLevel: "silent",
    entryPoints: ["src/index.ts"],
    platform: "neutral",
    metafile: true,
    format: outfile.endsWith(".mjs") ? "esm" : "cjs",
    outfile,
  };
  if (isMinified) {
    // Only the ".min" artifacts are minified; the plain ones stay readable.
    Object.assign(options, { minify: true, minifySyntax: true });
  }

  await esbuild(options);
  reportSize(outfile);
};

await build({
...config,
format: "cjs",
outfile: "dist/tokenizers.cjs",
});
reportSize("dist/tokenizers.cjs");
await build("dist/tokenizers.mjs");
await build("dist/tokenizers.cjs");
await build("dist/tokenizers.min.mjs");
await build("dist/tokenizers.min.cjs");
49 changes: 49 additions & 0 deletions tests/bundle.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import { spawnSync } from "child_process";

// Import/destructuring clause interpolated into each generated snippet.
const IMPORT = `{ Tokenizer }`;
// Package specifier the generated snippets import or require.
const MODULE_NAME = "@huggingface/tokenizers";

// Snippet body shared by every test case. It fetches tokenizer.json and
// tokenizer_config.json for the model from huggingface.co (so it needs network
// access), builds a Tokenizer from them, and prints the tokenize / encode /
// decode results for 'Hello World'.
// NOTE: everything between the backticks is runtime text executed by the
// spawned process, including the comments inside it.
const CODE_BODY = `
const modelId = "hf-internal-testing/tiny-random-LlamaForCausalLM";
const tokenizerJson = await fetch(\`https://huggingface.co/\${modelId}/resolve/main/tokenizer.json\`).then(res => res.json());
const tokenizerConfig = await fetch(\`https://huggingface.co/\${modelId}/resolve/main/tokenizer_config.json\`).then(res => res.json());

// Create tokenizer
const tokenizer = new Tokenizer(tokenizerJson, tokenizerConfig);

// Tokenize text
const tokens = tokenizer.tokenize('Hello World');
const encoded = tokenizer.encode('Hello World');
const decoded = tokenizer.decode(encoded);

console.log(tokens);
console.log(encoded);
console.log(decoded);
`;

// Exact stdout expected from the snippet: one line per console.log call above
// (tokens, encoded ids, decoded text).
const TARGET_OUTPUT = "[ '▁Hello', '▁World' ]\n[ 1, 15043, 2787 ]\n<s> Hello World\n";

/**
 * Wraps `code` in an immediately-invoked async function expression so the
 * snippet can use `await` inside its body.
 *
 * @param code Source text to place inside the function body.
 * @returns The wrapped source text.
 */
const wrap_async_iife = (code: string): string => {
  return "(async function() { " + code + " })();";
};

/**
 * Runs `code` in a child `node` process via `-e` and asserts on the result.
 *
 * @param code Source text passed to `node -e`.
 * @param module When true, `--input-type=module` is appended to the arguments.
 */
const check = (code: string, module = false) => {
  const nodeArgs = module ? ["-e", code, "--input-type=module"] : ["-e", code];
  const result = spawnSync("node", nodeArgs);

  const printedErrors = result.stderr.toString();
  const printedOutput = result.stdout.toString();

  // stderr is checked first so that a failing snippet surfaces its own message.
  expect(printedErrors).toEqual(""); // No warnings or errors are printed
  expect(printedOutput).toEqual(TARGET_OUTPUT); // The output should match
  expect(result.status).toEqual(0); // The process should exit cleanly
};

// Runs the same snippet body against the package through each supported
// loading style and checks the printed output.
describe("Testing the bundle", () => {
  it("ECMAScript Module (ESM)", () => {
    // Static import; run with module input type.
    const esmSource = `import ${IMPORT} from "${MODULE_NAME}";${CODE_BODY}`;
    check(esmSource, true);
  });

  it("CommonJS (CJS) with require", () => {
    // require() stays outside the async wrapper; only the body is wrapped.
    const requireLine = `const ${IMPORT} = require("${MODULE_NAME}");`;
    check(requireLine + wrap_async_iife(CODE_BODY));
  });

  it("CommonJS (CJS) with dynamic import", () => {
    // The dynamic import is awaited, so it goes inside the async wrapper.
    const importLine = `const ${IMPORT} = await import("${MODULE_NAME}");`;
    check(wrap_async_iife(importLine + CODE_BODY));
  });
});
6 changes: 3 additions & 3 deletions tsconfig.build.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
"extends": "./tsconfig.json",
"compilerOptions": {
"declaration": true,
"declarationDir": "dist",
"declarationDir": "types",
"emitDeclarationOnly": true,
"outDir": "dist",
"outDir": "types",
"noEmit": false
},
"include": ["src/index.ts", "types"]
"include": ["src/index.ts"]
}
6 changes: 3 additions & 3 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
"esModuleInterop": true,
"skipLibCheck": true,
"declaration": true,
"declarationDir": "dist",
"outDir": "dist",
"declarationDir": "types",
"outDir": "types",
"strict": true,
"sourceMap": true,
"strictNullChecks": false,
Expand All @@ -21,5 +21,5 @@
"@static/*": ["src/static/*"]
}
},
"include": ["src/**/*", "types"]
"include": ["src/**/*"]
}