feat: ship ONNX runtime on Windows (#716)

louis-jan · web-flow · commit 5f57033af390 · 2024-06-18T14:55:39.000+07:00
diff --git a/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts b/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts
@@ -10,7 +10,8 @@ import { ModelNotFoundException } from '@/infrastructure/exception/model-not-fou
   aliases: ['download'],
   arguments: '<model_id>',
   argsDescription: { model_id: 'Model repo to pull' },
-  description: 'Download a model. Working with HuggingFace model id.',
+  description:
+    'Download a model from a registry. Working with HuggingFace repositories. For available models, please visit https://huggingface.co/cortexhub',
 })
 @SetCommandContext()
 export class ModelPullCommand extends CommandRunner {
diff --git a/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
@@ -1,4 +1,11 @@
-import { createWriteStream, existsSync, rmSync } from 'fs';
+import {
+  cpSync,
+  createWriteStream,
+  existsSync,
+  readdir,
+  readdirSync,
+  rmSync,
+} from 'fs';
 import { delimiter, join } from 'path';
 import { HttpService } from '@nestjs/axios';
 import { Presets, SingleBar } from 'cli-progress';
@@ -12,6 +19,7 @@ import { rm } from 'fs/promises';
 import { exec } from 'child_process';
 import { appPath } from '@/utils/app-path';
 import {
+  CORTEX_ONNX_ENGINE_RELEASES_URL,
   CORTEX_RELEASES_URL,
   CUDA_DOWNLOAD_URL,
 } from '@/infrastructure/constants/cortex';
@@ -59,7 +67,7 @@ export class InitCliUsecases {
       exit(1);
     }
 
-    console.log(`Downloading engine file ${engineFileName}`);
+    console.log(`Downloading Llama.cpp engine file ${engineFileName}`);
     const dataFolderPath = await this.fileManagerService.getDataFolderPath();
     const engineDir = join(dataFolderPath, 'cortex-cpp');
     if (existsSync(engineDir)) rmSync(engineDir, { recursive: true });
@@ -109,6 +117,9 @@ export class InitCliUsecases {
       exit(1);
     }
     await rm(destination, { force: true });
+
+    // Ship ONNX Runtime on Windows by default
+    if (process.platform === 'win32') await this.installONNXEngine();
   };
 
   parseEngineFileName = (options?: InitOptions) => {
@@ -187,6 +198,7 @@ export class InitCliUsecases {
     ).replace('<platform>', platform);
     const destination = join(dataFolderPath, 'cuda-toolkit.tar.gz');
 
+    console.log('Downloading CUDA Toolkit dependency...');
     const download = await firstValueFrom(
       this.httpService.get(url, {
         responseType: 'stream',
@@ -283,6 +295,109 @@ export class InitCliUsecases {
     });
   };
 
+  /**
+   * Download the ONNX engine release and install it into the cortex-cpp folder.
+   * Exits the process when the release, the asset or the archive is unusable.
+   * @param version release to install; 'latest' selects the newest release
+   * @param engineFileName substring identifying the release asset to download
+   * @returns resolves once the engine is extracted and its extra files copied
+   */
+  async installONNXEngine(
+    version: string = 'latest',
+    engineFileName: string = 'windows-amd64',
+  ) {
+    const res = await firstValueFrom(
+      this.httpService.get(
+        CORTEX_ONNX_ENGINE_RELEASES_URL +
+          `${version === 'latest' ? '/latest' : ''}`,
+        {
+          headers: {
+            'X-GitHub-Api-Version': '2022-11-28',
+            Accept: 'application/vnd.github+json',
+          },
+        },
+      ),
+    );
+
+    if (!res?.data) {
+      console.log('Failed to fetch releases');
+      exit(1);
+    }
+
+    // The releases list is an array; the /latest endpoint is a single object.
+    const release = Array.isArray(res.data)
+      ? res.data.find((e: any) => e.name === version.replace('v', ''))
+      : res.data;
+    // release is undefined when no release matches; treat that as "no asset".
+    const toDownloadAsset = release?.assets?.find((s: any) =>
+      s.name.includes(engineFileName),
+    );
+    if (!toDownloadAsset) {
+      console.log(`Could not find engine file ${engineFileName}`);
+      exit(1);
+    }
+
+    console.log(`Downloading ONNX engine file ${engineFileName}`);
+    const dataFolderPath = await this.fileManagerService.getDataFolderPath();
+    const engineDir = join(dataFolderPath, 'cortex-cpp');
+    const destination = join(dataFolderPath, toDownloadAsset.name);
+
+    const download = await firstValueFrom(
+      this.httpService.get(toDownloadAsset.browser_download_url, {
+        responseType: 'stream',
+      }),
+    );
+    if (!download) {
+      console.log('Failed to download engine');
+      exit(1);
+    }
+
+    await new Promise((resolve, reject) => {
+      const writer = createWriteStream(destination);
+      const bar = new SingleBar({}, Presets.shades_classic);
+      const totalBytes = Number(download.headers['content-length']) || 0;
+      let receivedBytes = 0;
+
+      // A failure on either the read or the write side must settle the promise.
+      const fail = (error: Error) => {
+        bar.stop();
+        reject(error);
+      };
+      writer.on('error', fail);
+      download.data.on('error', fail);
+      writer.on('finish', () => {
+        bar.stop();
+        resolve(true);
+      });
+
+      bar.start(100, 0);
+      download.data.on('data', (chunk: any) => {
+        receivedBytes += chunk.length;
+        // Without a content-length there is no percentage to report.
+        if (totalBytes > 0)
+          bar.update(Math.floor((receivedBytes / totalBytes) * 100));
+      });
+
+      download.data.pipe(writer);
+    });
+
+    try {
+      await decompress(destination, join(engineDir, 'engines'));
+    } catch (e) {
+      console.error('Error decompressing file', e);
+      exit(1);
+    }
+    await rm(destination, { force: true });
+
+    // Copy the additional files to the cortex-cpp directory; engine.dll itself
+    // stays inside engines/cortex.onnx.
+    const onnxDir = join(engineDir, 'engines', 'cortex.onnx');
+    for (const file of readdirSync(onnxDir)) {
+      if (file === 'engine.dll') continue;
+      cpSync(join(onnxDir, file), join(engineDir, file));
+    }
+  }
+
   private checkFileExistenceInPaths = (
     file: string,
     paths: string[],
diff --git a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
@@ -17,6 +17,8 @@ import { load } from 'js-yaml';
 import { existsSync, readdirSync, readFileSync } from 'fs';
 import { isLocalModel, normalizeModelId } from '@/utils/normalize-model-id';
 import { getHFModelMetadata } from '@/utils/huggingface';
+import { createWriteStream, mkdirSync, promises } from 'node:fs';
+import { firstValueFrom } from 'rxjs';
 
 @Injectable()
 export class ModelsCliUsecases {
@@ -118,40 +120,116 @@ export class ModelsCliUsecases {
       process.exit(1);
     }
 
-    await this.pullHuggingFaceModel(modelId);
-    const bar = new SingleBar({}, Presets.shades_classic);
-    bar.start(100, 0);
-    const callback = (progress: number) => {
-      bar.update(progress);
-    };
+    if (modelId.includes('onnx')) {
+      await this.pullOnnxModel(modelId);
+    } else {
+      await this.pullGGUFModel(modelId);
+      const bar = new SingleBar({}, Presets.shades_classic);
+      bar.start(100, 0);
+      const callback = (progress: number) => {
+        bar.update(progress);
+      };
+
+      try {
+        await this.modelsUsecases.downloadModel(modelId, callback);
+
+        const model = await this.modelsUsecases.findOne(modelId);
+        const fileUrl = join(
+          await this.fileService.getModelsPath(),
+          normalizeModelId(modelId),
+          basename((model?.files as string[])[0]),
+        );
+        await this.modelsUsecases.update(modelId, {
+          files: [fileUrl],
+          name: modelId.replace(':default', ''),
+        });
+      } catch (err) {
+        bar.stop();
+        throw err;
+      }
+    }
+  }
+
+  /**
+   * Pull an ONNX model's files from its cortexhub HuggingFace repository.
+   * @param modelId repository name, optionally followed by ":<branch>"
+   */
+  private async pullOnnxModel(modelId: string) {
+    const modelsContainerDir = await this.fileService.getModelsPath();
+
+    if (!existsSync(modelsContainerDir)) {
+      mkdirSync(modelsContainerDir, { recursive: true });
+    }
+
+    const modelFolder = join(modelsContainerDir, normalizeModelId(modelId));
+    await promises.mkdir(modelFolder, { recursive: true }).catch(() => {});
 
-    try {
-      await this.modelsUsecases.downloadModel(modelId, callback);
+    const files = [
+      'genai_config.json',
+      'model.onnx',
+      'model.onnx.data',
+      'model.yml',
+      'special_tokens_map.json',
+      'tokenizer.json',
+      'tokenizer_config.json',
+    ];
+    const repo = modelId.split(':')[0];
+    const branch = modelId.split(':')[1] || 'default';
+    for (const file of files) {
+      console.log(`Downloading ${file}`);
+      const bar = new SingleBar({}, Presets.shades_classic);
+      bar.start(100, 0);
 
-      const model = await this.modelsUsecases.findOne(modelId);
-      const fileUrl = join(
-        await this.fileService.getModelsPath(),
-        normalizeModelId(modelId),
-        basename((model?.files as string[])[0]),
+      const response = await firstValueFrom(
+        this.httpService.get(
+          `https://huggingface.co/cortexhub/${repo}/resolve/${branch}/${file}?download=true`,
+          {
+            responseType: 'stream',
+          },
+        ),
       );
-      await this.modelsUsecases.update(modelId, {
-        files: [fileUrl],
-        name: modelId.replace(':default', ''),
+      if (!response) {
+        throw new Error('Failed to download model');
+      }
+
+      await new Promise((resolve, reject) => {
+        const writer = createWriteStream(join(modelFolder, file));
+        const totalBytes = Number(response.headers['content-length']) || 0;
+        let receivedBytes = 0;
+        // Stop the bar and reject on a failure from either side of the pipe.
+        const fail = (error: Error) => {
+          bar.stop();
+          reject(error);
+        };
+        writer.on('error', fail);
+        response.data.on('error', fail);
+        writer.on('finish', () => resolve(true));
+        response.data.on('data', (chunk: any) => {
+          receivedBytes += chunk.length;
+          // No content-length means no percentage to report.
+          if (totalBytes > 0)
+            bar.update(Math.floor((receivedBytes / totalBytes) * 100));
+        });
+        response.data.pipe(writer);
       });
-    } catch (err) {
       bar.stop();
-      throw err;
     }
-  }
 
-  //// PRIVATE METHODS ////
+    const model: CreateModelDto = load(
+      readFileSync(join(modelFolder, 'model.yml'), 'utf-8'),
+    ) as CreateModelDto;
+    model.files = [modelFolder];
+    model.model = modelId;
 
+    if (!(await this.modelsUsecases.findOne(modelId)))
+      await this.modelsUsecases.create(model);
+  }
   /**
    * It's to pull model from HuggingFace repository
    * It could be a model from Jan's repo or other authors
    * @param modelId HuggingFace model id. e.g. "janhq/llama-3 or llama3:7b"
    */
-  private async pullHuggingFaceModel(modelId: string) {
+  private async pullGGUFModel(modelId: string) {
     const data: HuggingFaceRepoData =
       await this.modelsUsecases.fetchModelMetadata(modelId);
 
@@ -179,6 +257,7 @@ export class ModelsCliUsecases {
     } else {
       modelVersion = data.siblings.find((e) => e.rfilename.includes('.gguf'));
     }
+
     if (!modelVersion) throw 'No expected quantization found';
     const metadata = await getHFModelMetadata(modelVersion.downloadUrl!);
 
@@ -203,12 +282,17 @@ export class ModelsCliUsecases {
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: 'cortex.llamacpp',
+      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
     };
     if (!(await this.modelsUsecases.findOne(modelId)))
       await this.modelsUsecases.create(model);
   }
 
+  /**
+   * Parse preset file
+   * @param preset 
+   * @returns 
+   */
   private async parsePreset(preset?: string): Promise<object> {
     const presetsFolder = await this.fileService.getPresetsPath();
 
diff --git a/cortex-js/src/infrastructure/constants/cortex.ts b/cortex-js/src/infrastructure/constants/cortex.ts
@@ -42,6 +42,9 @@ export const CORTEX_JS_STOP_API_SERVER_URL = (
 export const CORTEX_RELEASES_URL =
   'https://api.github.com/repos/janhq/cortex/releases';
 
+export const CORTEX_ONNX_ENGINE_RELEASES_URL =
+  'https://api.github.com/repos/janhq/cortex.onnx/releases';
+
 export const CUDA_DOWNLOAD_URL =
   'https://catalog.jan.ai/dist/cuda-dependencies/<version>/<platform>/cuda.tar.gz';
 
diff --git a/cortex-js/src/infrastructure/constants/huggingface.ts b/cortex-js/src/infrastructure/constants/huggingface.ts
@@ -2,7 +2,7 @@ export const HUGGING_FACE_TREE_REF_URL = (
   repo: string,
   tree: string,
   path: string,
-) => `https://huggingface.co/janhq/${repo}/resolve/${tree}/${path}`;
+) => `https://huggingface.co/cortexhub/${repo}/resolve/${tree}/${path}`;
 
 export const HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL = (
   author: string,
diff --git a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
@@ -15,7 +15,7 @@ import { FileManagerService } from '@/infrastructure/services/file-manager/file-
 
 @Injectable()
 export default class CortexProvider extends OAIEngineExtension {
-  provider: string = 'cortex.llamacpp';
+  provider: string = 'cortex';
   apiUrl = `http://${defaultCortexCppHost}:${defaultCortexCppPort}/inferences/server/chat_completion`;
 
   private loadModelUrl = `http://${defaultCortexCppHost}:${defaultCortexCppPort}/inferences/server/loadmodel`;
diff --git a/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts b/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts
@@ -11,7 +11,10 @@ import { existsSync } from 'fs';
 @Injectable()
 export class ExtensionRepositoryImpl implements ExtensionRepository {
   // Initialize the Extensions Map with the key-value pairs of the core providers.
-  extensions = new Map<string, Extension>([['cortex', this.cortexProvider]]);
+  extensions = new Map<string, Extension>([
+    ['cortex.llamacpp', this.cortexProvider],
+    ['cortex.onnx', this.cortexProvider],
+  ]);
 
   constructor(
     @Inject('CORTEX_PROVIDER')
diff --git a/cortex-js/src/usecases/chat/chat.usecases.ts b/cortex-js/src/usecases/chat/chat.usecases.ts
diff --git a/cortex-js/src/usecases/models/models.usecases.ts b/cortex-js/src/usecases/models/models.usecases.ts
diff --git a/cortex-js/src/utils/huggingface.ts b/cortex-js/src/utils/huggingface.ts