Commit ec21dae

feat: add engine pull support for tensorrt-llm (#765)
1 parent 1b769f4 commit ec21dae

File tree: 12 files changed, +124 -55 lines changed

cortex-js/src/infrastructure/commanders/models/model-start.command.ts

Lines changed: 2 additions & 1 deletion

@@ -13,6 +13,7 @@ import { InitCliUsecases } from '../usecases/init.cli.usecases';
 import { existsSync } from 'node:fs';
 import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
 import { join } from 'node:path';
+import { Engines } from '../types/engine.interface';
 
 type ModelStartOptions = {
   attach: boolean;
@@ -71,7 +72,7 @@ export class ModelStartCommand extends CommandRunner {
         engine,
       );
     }
-    if (engine === 'cortex.onnx' && process.platform !== 'win32') {
+    if (engine === Engines.onnx && process.platform !== 'win32') {
      console.error('The ONNX engine does not support this OS yet.');
      process.exit(1);
    }
cortex-js/src/infrastructure/commanders/types/engine.interface.ts (new file, per the import paths above)

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+export enum Engines {
+  llamaCPP = 'cortex.llamacpp',
+  onnx = 'cortex.onnx',
+  tensorrtLLM = 'cortex.tensorrt-llm',
+}
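The enum gives every call site a single source of truth for engine identifiers, replacing the bare strings previously scattered through the commands. A minimal sketch of validating an arbitrary string against it (the isKnownEngine guard is illustrative, not part of this commit):

  import { Engines } from '@/infrastructure/commanders/types/engine.interface';

  // Type guard: narrows a raw string to Engines when it matches one of the values.
  const isKnownEngine = (engine: string): engine is Engines =>
    (Object.values(Engines) as string[]).includes(engine);

  console.log(isKnownEngine('cortex.tensorrt-llm')); // true
  console.log(isKnownEngine('cortex.unknown')); // false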

cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts

Lines changed: 2 additions & 5 deletions

@@ -63,6 +63,8 @@ export class ChatCliUsecases {
     rl.on('line', sendCompletionMessage.bind(this));
 
     async function sendCompletionMessage(userInput: string) {
+      if (!userInput || userInput.trim() === '') return;
+
       if (userInput.trim() === this.exitClause) {
         rl.close();
         return;
@@ -98,12 +100,7 @@
         model: modelId,
         stream: true,
         max_tokens: 4098,
-        stop: [],
-        frequency_penalty: 0.7,
-        presence_penalty: 0.7,
         temperature: 0.7,
-        top_p: 0.7,
-
         // Override with model settings
         ...parser.parseModelInferenceParams(model),
       };
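Removing the hard-coded stop, penalty, and top_p defaults leans on object-spread ordering: keys returned by parser.parseModelInferenceParams(model) overwrite the literals above them, and a literal survives only when the model omits that key. A standalone sketch of that precedence (the names are illustrative):

  // Later spreads win: model-provided params override the CLI fallbacks,
  // while keys the model omits keep their fallback values.
  const defaults = { max_tokens: 4098, temperature: 0.7 };
  const modelParams = { temperature: 0.2, top_p: 0.9 };

  const request = { ...defaults, ...modelParams };
  console.log(request); // { max_tokens: 4098, temperature: 0.2, top_p: 0.9 }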

cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts

Lines changed: 17 additions & 13 deletions

@@ -12,11 +12,12 @@ import { rm } from 'fs/promises';
 import { exec } from 'child_process';
 import { appPath } from '@/utils/app-path';
 import {
-  CORTEX_ONNX_ENGINE_RELEASES_URL,
+  CORTEX_ENGINE_RELEASES_URL,
   CORTEX_RELEASES_URL,
   CUDA_DOWNLOAD_URL,
 } from '@/infrastructure/constants/cortex';
 import { checkNvidiaGPUExist, cudaVersion } from '@/utils/cuda';
+import { Engines } from '../types/engine.interface';
 
 @Injectable()
 export class InitCliUsecases {
@@ -70,13 +71,14 @@
     )
       await this.installLlamaCppEngine(options, version);
 
-    if (engine === 'cortex.onnx' && process.platform === 'win32')
-      await this.installONNXEngine();
-    else if (engine === 'cortex.onnx' && process.platform !== 'win32') {
+    if (engine === Engines.onnx && process.platform !== 'win32') {
       console.error('The ONNX engine does not support this OS yet.');
       process.exit(1);
     }
 
+    if (engine !== 'cortex.llamacpp')
+      await this.installAcceleratedEngine('latest', engine);
+
     configs.initialized = true;
     await this.fileManagerService.writeConfigFile(configs);
   };
@@ -305,17 +307,17 @@
   };
 
   /**
-   * Download and install ONNX engine
+   * Download and install accelerated engine
    * @param version
    * @param engineFileName
    */
-  private async installONNXEngine(
+  private async installAcceleratedEngine(
     version: string = 'latest',
-    engineFileName: string = 'windows-amd64',
+    engine: string = Engines.onnx,
   ) {
     const res = await firstValueFrom(
       this.httpService.get(
-        CORTEX_ONNX_ENGINE_RELEASES_URL +
+        CORTEX_ENGINE_RELEASES_URL(engine) +
           `${version === 'latest' ? '/latest' : ''}`,
         {
           headers: {
@@ -338,15 +340,17 @@
       );
     }
     const toDownloadAsset = release.assets.find((s: any) =>
-      s.name.includes(engineFileName),
+      s.name.includes(process.platform === 'win32' ? 'windows' : 'linux'),
     );
 
     if (!toDownloadAsset) {
-      console.log(`Could not find engine file ${engineFileName}`);
+      console.log(
+        `Could not find engine file for platform ${process.platform}`,
+      );
       exit(1);
     }
 
-    console.log(`Downloading ONNX engine file ${engineFileName}`);
+    console.log(`Downloading engine file ${toDownloadAsset.name}`);
     const dataFolderPath = await this.fileManagerService.getDataFolderPath();
     const engineDir = join(dataFolderPath, 'cortex-cpp');
 
@@ -397,10 +401,10 @@
     await rm(destination, { force: true });
 
     // Copy the additional files to the cortex-cpp directory
-    for (const file of readdirSync(join(engineDir, 'engines', 'cortex.onnx'))) {
+    for (const file of readdirSync(join(engineDir, 'engines', engine))) {
       if (file !== 'engine.dll') {
         await cpSync(
-          join(engineDir, 'engines', 'cortex.onnx', file),
+          join(engineDir, 'engines', engine, file),
           join(engineDir, file),
         );
       }
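The renamed installer works for any engine repo that publishes platform-tagged release assets: fetch the (latest) GitHub release for janhq/<engine>, then pick the asset whose name contains the host platform. A condensed, self-contained sketch of that flow under the same assumptions the diff makes; the ReleaseAsset shape is narrowed to the fields actually read, the Accept header is the standard GitHub API one (the diff's headers block is truncated), and global fetch assumes Node 18+:

  type ReleaseAsset = { name: string; browser_download_url: string };

  const CORTEX_ENGINE_RELEASES_URL = (engine: string) =>
    `https://api.github.com/repos/janhq/${engine}/releases`;

  // Pick the first release asset whose name mentions the current platform.
  const pickAssetForPlatform = (assets: ReleaseAsset[]) =>
    assets.find((a) =>
      a.name.includes(process.platform === 'win32' ? 'windows' : 'linux'),
    );

  async function latestEngineAsset(
    engine: string,
  ): Promise<ReleaseAsset | undefined> {
    const res = await fetch(`${CORTEX_ENGINE_RELEASES_URL(engine)}/latest`, {
      headers: { Accept: 'application/vnd.github+json' },
    });
    const release = (await res.json()) as { assets: ReleaseAsset[] };
    return pickAssetForPlatform(release.assets);
  }

  // latestEngineAsset('cortex.tensorrt-llm').then((a) => console.log(a?.name));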

cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts

Lines changed: 13 additions & 25 deletions

@@ -16,9 +16,10 @@ import { join, basename } from 'path';
 import { load } from 'js-yaml';
 import { existsSync, readdirSync, readFileSync } from 'fs';
 import { isLocalModel, normalizeModelId } from '@/utils/normalize-model-id';
-import { getHFModelMetadata } from '@/utils/huggingface';
+import { fetchJanRepoData, getHFModelMetadata } from '@/utils/huggingface';
 import { createWriteStream, mkdirSync, promises } from 'node:fs';
 import { firstValueFrom } from 'rxjs';
+import { Engines } from '../types/engine.interface';
 
 @Injectable()
 export class ModelsCliUsecases {
@@ -120,8 +121,8 @@
       process.exit(1);
     }
 
-    if (modelId.includes('onnx')) {
-      await this.pullOnnxModel(modelId);
+    if (modelId.includes('onnx') || modelId.includes('tensorrt')) {
+      await this.pullEngineModelFiles(modelId);
     } else {
       await this.pullGGUFModel(modelId);
       const bar = new SingleBar({}, Presets.shades_classic);
@@ -151,10 +152,10 @@
   }
 
   /**
-   * It's to pull ONNX model from HuggingFace repository
+   * Pull engine model files from a HuggingFace repository
    * @param modelId
    */
-  private async pullOnnxModel(modelId: string) {
+  private async pullEngineModelFiles(modelId: string) {
     const modelsContainerDir = await this.fileService.getModelsPath();
 
     if (!existsSync(modelsContainerDir)) {
@@ -164,35 +165,22 @@
     const modelFolder = join(modelsContainerDir, normalizeModelId(modelId));
     await promises.mkdir(modelFolder, { recursive: true }).catch(() => {});
 
-    const files = [
-      'genai_config.json',
-      'model.onnx',
-      'model.onnx.data',
-      'model.yml',
-      'special_tokens_map.json',
-      'tokenizer.json',
-      'tokenizer_config.json',
-    ];
-    const repo = modelId.split(':')[0];
-    const branch = modelId.split(':')[1] || 'default';
+    const files = (await fetchJanRepoData(modelId)).siblings;
     for (const file of files) {
-      console.log(`Downloading ${file}`);
+      console.log(`Downloading ${file.rfilename}`);
       const bar = new SingleBar({}, Presets.shades_classic);
       bar.start(100, 0);
       const response = await firstValueFrom(
-        this.httpService.get(
-          `https://huggingface.co/cortexhub/${repo}/resolve/${branch}/${file}?download=true`,
-          {
-            responseType: 'stream',
-          },
-        ),
+        this.httpService.get(file.downloadUrl ?? '', {
+          responseType: 'stream',
+        }),
       );
       if (!response) {
         throw new Error('Failed to download model');
       }
 
       await new Promise((resolve, reject) => {
-        const writer = createWriteStream(join(modelFolder, file));
+        const writer = createWriteStream(join(modelFolder, file.rfilename));
         let receivedBytes = 0;
         const totalBytes = response.headers['content-length'];
@@ -281,7 +269,7 @@
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
+      engine: Engines.llamaCPP,
     };
     if (!(await this.modelsUsecases.findOne(modelId)))
       await this.modelsUsecases.create(model);
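Swapping the hard-coded ONNX file list for fetchJanRepoData(modelId).siblings is the heart of the TensorRT-LLM support: the file manifest now comes from the model repo itself, so the same loop can pull any engine's artifacts. The shape the loop relies on can be written down as below; this interface is inferred from the two fields the diff reads (rfilename, downloadUrl), not copied from '@/utils/huggingface':

  // Inferred from usage; the authoritative types live in '@/utils/huggingface'.
  interface RepoSibling {
    rfilename: string; // file name within the repo
    downloadUrl?: string; // resolved download URL; may be absent
  }

  interface JanRepoData {
    siblings: RepoSibling[];
  }

  // The pull loop needs no per-engine file list:
  const filesToPull = (repo: JanRepoData): string[] =>
    repo.siblings
      .filter((s) => s.downloadUrl) // entries without a URL cannot be fetched
      .map((s) => s.rfilename);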

cortex-js/src/infrastructure/constants/benchmark.ts

Lines changed: 0 additions & 1 deletion

@@ -18,7 +18,6 @@ export const defaultBenchmarkConfiguration: BenchmarkConfig = {
   model: 'tinyllama',
   stream: true,
   max_tokens: 2048,
-  stop: [],
   frequency_penalty: 0,
   presence_penalty: 0,
   temperature: 0.7,

cortex-js/src/infrastructure/constants/cortex.ts

Lines changed: 2 additions & 2 deletions

@@ -42,8 +42,8 @@ export const CORTEX_JS_STOP_API_SERVER_URL = (
 export const CORTEX_RELEASES_URL =
   'https://api.github.com/repos/janhq/cortex/releases';
 
-export const CORTEX_ONNX_ENGINE_RELEASES_URL =
-  'https://api.github.com/repos/janhq/cortex.onnx/releases';
+export const CORTEX_ENGINE_RELEASES_URL = (engine: string) =>
+  `https://api.github.com/repos/janhq/${engine}/releases`;
 
 export const CUDA_DOWNLOAD_URL =
   'https://catalog.jan.ai/dist/cuda-dependencies/<version>/<platform>/cuda.tar.gz';
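Turning the constant into a template function keeps a single URL pattern for every engine repository; the results follow directly from substitution:

  CORTEX_ENGINE_RELEASES_URL('cortex.onnx');
  // => 'https://api.github.com/repos/janhq/cortex.onnx/releases'
  CORTEX_ENGINE_RELEASES_URL('cortex.tensorrt-llm');
  // => 'https://api.github.com/repos/janhq/cortex.tensorrt-llm/releases'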

cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts

Lines changed: 4 additions & 3 deletions

@@ -7,6 +7,7 @@ import { EngineExtension } from '@/domain/abstracts/engine.abstract';
 import { appPath } from '@/utils/app-path';
 import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
 import { existsSync } from 'fs';
+import { Engines } from '@/infrastructure/commanders/types/engine.interface';
 
 @Injectable()
 export class ExtensionRepositoryImpl implements ExtensionRepository {
@@ -18,9 +19,9 @@
     private readonly cortexProvider: EngineExtension,
     private readonly fileService: FileManagerService,
   ) {
-    this.extensions.set('cortex.llamacpp', this.cortexProvider);
-    this.extensions.set('cortex.onnx', this.cortexProvider);
-    this.extensions.set('cortex.tensorrt-llm', this.cortexProvider);
+    this.extensions.set(Engines.llamaCPP, this.cortexProvider);
+    this.extensions.set(Engines.onnx, this.cortexProvider);
+    this.extensions.set(Engines.tensorrtLLM, this.cortexProvider);
     this.loadCoreExtensions();
     this.loadExternalExtensions();
   }
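Since every key now comes from the enum, the map itself could be typed as Map<Engines, EngineExtension>, making a lookup with an unregistered string a compile-time error. A sketch of that tightening (a possible follow-up, not something this commit does; the stub classes stand in for the real abstractions):

  import { Engines } from '@/infrastructure/commanders/types/engine.interface';

  abstract class EngineExtension {} // stand-in for '@/domain/abstracts/engine.abstract'
  class CortexProvider extends EngineExtension {}

  const extensions = new Map<Engines, EngineExtension>();
  const cortexProvider = new CortexProvider();

  extensions.set(Engines.llamaCPP, cortexProvider);
  extensions.set(Engines.onnx, cortexProvider);
  extensions.set(Engines.tensorrtLLM, cortexProvider);

  // extensions.set('cortex.unknown', cortexProvider); // rejected by the compiler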

cortex-js/src/usecases/models/models.usecases.ts

Lines changed: 2 additions & 1 deletion

@@ -40,6 +40,7 @@ import { EventEmitter2 } from '@nestjs/event-emitter';
 import { ModelEvent, ModelId, ModelStatus } from '@/domain/models/model.event';
 import { DownloadManagerService } from '@/infrastructure/services/download-manager/download-manager.service';
 import { ContextService } from '@/infrastructure/services/context/context.service';
+import { Engines } from '@/infrastructure/commanders/types/engine.interface';
 
 @Injectable()
 export class ModelsUsecases {
@@ -466,7 +467,7 @@
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
+      engine: Engines.llamaCPP,
     };
     if (!(await this.findOne(modelId))) await this.create(model);
   }

cortex-js/src/utils/cuda.ts

Lines changed: 50 additions & 0 deletions

@@ -3,6 +3,13 @@ import { existsSync } from 'fs';
 import { delimiter } from 'path';
 import { checkFileExistenceInPaths } from './app-path';
 
+export type GpuSettingInfo = {
+  id: string;
+  vram: string;
+  name: string;
+  arch?: string;
+};
+
 /**
  * Return the CUDA version installed on the system
  * @returns CUDA Version 11 | 12
@@ -63,3 +70,46 @@ export const checkNvidiaGPUExist = (): Promise<boolean> => {
     });
   });
 };
+
+/**
+ * Get GPU information from the system
+ * @returns GPU information
+ */
+export const getGpuInfo = async (): Promise<GpuSettingInfo[]> =>
+  new Promise((resolve) => {
+    exec(
+      'nvidia-smi --query-gpu=index,memory.total,name --format=csv,noheader,nounits',
+      async (error, stdout) => {
+        if (!error) {
+          // Parse one CSV row per GPU, tracking the GPU with the most VRAM
+          let highestVram = 0;
+          let highestVramId = '0';
+          const gpus: GpuSettingInfo[] = stdout
+            .trim()
+            .split('\n')
+            .map((line) => {
+              let [id, vram, name] = line.split(', ');
+              const arch = getGpuArch(name);
+              vram = vram.replace(/\r/g, '');
+              if (parseFloat(vram) > highestVram) {
+                highestVram = parseFloat(vram);
+                highestVramId = id;
+              }
+              return { id, vram, name, arch };
+            });
+
+          resolve(gpus);
+        } else {
+          resolve([]);
+        }
+      },
+    );
+  });
+
+const getGpuArch = (gpuName: string): string => {
+  if (!gpuName.toLowerCase().includes('nvidia')) return 'unknown';
+
+  if (gpuName.includes('30')) return 'ampere';
+  else if (gpuName.includes('40')) return 'ada';
+  else return 'unknown';
+};
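A quick way to exercise the new helper; with --format=csv,noheader,nounits, nvidia-smi reports memory.total in MiB, and on machines without nvidia-smi the promise resolves to an empty array rather than rejecting. (Note that the highestVram/highestVramId bookkeeping inside getGpuInfo is computed but not yet surfaced in the return value.)

  import { getGpuInfo } from '@/utils/cuda';

  (async () => {
    const gpus = await getGpuInfo();
    if (gpus.length === 0) {
      console.log('No NVIDIA GPU detected (or nvidia-smi is missing).');
      return;
    }
    for (const gpu of gpus) {
      console.log(`#${gpu.id} ${gpu.name}: ${gpu.vram} MiB, arch ${gpu.arch ?? 'unknown'}`);
    }
  })();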
