Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 3dd6d0a

Browse files
committed
chore: update models settings
1 parent f380642 commit 3dd6d0a

File tree

9 files changed

+294
-47
lines changed

9 files changed

+294
-47
lines changed

cortex-js/src/domain/models/model.interface.ts

Lines changed: 71 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
export interface ModelArtifact {
2-
mmproj?: string;
3-
llama_model_path?: string;
4-
}
5-
61
/**
72
* Model type defines the shape of a model object.
83
* @stored
@@ -90,6 +85,56 @@ export interface Model {
9085
*/
9186
cpu_threads?: number;
9287

88+
/**
89+
* The prompt to use for internal configuration
90+
*/
91+
pre_prompt?: string;
92+
93+
/**
94+
* The batch size for prompt eval step
95+
*/
96+
n_batch?: number;
97+
98+
/**
99+
* To enable prompt caching or not
100+
*/
101+
caching_enabled?: boolean;
102+
103+
/**
104+
* Group attention factor in self-extend
105+
*/
106+
grp_attn_n?: number;
107+
108+
/**
109+
* Group attention width in self-extend
110+
*/
111+
grp_attn_w?: number;
112+
113+
/**
114+
* Prevent system swapping of the model to disk in macOS
115+
*/
116+
mlock?: boolean;
117+
118+
/**
119+
* You can constrain the sampling using GBNF grammars by providing a path to a grammar file
120+
*/
121+
grammar_file?: string;
122+
123+
/**
124+
* To enable Flash Attention, default is true
125+
*/
126+
flash_attn?: boolean;
127+
128+
/**
129+
* KV cache type: f16, q8_0, q4_0, default is f16
130+
*/
131+
cache_type?: string;
132+
133+
/**
134+
* To enable mmap, default is true
135+
*/
136+
use_mmap?: boolean;
137+
93138
/**
94139
* The model engine.
95140
*/
@@ -112,10 +157,20 @@ export interface ModelSettingParams {
112157
llama_model_path?: string;
113158
mmproj?: string;
114159
cont_batching?: boolean;
115-
vision_model?: boolean;
116-
text_model?: boolean;
117160
engine?: string;
118161
stop?: string[];
162+
pre_prompt?: string;
163+
n_batch?: number;
164+
caching_enabled?: boolean;
165+
grp_attn_n?: number;
166+
grp_attn_w?: number;
167+
mlock?: boolean;
168+
grammar_file?: string;
169+
model_type?: string;
170+
model_alias?: string;
171+
flash_attn?: boolean;
172+
cache_type?: string;
173+
use_mmap?: boolean;
119174
}
120175

121176
/**
@@ -133,3 +188,12 @@ export interface ModelRuntimeParams {
133188
presence_penalty?: number;
134189
engine?: string;
135190
}
191+
192+
/**
193+
* The model artifact object.
194+
* In case the model files are not a raw file list
195+
*/
196+
export interface ModelArtifact {
197+
mmproj?: string;
198+
llama_model_path?: string;
199+
}

cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ import {
1818
OPEN_CHAT_3_5_JINJA,
1919
ZEPHYR,
2020
ZEPHYR_JINJA,
21-
} from '../../constants/prompt-constants';
21+
} from './../../constants/prompt-constants';
22+
import {
23+
HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
24+
HUGGING_FACE_REPO_MODEL_API_URL,
25+
HUGGING_FACE_REPO_URL,
26+
HUGGING_FACE_TREE_REF_URL,
27+
} from '../../constants/huggingface';
2228
import { ModelTokenizer } from '../types/model-tokenizer.interface';
2329
import { HttpService } from '@nestjs/axios';
2430
import { firstValueFrom } from 'rxjs';
@@ -29,12 +35,6 @@ import { join, basename } from 'path';
2935
import { load } from 'js-yaml';
3036
import { existsSync, readdirSync, readFileSync } from 'fs';
3137
import { isLocalModel, normalizeModelId } from '../utils/normalize-model-id';
32-
import {
33-
HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
34-
HUGGING_FACE_REPO_MODEL_API_URL,
35-
HUGGING_FACE_REPO_URL,
36-
HUGGING_FACE_TREE_REF_URL,
37-
} from '../../constants/huggingface';
3838

3939
@Injectable()
4040
export class ModelsCliUsecases {

cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
1-
import { Injectable } from '@nestjs/common';
2-
import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
1+
import { HttpStatus, Injectable } from '@nestjs/common';
2+
import {
3+
CORTEX_CPP_MODELS_URL,
4+
defaultCortexCppHost,
5+
defaultCortexCppPort,
6+
} from '@/infrastructure/constants/cortex';
7+
import { HttpService } from '@nestjs/axios';
8+
import { firstValueFrom } from 'rxjs';
39

410
export interface ModelStat {
511
modelId: string;
@@ -15,6 +21,7 @@ interface ModelStatResponse {
1521
}
1622
@Injectable()
1723
export class PSCliUsecases {
24+
constructor(private readonly httpService: HttpService) {}
1825
/**
1926
* Get models running in the Cortex C++ server
2027
* @param host Cortex host address
@@ -25,32 +32,31 @@ export class PSCliUsecases {
2532
port: number = defaultCortexCppPort,
2633
): Promise<ModelStat[]> {
2734
return new Promise<ModelStat[]>((resolve, reject) =>
28-
fetch(`http://${host}:${port}/inferences/server/models`)
35+
firstValueFrom(this.httpService.get(CORTEX_CPP_MODELS_URL(host, port)))
2936
.then((res) => {
30-
if (res.ok) {
31-
res
32-
.json()
33-
.then(({ data }: ModelStatResponse) => {
34-
if (data && Array.isArray(data) && data.length > 0) {
35-
resolve(
36-
data.map((e) => {
37-
const startTime = e.start_time ?? new Date();
38-
const currentTime = new Date();
39-
const duration =
40-
currentTime.getTime() - new Date(startTime).getTime();
41-
return {
42-
modelId: e.id,
43-
engine: e.engine ?? 'cortex.llamacpp',
44-
status: 'running',
45-
duration: this.formatDuration(duration),
46-
ram: e.ram ?? '-',
47-
vram: e.vram ?? '-',
48-
};
49-
}),
50-
);
51-
} else reject();
52-
})
53-
.catch(reject);
37+
const data = res.data as ModelStatResponse;
38+
if (
39+
res.status === HttpStatus.OK &&
40+
data &&
41+
Array.isArray(data.data) &&
42+
data.data.length > 0
43+
) {
44+
resolve(
45+
data.data.map((e) => {
46+
const startTime = e.start_time ?? new Date();
47+
const currentTime = new Date();
48+
const duration =
49+
currentTime.getTime() - new Date(startTime).getTime();
50+
return {
51+
modelId: e.id,
52+
engine: e.engine ?? 'cortex.llamacpp',
53+
status: 'running',
54+
duration: this.formatDuration(duration),
55+
ram: e.ram ?? '-',
56+
vram: e.vram ?? '-',
57+
};
58+
}),
59+
);
5460
} else reject();
5561
})
5662
.catch(reject),

cortex-js/src/infrastructure/constants/cortex.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ export const CORTEX_CPP_HEALTH_Z_URL = (
2323
port: number = defaultCortexCppPort,
2424
) => `http://${host}:${port}/healthz`;
2525

26+
export const CORTEX_CPP_MODELS_URL = (
27+
host: string = defaultCortexCppHost,
28+
port: number = defaultCortexCppPort,
29+
) => `http://${host}:${port}/inferences/server/models`;
30+
2631
// INITIALIZATION
2732
export const CORTEX_RELEASES_URL =
2833
'https://api.github.com/repos/janhq/cortex/releases';

cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import { ApiProperty } from '@nestjs/swagger';
22
import { IsIP, IsNumber, IsString, Max, Min } from 'class-validator';
3-
import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
3+
import {
4+
defaultCortexCppHost,
5+
defaultCortexCppPort,
6+
} from '@/infrastructure/constants/cortex';
47

58
export class StartCortexDto {
69
@ApiProperty({

cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,85 @@ export class ModelSettingsDto implements ModelSettingParams {
4747
@Min(1)
4848
cpu_threads?: number;
4949

50+
@ApiProperty({
51+
description: 'The prompt to use for internal configuration',
52+
})
53+
@IsOptional()
54+
@IsString()
55+
pre_prompt?: string;
56+
57+
@ApiProperty({
58+
description: 'The batch size for prompt eval step',
59+
example: 512,
60+
})
61+
@IsOptional()
62+
@IsNumber()
63+
n_batch?: number;
64+
65+
@ApiProperty({
66+
description: 'To enable prompt caching or not',
67+
example: true,
68+
})
69+
@IsOptional()
70+
@IsBoolean()
71+
caching_enabled?: boolean;
72+
73+
@ApiProperty({
74+
description: 'Group attention factor in self-extend',
75+
example: 1,
76+
})
77+
@IsOptional()
78+
@IsNumber()
79+
grp_attn_n?: number;
80+
81+
@ApiProperty({
82+
description: 'Group attention width in self-extend',
83+
example: 512,
84+
})
85+
@IsOptional()
86+
@IsNumber()
87+
grp_attn_w?: number;
88+
89+
@ApiProperty({
90+
description: 'Prevent system swapping of the model to disk in macOS',
91+
example: false,
92+
})
93+
@IsOptional()
94+
@IsBoolean()
95+
mlock?: boolean;
96+
97+
@ApiProperty({
98+
description:
99+
'You can constrain the sampling using GBNF grammars by providing path to a grammar file',
100+
})
101+
@IsOptional()
102+
@IsString()
103+
grammar_file?: string;
104+
105+
@ApiProperty({
106+
description: 'To enable Flash Attention, default is true',
107+
example: true,
108+
})
109+
@IsOptional()
110+
@IsBoolean()
111+
flash_attn?: boolean;
112+
113+
@ApiProperty({
114+
description: 'KV cache type: f16, q8_0, q4_0, default is f16',
115+
example: 'f16',
116+
})
117+
@IsOptional()
118+
@IsString()
119+
cache_type?: string;
120+
121+
@ApiProperty({
122+
description: 'To enable mmap, default is true',
123+
example: true,
124+
})
125+
@IsOptional()
126+
@IsBoolean()
127+
use_mmap?: boolean;
128+
50129
@ApiProperty({
51130
example: 'cortex.llamacpp',
52131
description: 'The engine to use.',

0 commit comments

Comments
 (0)