Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit ea555ef

Browse files
committed
chore: update models settings
1 parent f380642 commit ea555ef

File tree

9 files changed

+302
-48
lines changed

9 files changed

+302
-48
lines changed

cortex-js/src/domain/models/model.interface.ts

Lines changed: 71 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
export interface ModelArtifact {
2-
mmproj?: string;
3-
llama_model_path?: string;
4-
}
5-
61
/**
72
* Model type defines the shape of a model object.
83
* @stored
@@ -90,6 +85,56 @@ export interface Model {
9085
*/
9186
cpu_threads?: number;
9287

88+
/**
89+
* The prompt to use for internal configuration
90+
*/
91+
pre_prompt?: string;
92+
93+
/**
94+
* The batch size for prompt eval step
95+
*/
96+
n_batch?: number;
97+
98+
/**
99+
* Whether to enable prompt caching
100+
*/
101+
caching_enabled?: boolean;
102+
103+
/**
104+
* Group attention factor in self-extend
105+
*/
106+
grp_attn_n?: number;
107+
108+
/**
109+
* Group attention width in self-extend
110+
*/
111+
grp_attn_w?: number;
112+
113+
/**
114+
* Prevent system swapping of the model to disk in macOS
115+
*/
116+
mlock?: boolean;
117+
118+
/**
119+
* You can constrain the sampling using GBNF grammars by providing the path to a grammar file
120+
*/
121+
grammar_file?: string;
122+
123+
/**
124+
* Whether to enable Flash Attention; the default is true
125+
*/
126+
flash_attn?: boolean;
127+
128+
/**
129+
* KV cache type: f16, q8_0, q4_0, default is f16
130+
*/
131+
cache_type?: string;
132+
133+
/**
134+
* Whether to enable mmap; the default is true
135+
*/
136+
use_mmap?: boolean;
137+
93138
/**
94139
* The model engine.
95140
*/
@@ -112,10 +157,20 @@ export interface ModelSettingParams {
112157
llama_model_path?: string;
113158
mmproj?: string;
114159
cont_batching?: boolean;
115-
vision_model?: boolean;
116-
text_model?: boolean;
117160
engine?: string;
118161
stop?: string[];
162+
pre_prompt?: string;
163+
n_batch?: number;
164+
caching_enabled?: boolean;
165+
grp_attn_n?: number;
166+
grp_attn_w?: number;
167+
mlock?: boolean;
168+
grammar_file?: string;
169+
model_type?: string;
170+
model_alias?: string;
171+
flash_attn?: boolean;
172+
cache_type?: string;
173+
use_mmap?: boolean;
119174
}
120175

121176
/**
@@ -133,3 +188,12 @@ export interface ModelRuntimeParams {
133188
presence_penalty?: number;
134189
engine?: string;
135190
}
191+
192+
/**
193+
* The model artifact object.
194+
* Used in case the model files are not a raw file list
195+
*/
196+
export interface ModelArtifact {
197+
mmproj?: string;
198+
llama_model_path?: string;
199+
}

cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ import {
1818
OPEN_CHAT_3_5_JINJA,
1919
ZEPHYR,
2020
ZEPHYR_JINJA,
21-
} from '../../constants/prompt-constants';
21+
} from './../../constants/prompt-constants';
22+
import {
23+
HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
24+
HUGGING_FACE_REPO_MODEL_API_URL,
25+
HUGGING_FACE_REPO_URL,
26+
HUGGING_FACE_TREE_REF_URL,
27+
} from '../../constants/huggingface';
2228
import { ModelTokenizer } from '../types/model-tokenizer.interface';
2329
import { HttpService } from '@nestjs/axios';
2430
import { firstValueFrom } from 'rxjs';
@@ -29,12 +35,6 @@ import { join, basename } from 'path';
2935
import { load } from 'js-yaml';
3036
import { existsSync, readdirSync, readFileSync } from 'fs';
3137
import { isLocalModel, normalizeModelId } from '../utils/normalize-model-id';
32-
import {
33-
HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
34-
HUGGING_FACE_REPO_MODEL_API_URL,
35-
HUGGING_FACE_REPO_URL,
36-
HUGGING_FACE_TREE_REF_URL,
37-
} from '../../constants/huggingface';
3838

3939
@Injectable()
4040
export class ModelsCliUsecases {

cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
1-
import { Injectable } from '@nestjs/common';
2-
import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
1+
import { HttpStatus, Injectable } from '@nestjs/common';
2+
import {
3+
CORTEX_CPP_MODELS_URL,
4+
defaultCortexCppHost,
5+
defaultCortexCppPort,
6+
} from '@/infrastructure/constants/cortex';
7+
import { HttpService } from '@nestjs/axios';
8+
import { firstValueFrom } from 'rxjs';
39

410
export interface ModelStat {
511
modelId: string;
@@ -15,6 +21,7 @@ interface ModelStatResponse {
1521
}
1622
@Injectable()
1723
export class PSCliUsecases {
24+
constructor(private readonly httpService: HttpService) {}
1825
/**
1926
* Get models running in the Cortex C++ server
2027
* @param host Cortex host address
@@ -25,32 +32,31 @@ export class PSCliUsecases {
2532
port: number = defaultCortexCppPort,
2633
): Promise<ModelStat[]> {
2734
return new Promise<ModelStat[]>((resolve, reject) =>
28-
fetch(`http://${host}:${port}/inferences/server/models`)
35+
firstValueFrom(this.httpService.get(CORTEX_CPP_MODELS_URL(host, port)))
2936
.then((res) => {
30-
if (res.ok) {
31-
res
32-
.json()
33-
.then(({ data }: ModelStatResponse) => {
34-
if (data && Array.isArray(data) && data.length > 0) {
35-
resolve(
36-
data.map((e) => {
37-
const startTime = e.start_time ?? new Date();
38-
const currentTime = new Date();
39-
const duration =
40-
currentTime.getTime() - new Date(startTime).getTime();
41-
return {
42-
modelId: e.id,
43-
engine: e.engine ?? 'cortex.llamacpp',
44-
status: 'running',
45-
duration: this.formatDuration(duration),
46-
ram: e.ram ?? '-',
47-
vram: e.vram ?? '-',
48-
};
49-
}),
50-
);
51-
} else reject();
52-
})
53-
.catch(reject);
37+
const data = res.data as ModelStatResponse;
38+
if (
39+
res.status === HttpStatus.OK &&
40+
data &&
41+
Array.isArray(data.data) &&
42+
data.data.length > 0
43+
) {
44+
resolve(
45+
data.data.map((e) => {
46+
const startTime = e.start_time ?? new Date();
47+
const currentTime = new Date();
48+
const duration =
49+
currentTime.getTime() - new Date(startTime).getTime();
50+
return {
51+
modelId: e.id,
52+
engine: e.engine ?? 'cortex.llamacpp',
53+
status: 'running',
54+
duration: this.formatDuration(duration),
55+
ram: e.ram ?? '-',
56+
vram: e.vram ?? '-',
57+
};
58+
}),
59+
);
5460
} else reject();
5561
})
5662
.catch(reject),

cortex-js/src/infrastructure/constants/cortex.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ export const CORTEX_CPP_HEALTH_Z_URL = (
2323
port: number = defaultCortexCppPort,
2424
) => `http://${host}:${port}/healthz`;
2525

26+
export const CORTEX_CPP_MODELS_URL = (
27+
host: string = defaultCortexCppHost,
28+
port: number = defaultCortexCppPort,
29+
) => `http://${host}:${port}/inferences/server/models`;
30+
2631
// INITIALIZATION
2732
export const CORTEX_RELEASES_URL =
2833
'https://api.github.com/repos/janhq/cortex/releases';

cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import { ApiProperty } from '@nestjs/swagger';
22
import { IsIP, IsNumber, IsString, Max, Min } from 'class-validator';
3-
import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
3+
import {
4+
defaultCortexCppHost,
5+
defaultCortexCppPort,
6+
} from '@/infrastructure/constants/cortex';
47

58
export class StartCortexDto {
69
@ApiProperty({

cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
import { ModelSettingParams } from '@/domain/models/model.interface';
22
import { ApiProperty } from '@nestjs/swagger';
3-
import { IsArray, IsNumber, IsOptional, Min } from 'class-validator';
3+
import {
4+
IsArray,
5+
IsBoolean,
6+
IsNumber,
7+
IsOptional,
8+
IsString,
9+
Min,
10+
} from 'class-validator';
411

512
export class ModelSettingsDto implements ModelSettingParams {
613
// Prompt Settings
@@ -47,6 +54,85 @@ export class ModelSettingsDto implements ModelSettingParams {
4754
@Min(1)
4855
cpu_threads?: number;
4956

57+
@ApiProperty({
58+
description: 'The prompt to use for internal configuration',
59+
})
60+
@IsOptional()
61+
@IsString()
62+
pre_prompt?: string;
63+
64+
@ApiProperty({
65+
description: 'The batch size for prompt eval step',
66+
example: 512,
67+
})
68+
@IsOptional()
69+
@IsNumber()
70+
n_batch?: number;
71+
72+
@ApiProperty({
73+
description: 'To enable prompt caching or not',
74+
example: true,
75+
})
76+
@IsOptional()
77+
@IsBoolean()
78+
caching_enabled?: boolean;
79+
80+
@ApiProperty({
81+
description: 'Group attention factor in self-extend',
82+
example: 1,
83+
})
84+
@IsOptional()
85+
@IsNumber()
86+
grp_attn_n?: number;
87+
88+
@ApiProperty({
89+
description: 'Group attention width in self-extend',
90+
example: 512,
91+
})
92+
@IsOptional()
93+
@IsNumber()
94+
grp_attn_w?: number;
95+
96+
@ApiProperty({
97+
description: 'Prevent system swapping of the model to disk in macOS',
98+
example: false,
99+
})
100+
@IsOptional()
101+
@IsBoolean()
102+
mlock?: boolean;
103+
104+
@ApiProperty({
105+
description:
106+
'You can constrain the sampling using GBNF grammars by providing path to a grammar file',
107+
})
108+
@IsOptional()
109+
@IsString()
110+
grammar_file?: string;
111+
112+
@ApiProperty({
113+
description: 'To enable Flash Attention, default is true',
114+
example: true,
115+
})
116+
@IsOptional()
117+
@IsBoolean()
118+
flash_attn?: boolean;
119+
120+
@ApiProperty({
121+
description: 'KV cache type: f16, q8_0, q4_0, default is f16',
122+
example: 'f16',
123+
})
124+
@IsOptional()
125+
@IsString()
126+
cache_type?: string;
127+
128+
@ApiProperty({
129+
description: 'To enable mmap, default is true',
130+
example: true,
131+
})
132+
@IsOptional()
133+
@IsBoolean()
134+
use_mmap?: boolean;
135+
50136
@ApiProperty({
51137
example: 'cortex.llamacpp',
52138
description: 'The engine to use.',

0 commit comments

Comments
 (0)