Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 3dd6d0a

Browse files
committed
chore: update models settings
1 parent f380642 commit 3dd6d0a

File tree

9 files changed

+294
-47
lines changed

9 files changed

+294
-47
lines changed

cortex-js/src/domain/models/model.interface.ts

Lines changed: 71 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
export interface ModelArtifact {
2-
mmproj?: string;
3-
llama_model_path?: string;
4-
}
5-
61
/**
72
* Model type defines the shape of a model object.
83
* @stored
@@ -90,6 +85,56 @@ export interface Model {
9085
*/
9186
cpu_threads?: number;
9287

88+
/**
89+
* The prompt to use for internal configuration
90+
*/
91+
pre_prompt?: string;
92+
93+
/**
94+
* The batch size for prompt eval step
95+
*/
96+
n_batch?: number;
97+
98+
/**
99+
* To enable prompt caching or not
100+
*/
101+
caching_enabled?: boolean;
102+
103+
/**
104+
* Group attention factor in self-extend
105+
*/
106+
grp_attn_n?: number;
107+
108+
/**
109+
* Group attention width in self-extend
110+
*/
111+
grp_attn_w?: number;
112+
113+
/**
114+
* Prevent system swapping of the model to disk in macOS
115+
*/
116+
mlock?: boolean;
117+
118+
/**
119+
* You can constrain the sampling using GBNF grammars by providing a path to a grammar file
120+
*/
121+
grammar_file?: string;
122+
123+
/**
124+
* To enable Flash Attention, default is true
125+
*/
126+
flash_attn?: boolean;
127+
128+
/**
129+
* KV cache type: f16, q8_0, q4_0, default is f16
130+
*/
131+
cache_type?: string;
132+
133+
/**
134+
* To enable mmap, default is true
135+
*/
136+
use_mmap?: boolean;
137+
93138
/**
94139
* The model engine.
95140
*/
@@ -112,10 +157,20 @@ export interface ModelSettingParams {
112157
llama_model_path?: string;
113158
mmproj?: string;
114159
cont_batching?: boolean;
115-
vision_model?: boolean;
116-
text_model?: boolean;
117160
engine?: string;
118161
stop?: string[];
162+
pre_prompt?: string;
163+
n_batch?: number;
164+
caching_enabled?: boolean;
165+
grp_attn_n?: number;
166+
grp_attn_w?: number;
167+
mlock?: boolean;
168+
grammar_file?: string;
169+
model_type?: string;
170+
model_alias?: string;
171+
flash_attn?: boolean;
172+
cache_type?: string;
173+
use_mmap?: boolean;
119174
}
120175

121176
/**
@@ -133,3 +188,12 @@ export interface ModelRuntimeParams {
133188
presence_penalty?: number;
134189
engine?: string;
135190
}
191+
192+
/**
193+
* The model artifact object.
194+
* In case the model files are not a raw file list
195+
*/
196+
export interface ModelArtifact {
197+
mmproj?: string;
198+
llama_model_path?: string;
199+
}

cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ import {
1818
OPEN_CHAT_3_5_JINJA,
1919
ZEPHYR,
2020
ZEPHYR_JINJA,
21-
} from '../../constants/prompt-constants';
21+
} from './../../constants/prompt-constants';
22+
import {
23+
HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
24+
HUGGING_FACE_REPO_MODEL_API_URL,
25+
HUGGING_FACE_REPO_URL,
26+
HUGGING_FACE_TREE_REF_URL,
27+
} from '../../constants/huggingface';
2228
import { ModelTokenizer } from '../types/model-tokenizer.interface';
2329
import { HttpService } from '@nestjs/axios';
2430
import { firstValueFrom } from 'rxjs';
@@ -29,12 +35,6 @@ import { join, basename } from 'path';
2935
import { load } from 'js-yaml';
3036
import { existsSync, readdirSync, readFileSync } from 'fs';
3137
import { isLocalModel, normalizeModelId } from '../utils/normalize-model-id';
32-
import {
33-
HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
34-
HUGGING_FACE_REPO_MODEL_API_URL,
35-
HUGGING_FACE_REPO_URL,
36-
HUGGING_FACE_TREE_REF_URL,
37-
} from '../../constants/huggingface';
3838

3939
@Injectable()
4040
export class ModelsCliUsecases {

cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
1-
import { Injectable } from '@nestjs/common';
2-
import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
1+
import { HttpStatus, Injectable } from '@nestjs/common';
2+
import {
3+
CORTEX_CPP_MODELS_URL,
4+
defaultCortexCppHost,
5+
defaultCortexCppPort,
6+
} from '@/infrastructure/constants/cortex';
7+
import { HttpService } from '@nestjs/axios';
8+
import { firstValueFrom } from 'rxjs';
39

410
export interface ModelStat {
511
modelId: string;
@@ -15,6 +21,7 @@ interface ModelStatResponse {
1521
}
1622
@Injectable()
1723
export class PSCliUsecases {
24+
constructor(private readonly httpService: HttpService) {}
1825
/**
1926
* Get models running in the Cortex C++ server
2027
* @param host Cortex host address
@@ -25,32 +32,31 @@ export class PSCliUsecases {
2532
port: number = defaultCortexCppPort,
2633
): Promise<ModelStat[]> {
2734
return new Promise<ModelStat[]>((resolve, reject) =>
28-
fetch(`http://${host}:${port}/inferences/server/models`)
35+
firstValueFrom(this.httpService.get(CORTEX_CPP_MODELS_URL(host, port)))
2936
.then((res) => {
30-
if (res.ok) {
31-
res
32-
.json()
33-
.then(({ data }: ModelStatResponse) => {
34-
if (data && Array.isArray(data) && data.length > 0) {
35-
resolve(
36-
data.map((e) => {
37-
const startTime = e.start_time ?? new Date();
38-
const currentTime = new Date();
39-
const duration =
40-
currentTime.getTime() - new Date(startTime).getTime();
41-
return {
42-
modelId: e.id,
43-
engine: e.engine ?? 'cortex.llamacpp',
44-
status: 'running',
45-
duration: this.formatDuration(duration),
46-
ram: e.ram ?? '-',
47-
vram: e.vram ?? '-',
48-
};
49-
}),
50-
);
51-
} else reject();
52-
})
53-
.catch(reject);
37+
const data = res.data as ModelStatResponse;
38+
if (
39+
res.status === HttpStatus.OK &&
40+
data &&
41+
Array.isArray(data.data) &&
42+
data.data.length > 0
43+
) {
44+
resolve(
45+
data.data.map((e) => {
46+
const startTime = e.start_time ?? new Date();
47+
const currentTime = new Date();
48+
const duration =
49+
currentTime.getTime() - new Date(startTime).getTime();
50+
return {
51+
modelId: e.id,
52+
engine: e.engine ?? 'cortex.llamacpp',
53+
status: 'running',
54+
duration: this.formatDuration(duration),
55+
ram: e.ram ?? '-',
56+
vram: e.vram ?? '-',
57+
};
58+
}),
59+
);
5460
} else reject();
5561
})
5662
.catch(reject),

cortex-js/src/infrastructure/constants/cortex.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ export const CORTEX_CPP_HEALTH_Z_URL = (
2323
port: number = defaultCortexCppPort,
2424
) => `http://${host}:${port}/healthz`;
2525

26+
export const CORTEX_CPP_MODELS_URL = (
27+
host: string = defaultCortexCppHost,
28+
port: number = defaultCortexCppPort,
29+
) => `http://${host}:${port}/inferences/server/models`;
30+
2631
// INITIALIZATION
2732
export const CORTEX_RELEASES_URL =
2833
'https://api.github.com/repos/janhq/cortex/releases';

cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import { ApiProperty } from '@nestjs/swagger';
22
import { IsIP, IsNumber, IsString, Max, Min } from 'class-validator';
3-
import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
3+
import {
4+
defaultCortexCppHost,
5+
defaultCortexCppPort,
6+
} from '@/infrastructure/constants/cortex';
47

58
export class StartCortexDto {
69
@ApiProperty({

cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,85 @@ export class ModelSettingsDto implements ModelSettingParams {
4747
@Min(1)
4848
cpu_threads?: number;
4949

50+
@ApiProperty({
51+
description: 'The prompt to use for internal configuration',
52+
})
53+
@IsOptional()
54+
@IsString()
55+
pre_prompt?: string;
56+
57+
@ApiProperty({
58+
description: 'The batch size for prompt eval step',
59+
example: 512,
60+
})
61+
@IsOptional()
62+
@IsNumber()
63+
n_batch?: number;
64+
65+
@ApiProperty({
66+
description: 'To enable prompt caching or not',
67+
example: true,
68+
})
69+
@IsOptional()
70+
@IsBoolean()
71+
caching_enabled?: boolean;
72+
73+
@ApiProperty({
74+
description: 'Group attention factor in self-extend',
75+
example: 1,
76+
})
77+
@IsOptional()
78+
@IsNumber()
79+
grp_attn_n?: number;
80+
81+
@ApiProperty({
82+
description: 'Group attention width in self-extend',
83+
example: 512,
84+
})
85+
@IsOptional()
86+
@IsNumber()
87+
grp_attn_w?: number;
88+
89+
@ApiProperty({
90+
description: 'Prevent system swapping of the model to disk in macOS',
91+
example: false,
92+
})
93+
@IsOptional()
94+
@IsBoolean()
95+
mlock?: boolean;
96+
97+
@ApiProperty({
98+
description:
99+
'You can constrain the sampling using GBNF grammars by providing path to a grammar file',
100+
})
101+
@IsOptional()
102+
@IsString()
103+
grammar_file?: string;
104+
105+
@ApiProperty({
106+
description: 'To enable Flash Attention, default is true',
107+
example: true,
108+
})
109+
@IsOptional()
110+
@IsBoolean()
111+
flash_attn?: boolean;
112+
113+
@ApiProperty({
114+
description: 'KV cache type: f16, q8_0, q4_0, default is f16',
115+
example: 'f16',
116+
})
117+
@IsOptional()
118+
@IsString()
119+
cache_type?: string;
120+
121+
@ApiProperty({
122+
description: 'To enable mmap, default is true',
123+
example: true,
124+
})
125+
@IsOptional()
126+
@IsBoolean()
127+
use_mmap?: boolean;
128+
50129
@ApiProperty({
51130
example: 'cortex.llamacpp',
52131
description: 'The engine to use.',

0 commit comments

Comments
 (0)