Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 9b2c8bb

Browse files
authored
chore: update models settings (#673)
1 parent f380642 commit 9b2c8bb

File tree

10 files changed

+317
-55
lines changed

10 files changed

+317
-55
lines changed

cortex-js/src/domain/models/model.interface.ts

Lines changed: 71 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
export interface ModelArtifact {
2-
mmproj?: string;
3-
llama_model_path?: string;
4-
}
5-
61
/**
72
* Model type defines the shape of a model object.
83
* @stored
@@ -90,6 +85,56 @@ export interface Model {
9085
*/
9186
cpu_threads?: number;
9287

88+
/**
89+
* The prompt to use for internal configuration
90+
*/
91+
pre_prompt?: string;
92+
93+
/**
94+
* The batch size for prompt eval step
95+
*/
96+
n_batch?: number;
97+
98+
/**
99+
* To enable prompt caching or not
100+
*/
101+
caching_enabled?: boolean;
102+
103+
/**
104+
* Group attention factor in self-extend
105+
*/
106+
grp_attn_n?: number;
107+
108+
/**
109+
* Group attention width in self-extend
110+
*/
111+
grp_attn_w?: number;
112+
113+
/**
114+
* Prevent system swapping of the model to disk in macOS
115+
*/
116+
mlock?: boolean;
117+
118+
/**
119+
 * You can constrain the sampling using GBNF grammars by providing a path to a grammar file
120+
*/
121+
grammar_file?: string;
122+
123+
/**
124+
* To enable Flash Attention, default is true
125+
*/
126+
flash_attn?: boolean;
127+
128+
/**
129+
* KV cache type: f16, q8_0, q4_0, default is f16
130+
*/
131+
cache_type?: string;
132+
133+
/**
134+
* To enable mmap, default is true
135+
*/
136+
use_mmap?: boolean;
137+
93138
/**
94139
* The model engine.
95140
*/
@@ -112,10 +157,20 @@ export interface ModelSettingParams {
112157
llama_model_path?: string;
113158
mmproj?: string;
114159
cont_batching?: boolean;
115-
vision_model?: boolean;
116-
text_model?: boolean;
117160
engine?: string;
118161
stop?: string[];
162+
pre_prompt?: string;
163+
n_batch?: number;
164+
caching_enabled?: boolean;
165+
grp_attn_n?: number;
166+
grp_attn_w?: number;
167+
mlock?: boolean;
168+
grammar_file?: string;
169+
model_type?: string;
170+
model_alias?: string;
171+
flash_attn?: boolean;
172+
cache_type?: string;
173+
use_mmap?: boolean;
119174
}
120175

121176
/**
@@ -133,3 +188,12 @@ export interface ModelRuntimeParams {
133188
presence_penalty?: number;
134189
engine?: string;
135190
}
191+
192+
/**
193+
* The model artifact object.
194+
 * In case the model files are not a raw file list
195+
*/
196+
export interface ModelArtifact {
197+
mmproj?: string;
198+
llama_model_path?: string;
199+
}

cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ import {
1818
OPEN_CHAT_3_5_JINJA,
1919
ZEPHYR,
2020
ZEPHYR_JINJA,
21-
} from '../../constants/prompt-constants';
21+
} from './../../constants/prompt-constants';
22+
import {
23+
HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
24+
HUGGING_FACE_REPO_MODEL_API_URL,
25+
HUGGING_FACE_REPO_URL,
26+
HUGGING_FACE_TREE_REF_URL,
27+
} from '../../constants/huggingface';
2228
import { ModelTokenizer } from '../types/model-tokenizer.interface';
2329
import { HttpService } from '@nestjs/axios';
2430
import { firstValueFrom } from 'rxjs';
@@ -29,12 +35,6 @@ import { join, basename } from 'path';
2935
import { load } from 'js-yaml';
3036
import { existsSync, readdirSync, readFileSync } from 'fs';
3137
import { isLocalModel, normalizeModelId } from '../utils/normalize-model-id';
32-
import {
33-
HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
34-
HUGGING_FACE_REPO_MODEL_API_URL,
35-
HUGGING_FACE_REPO_URL,
36-
HUGGING_FACE_TREE_REF_URL,
37-
} from '../../constants/huggingface';
3838

3939
@Injectable()
4040
export class ModelsCliUsecases {

cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
1-
import { Injectable } from '@nestjs/common';
2-
import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
1+
import { HttpStatus, Injectable } from '@nestjs/common';
2+
import {
3+
CORTEX_CPP_MODELS_URL,
4+
defaultCortexCppHost,
5+
defaultCortexCppPort,
6+
} from '@/infrastructure/constants/cortex';
7+
import { HttpService } from '@nestjs/axios';
8+
import { firstValueFrom } from 'rxjs';
39

410
export interface ModelStat {
511
modelId: string;
@@ -15,6 +21,7 @@ interface ModelStatResponse {
1521
}
1622
@Injectable()
1723
export class PSCliUsecases {
24+
constructor(private readonly httpService: HttpService) {}
1825
/**
1926
* Get models running in the Cortex C++ server
2027
* @param host Cortex host address
@@ -25,32 +32,31 @@ export class PSCliUsecases {
2532
port: number = defaultCortexCppPort,
2633
): Promise<ModelStat[]> {
2734
return new Promise<ModelStat[]>((resolve, reject) =>
28-
fetch(`http://${host}:${port}/inferences/server/models`)
35+
firstValueFrom(this.httpService.get(CORTEX_CPP_MODELS_URL(host, port)))
2936
.then((res) => {
30-
if (res.ok) {
31-
res
32-
.json()
33-
.then(({ data }: ModelStatResponse) => {
34-
if (data && Array.isArray(data) && data.length > 0) {
35-
resolve(
36-
data.map((e) => {
37-
const startTime = e.start_time ?? new Date();
38-
const currentTime = new Date();
39-
const duration =
40-
currentTime.getTime() - new Date(startTime).getTime();
41-
return {
42-
modelId: e.id,
43-
engine: e.engine ?? 'cortex.llamacpp',
44-
status: 'running',
45-
duration: this.formatDuration(duration),
46-
ram: e.ram ?? '-',
47-
vram: e.vram ?? '-',
48-
};
49-
}),
50-
);
51-
} else reject();
52-
})
53-
.catch(reject);
37+
const data = res.data as ModelStatResponse;
38+
if (
39+
res.status === HttpStatus.OK &&
40+
data &&
41+
Array.isArray(data.data) &&
42+
data.data.length > 0
43+
) {
44+
resolve(
45+
data.data.map((e) => {
46+
const startTime = e.start_time ?? new Date();
47+
const currentTime = new Date();
48+
const duration =
49+
currentTime.getTime() - new Date(startTime).getTime();
50+
return {
51+
modelId: e.id,
52+
engine: e.engine ?? 'cortex.llamacpp',
53+
status: 'running',
54+
duration: this.formatDuration(duration),
55+
ram: e.ram ?? '-',
56+
vram: e.vram ?? '-',
57+
};
58+
}),
59+
);
5460
} else reject();
5561
})
5662
.catch(reject),

cortex-js/src/infrastructure/constants/cortex.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ export const CORTEX_CPP_HEALTH_Z_URL = (
2323
port: number = defaultCortexCppPort,
2424
) => `http://${host}:${port}/healthz`;
2525

26+
export const CORTEX_CPP_MODELS_URL = (
27+
host: string = defaultCortexCppHost,
28+
port: number = defaultCortexCppPort,
29+
) => `http://${host}:${port}/inferences/server/models`;
30+
2631
// INITIALIZATION
2732
export const CORTEX_RELEASES_URL =
2833
'https://api.github.com/repos/janhq/cortex/releases';

cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import {
22
IsArray,
33
IsBoolean,
44
IsNumber,
5+
IsOptional,
56
IsString,
67
ValidateNested,
78
} from 'class-validator';
@@ -29,46 +30,53 @@ export class CreateChatCompletionDto {
2930
description:
3031
'Determines the format for output generation. If set to `true`, the output is generated continuously, allowing for real-time streaming of responses. If set to `false`, the output is delivered in a single JSON file.',
3132
})
33+
@IsOptional()
3234
@IsBoolean()
33-
stream: boolean;
35+
stream?: boolean;
3436

3537
@ApiProperty({
3638
description:
3739
'Sets the upper limit on the number of tokens the model can generate in a single output.',
3840
})
41+
@IsOptional()
3942
@IsNumber()
40-
max_tokens: number;
43+
max_tokens?: number;
4144

4245
@ApiProperty({
4346
description:
4447
'Defines specific tokens or phrases that signal the model to stop producing further output.',
4548
})
49+
@IsOptional()
4650
@IsArray()
47-
stop: string[];
51+
stop?: string[];
4852

4953
@ApiProperty({
5054
description:
5155
'Modifies the likelihood of the model repeating the same words or phrases within a single output.',
5256
})
57+
@IsOptional()
5358
@IsNumber()
54-
frequency_penalty: number;
59+
frequency_penalty?: number;
5560

5661
@ApiProperty({
5762
description:
5863
'Reduces the likelihood of repeating tokens, promoting novelty in the output.',
5964
})
65+
@IsOptional()
6066
@IsNumber()
61-
presence_penalty: number;
67+
presence_penalty?: number;
6268

6369
@ApiProperty({
6470
description: "Influences the randomness of the model's output.",
6571
})
72+
@IsOptional()
6673
@IsNumber()
67-
temperature: number;
74+
temperature?: number;
6875

6976
@ApiProperty({
7077
description: 'Sets probability threshold for more relevant outputs.',
7178
})
79+
@IsOptional()
7280
@IsNumber()
73-
top_p: number;
81+
top_p?: number;
7482
}

cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import { ApiProperty } from '@nestjs/swagger';
22
import { IsIP, IsNumber, IsString, Max, Min } from 'class-validator';
3-
import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
3+
import {
4+
defaultCortexCppHost,
5+
defaultCortexCppPort,
6+
} from '@/infrastructure/constants/cortex';
47

58
export class StartCortexDto {
69
@ApiProperty({

0 commit comments

Comments
 (0)