Commit ec21dae

feat: add engine pull support for tensorrt-llm (#765)
1 parent 1b769f4 commit ec21dae

File tree: 12 files changed, +124 -55 lines changed

cortex-js/src/infrastructure/commanders/models/model-start.command.ts

Lines changed: 2 additions & 1 deletion

@@ -13,6 +13,7 @@ import { InitCliUsecases } from '../usecases/init.cli.usecases';
 import { existsSync } from 'node:fs';
 import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
 import { join } from 'node:path';
+import { Engines } from '../types/engine.interface';
 
 type ModelStartOptions = {
   attach: boolean;
@@ -71,7 +72,7 @@ export class ModelStartCommand extends CommandRunner {
         engine,
       );
     }
-    if (engine === 'cortex.onnx' && process.platform !== 'win32') {
+    if (engine === Engines.onnx && process.platform !== 'win32') {
      console.error('The ONNX engine does not support this OS yet.');
      process.exit(1);
    }
cortex-js/src/infrastructure/commanders/types/engine.interface.ts (new file, per the import paths above)

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+export enum Engines {
+  llamaCPP = 'cortex.llamacpp',
+  onnx = 'cortex.onnx',
+  tensorrtLLM = 'cortex.tensorrt-llm',
+}
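The enum gives every call site a single source of truth for engine identifiers, replacing the bare strings previously scattered through the commands. A minimal sketch of validating an arbitrary string against it (the isKnownEngine guard is illustrative, not part of this commit):

  import { Engines } from '@/infrastructure/commanders/types/engine.interface';

  // Type guard: narrows a raw string to Engines when it matches one of the values.
  const isKnownEngine = (engine: string): engine is Engines =>
    (Object.values(Engines) as string[]).includes(engine);

  console.log(isKnownEngine('cortex.tensorrt-llm')); // true
  console.log(isKnownEngine('cortex.unknown')); // false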

cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts

Lines changed: 2 additions & 5 deletions

@@ -63,6 +63,8 @@ export class ChatCliUsecases {
     rl.on('line', sendCompletionMessage.bind(this));
 
     async function sendCompletionMessage(userInput: string) {
+      if (!userInput || userInput.trim() === '') return;
+
       if (userInput.trim() === this.exitClause) {
         rl.close();
         return;
@@ -98,12 +100,7 @@
         model: modelId,
         stream: true,
         max_tokens: 4098,
-        stop: [],
-        frequency_penalty: 0.7,
-        presence_penalty: 0.7,
         temperature: 0.7,
-        top_p: 0.7,
-
         // Override with model settings
         ...parser.parseModelInferenceParams(model),
       };
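Removing the hard-coded stop, penalty, and top_p defaults leans on object-spread ordering: keys returned by parser.parseModelInferenceParams(model) overwrite the literals above them, and a literal survives only when the model omits that key. A standalone sketch of that precedence (the names are illustrative):

  // Later spreads win: model-provided params override the CLI fallbacks,
  // while keys the model omits keep their fallback values.
  const defaults = { max_tokens: 4098, temperature: 0.7 };
  const modelParams = { temperature: 0.2, top_p: 0.9 };

  const request = { ...defaults, ...modelParams };
  console.log(request); // { max_tokens: 4098, temperature: 0.2, top_p: 0.9 }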

cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts

Lines changed: 17 additions & 13 deletions

@@ -12,11 +12,12 @@ import { rm } from 'fs/promises';
 import { exec } from 'child_process';
 import { appPath } from '@/utils/app-path';
 import {
-  CORTEX_ONNX_ENGINE_RELEASES_URL,
+  CORTEX_ENGINE_RELEASES_URL,
   CORTEX_RELEASES_URL,
   CUDA_DOWNLOAD_URL,
 } from '@/infrastructure/constants/cortex';
 import { checkNvidiaGPUExist, cudaVersion } from '@/utils/cuda';
+import { Engines } from '../types/engine.interface';
 
 @Injectable()
 export class InitCliUsecases {
@@ -70,13 +71,14 @@
     )
       await this.installLlamaCppEngine(options, version);
 
-    if (engine === 'cortex.onnx' && process.platform === 'win32')
-      await this.installONNXEngine();
-    else if (engine === 'cortex.onnx' && process.platform !== 'win32') {
+    if (engine === Engines.onnx && process.platform !== 'win32') {
       console.error('The ONNX engine does not support this OS yet.');
       process.exit(1);
     }
 
+    if (engine !== 'cortex.llamacpp')
+      await this.installAcceleratedEngine('latest', engine);
+
     configs.initialized = true;
     await this.fileManagerService.writeConfigFile(configs);
   };
@@ -305,17 +307,17 @@
   };
 
   /**
-   * Download and install ONNX engine
+   * Download and install accelerated engine
    * @param version
    * @param engineFileName
    */
-  private async installONNXEngine(
+  private async installAcceleratedEngine(
     version: string = 'latest',
-    engineFileName: string = 'windows-amd64',
+    engine: string = Engines.onnx,
   ) {
     const res = await firstValueFrom(
       this.httpService.get(
-        CORTEX_ONNX_ENGINE_RELEASES_URL +
+        CORTEX_ENGINE_RELEASES_URL(engine) +
           `${version === 'latest' ? '/latest' : ''}`,
         {
           headers: {
@@ -338,15 +340,17 @@
       );
     }
     const toDownloadAsset = release.assets.find((s: any) =>
-      s.name.includes(engineFileName),
+      s.name.includes(process.platform === 'win32' ? 'windows' : 'linux'),
     );
 
     if (!toDownloadAsset) {
-      console.log(`Could not find engine file ${engineFileName}`);
+      console.log(
+        `Could not find engine file for platform ${process.platform}`,
+      );
       exit(1);
     }
 
-    console.log(`Downloading ONNX engine file ${engineFileName}`);
+    console.log(`Downloading engine file ${toDownloadAsset.name}`);
     const dataFolderPath = await this.fileManagerService.getDataFolderPath();
     const engineDir = join(dataFolderPath, 'cortex-cpp');
 
@@ -397,10 +401,10 @@
     await rm(destination, { force: true });
 
     // Copy the additional files to the cortex-cpp directory
-    for (const file of readdirSync(join(engineDir, 'engines', 'cortex.onnx'))) {
+    for (const file of readdirSync(join(engineDir, 'engines', engine))) {
       if (file !== 'engine.dll') {
         await cpSync(
-          join(engineDir, 'engines', 'cortex.onnx', file),
+          join(engineDir, 'engines', engine, file),
           join(engineDir, file),
         );
       }
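The renamed installer works for any engine repo that publishes platform-tagged release assets: fetch the (latest) GitHub release for janhq/<engine>, then pick the asset whose name contains the host platform. A condensed, self-contained sketch of that flow under the same assumptions the diff makes; the ReleaseAsset shape is narrowed to the fields actually read, the Accept header is the standard GitHub API one (the diff's headers block is truncated), and global fetch assumes Node 18+:

  type ReleaseAsset = { name: string; browser_download_url: string };

  const CORTEX_ENGINE_RELEASES_URL = (engine: string) =>
    `https://api.github.com/repos/janhq/${engine}/releases`;

  // Pick the first release asset whose name mentions the current platform.
  const pickAssetForPlatform = (assets: ReleaseAsset[]) =>
    assets.find((a) =>
      a.name.includes(process.platform === 'win32' ? 'windows' : 'linux'),
    );

  async function latestEngineAsset(
    engine: string,
  ): Promise<ReleaseAsset | undefined> {
    const res = await fetch(`${CORTEX_ENGINE_RELEASES_URL(engine)}/latest`, {
      headers: { Accept: 'application/vnd.github+json' },
    });
    const release = (await res.json()) as { assets: ReleaseAsset[] };
    return pickAssetForPlatform(release.assets);
  }

  // latestEngineAsset('cortex.tensorrt-llm').then((a) => console.log(a?.name));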

cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts

Lines changed: 13 additions & 25 deletions

@@ -16,9 +16,10 @@ import { join, basename } from 'path';
 import { load } from 'js-yaml';
 import { existsSync, readdirSync, readFileSync } from 'fs';
 import { isLocalModel, normalizeModelId } from '@/utils/normalize-model-id';
-import { getHFModelMetadata } from '@/utils/huggingface';
+import { fetchJanRepoData, getHFModelMetadata } from '@/utils/huggingface';
 import { createWriteStream, mkdirSync, promises } from 'node:fs';
 import { firstValueFrom } from 'rxjs';
+import { Engines } from '../types/engine.interface';
 
 @Injectable()
 export class ModelsCliUsecases {
@@ -120,8 +121,8 @@
       process.exit(1);
     }
 
-    if (modelId.includes('onnx')) {
-      await this.pullOnnxModel(modelId);
+    if (modelId.includes('onnx') || modelId.includes('tensorrt')) {
+      await this.pullEngineModelFiles(modelId);
     } else {
       await this.pullGGUFModel(modelId);
       const bar = new SingleBar({}, Presets.shades_classic);
@@ -151,10 +152,10 @@
   }
 
   /**
-   * It's to pull ONNX model from HuggingFace repository
+   * Pull engine model files from a HuggingFace repository
    * @param modelId
    */
-  private async pullOnnxModel(modelId: string) {
+  private async pullEngineModelFiles(modelId: string) {
     const modelsContainerDir = await this.fileService.getModelsPath();
 
     if (!existsSync(modelsContainerDir)) {
@@ -164,35 +165,22 @@
     const modelFolder = join(modelsContainerDir, normalizeModelId(modelId));
     await promises.mkdir(modelFolder, { recursive: true }).catch(() => {});
 
-    const files = [
-      'genai_config.json',
-      'model.onnx',
-      'model.onnx.data',
-      'model.yml',
-      'special_tokens_map.json',
-      'tokenizer.json',
-      'tokenizer_config.json',
-    ];
-    const repo = modelId.split(':')[0];
-    const branch = modelId.split(':')[1] || 'default';
+    const files = (await fetchJanRepoData(modelId)).siblings;
     for (const file of files) {
-      console.log(`Downloading ${file}`);
+      console.log(`Downloading ${file.rfilename}`);
       const bar = new SingleBar({}, Presets.shades_classic);
       bar.start(100, 0);
       const response = await firstValueFrom(
-        this.httpService.get(
-          `https://huggingface.co/cortexhub/${repo}/resolve/${branch}/${file}?download=true`,
-          {
-            responseType: 'stream',
-          },
-        ),
+        this.httpService.get(file.downloadUrl ?? '', {
+          responseType: 'stream',
+        }),
       );
       if (!response) {
         throw new Error('Failed to download model');
       }
 
       await new Promise((resolve, reject) => {
-        const writer = createWriteStream(join(modelFolder, file));
+        const writer = createWriteStream(join(modelFolder, file.rfilename));
         let receivedBytes = 0;
         const totalBytes = response.headers['content-length'];
@@ -281,7 +269,7 @@
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
+      engine: Engines.llamaCPP,
     };
     if (!(await this.modelsUsecases.findOne(modelId)))
       await this.modelsUsecases.create(model);
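Swapping the hard-coded ONNX file list for fetchJanRepoData(modelId).siblings is the heart of the TensorRT-LLM support: the file manifest now comes from the model repo itself, so the same loop can pull any engine's artifacts. The shape the loop relies on can be written down as below; this interface is inferred from the two fields the diff reads (rfilename, downloadUrl), not copied from '@/utils/huggingface':

  // Inferred from usage; the authoritative types live in '@/utils/huggingface'.
  interface RepoSibling {
    rfilename: string; // file name within the repo
    downloadUrl?: string; // resolved download URL; may be absent
  }

  interface JanRepoData {
    siblings: RepoSibling[];
  }

  // The pull loop needs no per-engine file list:
  const filesToPull = (repo: JanRepoData): string[] =>
    repo.siblings
      .filter((s) => s.downloadUrl) // entries without a URL cannot be fetched
      .map((s) => s.rfilename);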

cortex-js/src/infrastructure/constants/benchmark.ts

Lines changed: 0 additions & 1 deletion

@@ -18,7 +18,6 @@ export const defaultBenchmarkConfiguration: BenchmarkConfig = {
   model: 'tinyllama',
   stream: true,
   max_tokens: 2048,
-  stop: [],
   frequency_penalty: 0,
   presence_penalty: 0,
   temperature: 0.7,

cortex-js/src/infrastructure/constants/cortex.ts

Lines changed: 2 additions & 2 deletions

@@ -42,8 +42,8 @@ export const CORTEX_JS_STOP_API_SERVER_URL = (
 export const CORTEX_RELEASES_URL =
   'https://api.github.com/repos/janhq/cortex/releases';
 
-export const CORTEX_ONNX_ENGINE_RELEASES_URL =
-  'https://api.github.com/repos/janhq/cortex.onnx/releases';
+export const CORTEX_ENGINE_RELEASES_URL = (engine: string) =>
+  `https://api.github.com/repos/janhq/${engine}/releases`;
 
 export const CUDA_DOWNLOAD_URL =
   'https://catalog.jan.ai/dist/cuda-dependencies/<version>/<platform>/cuda.tar.gz';
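Turning the constant into a template function keeps a single URL pattern for every engine repository; the results follow directly from substitution:

  CORTEX_ENGINE_RELEASES_URL('cortex.onnx');
  // => 'https://api.github.com/repos/janhq/cortex.onnx/releases'
  CORTEX_ENGINE_RELEASES_URL('cortex.tensorrt-llm');
  // => 'https://api.github.com/repos/janhq/cortex.tensorrt-llm/releases'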

cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts

Lines changed: 4 additions & 3 deletions

@@ -7,6 +7,7 @@ import { EngineExtension } from '@/domain/abstracts/engine.abstract';
 import { appPath } from '@/utils/app-path';
 import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
 import { existsSync } from 'fs';
+import { Engines } from '@/infrastructure/commanders/types/engine.interface';
 
 @Injectable()
 export class ExtensionRepositoryImpl implements ExtensionRepository {
@@ -18,9 +19,9 @@
     private readonly cortexProvider: EngineExtension,
     private readonly fileService: FileManagerService,
   ) {
-    this.extensions.set('cortex.llamacpp', this.cortexProvider);
-    this.extensions.set('cortex.onnx', this.cortexProvider);
-    this.extensions.set('cortex.tensorrt-llm', this.cortexProvider);
+    this.extensions.set(Engines.llamaCPP, this.cortexProvider);
+    this.extensions.set(Engines.onnx, this.cortexProvider);
+    this.extensions.set(Engines.tensorrtLLM, this.cortexProvider);
     this.loadCoreExtensions();
     this.loadExternalExtensions();
   }
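Since every key now comes from the enum, the map itself could be typed as Map<Engines, EngineExtension>, making a lookup with an unregistered string a compile-time error. A sketch of that tightening (a possible follow-up, not something this commit does; the stub classes stand in for the real abstractions):

  import { Engines } from '@/infrastructure/commanders/types/engine.interface';

  abstract class EngineExtension {} // stand-in for '@/domain/abstracts/engine.abstract'
  class CortexProvider extends EngineExtension {}

  const extensions = new Map<Engines, EngineExtension>();
  const cortexProvider = new CortexProvider();

  extensions.set(Engines.llamaCPP, cortexProvider);
  extensions.set(Engines.onnx, cortexProvider);
  extensions.set(Engines.tensorrtLLM, cortexProvider);

  // extensions.set('cortex.unknown', cortexProvider); // rejected by the compiler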

cortex-js/src/usecases/models/models.usecases.ts

Lines changed: 2 additions & 1 deletion

@@ -40,6 +40,7 @@ import { EventEmitter2 } from '@nestjs/event-emitter';
 import { ModelEvent, ModelId, ModelStatus } from '@/domain/models/model.event';
 import { DownloadManagerService } from '@/infrastructure/services/download-manager/download-manager.service';
 import { ContextService } from '@/infrastructure/services/context/context.service';
+import { Engines } from '@/infrastructure/commanders/types/engine.interface';
 
 @Injectable()
 export class ModelsUsecases {
@@ -466,7 +467,7 @@
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
+      engine: Engines.llamaCPP,
     };
     if (!(await this.findOne(modelId))) await this.create(model);
   }

cortex-js/src/utils/cuda.ts

Lines changed: 50 additions & 0 deletions

@@ -3,6 +3,13 @@ import { existsSync } from 'fs';
 import { delimiter } from 'path';
 import { checkFileExistenceInPaths } from './app-path';
 
+export type GpuSettingInfo = {
+  id: string;
+  vram: string;
+  name: string;
+  arch?: string;
+};
+
 /**
  * Return the CUDA version installed on the system
  * @returns CUDA Version 11 | 12
@@ -63,3 +70,46 @@ export const checkNvidiaGPUExist = (): Promise<boolean> => {
     });
   });
 };
+
+/**
+ * Get GPU information from the system
+ * @returns GPU information
+ */
+export const getGpuInfo = async (): Promise<GpuSettingInfo[]> =>
+  new Promise((resolve) => {
+    exec(
+      'nvidia-smi --query-gpu=index,memory.total,name --format=csv,noheader,nounits',
+      async (error, stdout) => {
+        if (!error) {
+          // Parse one CSV row per GPU, tracking the GPU with the most VRAM
+          let highestVram = 0;
+          let highestVramId = '0';
+          const gpus: GpuSettingInfo[] = stdout
+            .trim()
+            .split('\n')
+            .map((line) => {
+              let [id, vram, name] = line.split(', ');
+              const arch = getGpuArch(name);
+              vram = vram.replace(/\r/g, '');
+              if (parseFloat(vram) > highestVram) {
+                highestVram = parseFloat(vram);
+                highestVramId = id;
+              }
+              return { id, vram, name, arch };
+            });
+
+          resolve(gpus);
+        } else {
+          resolve([]);
+        }
+      },
+    );
+  });
+
+const getGpuArch = (gpuName: string): string => {
+  if (!gpuName.toLowerCase().includes('nvidia')) return 'unknown';
+
+  if (gpuName.includes('30')) return 'ampere';
+  else if (gpuName.includes('40')) return 'ada';
+  else return 'unknown';
+};
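A quick way to exercise the new helper; with --format=csv,noheader,nounits, nvidia-smi reports memory.total in MiB, and on machines without nvidia-smi the promise resolves to an empty array rather than rejecting. (Note that the highestVram/highestVramId bookkeeping inside getGpuInfo is computed but not yet surfaced in the return value.)

  import { getGpuInfo } from '@/utils/cuda';

  (async () => {
    const gpus = await getGpuInfo();
    if (gpus.length === 0) {
      console.log('No NVIDIA GPU detected (or nvidia-smi is missing).');
      return;
    }
    for (const gpu of gpus) {
      console.log(`#${gpu.id} ${gpu.name}: ${gpu.vram} MiB, arch ${gpu.arch ?? 'unknown'}`);
    }
  })();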
