14 changes: 8 additions & 6 deletions cortex-js/src/domain/models/huggingface.interface.ts
@@ -4,6 +4,13 @@ export interface HuggingFaceModelVersion {
fileSize?: number;
quantization?: Quantization;
}

export interface HuggingFaceRepoSibling {
rfilename: string;
downloadUrl?: string;
fileSize?: number;
quantization?: Quantization;
}
export interface HuggingFaceRepoData {
id: string;
modelId: string;
@@ -18,12 +25,7 @@ export interface HuggingFaceRepoData {
pipeline_tag: 'text-generation';
tags: Array<'transformers' | 'pytorch' | 'safetensors' | string>;
cardData: Record<CardDataKeys | string, unknown>;
siblings: {
rfilename: string;
downloadUrl?: string;
fileSize?: number;
quantization?: Quantization;
}[];
siblings: HuggingFaceRepoSibling[];
createdAt: string;
}
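
The inline sibling shape is promoted to a named HuggingFaceRepoSibling interface so other modules (for example the new file-selection callback in models.cli.usecases.ts below) can reference it directly. A minimal sketch of a consumer, assuming an illustrative pickDefaultSibling helper that is not part of this PR:

import {
  HuggingFaceRepoData,
  HuggingFaceRepoSibling,
} from '@/domain/models/huggingface.interface';

// Illustrative helper: prefer a GGUF sibling, otherwise fall back to the first file.
function pickDefaultSibling(
  repo: HuggingFaceRepoData,
): HuggingFaceRepoSibling | undefined {
  return (
    repo.siblings.find((s) => s.rfilename.endsWith('.gguf')) ??
    repo.siblings[0]
  );
}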

2 changes: 2 additions & 0 deletions cortex-js/src/domain/models/model.event.ts
@@ -7,6 +7,8 @@ const ModelLoadingEvents = [
'stopped',
'starting-failed',
'stopping-failed',
'model-downloaded',
'model-deleted',
] as const;
export type ModelLoadingEvent = (typeof ModelLoadingEvents)[number];
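
Adding 'model-downloaded' and 'model-deleted' to the const array widens the ModelLoadingEvent union, so emitters and handlers of the new lifecycle states get compile-time checking. A small sketch of what that buys, using only the names visible in this diff:

import { ModelLoadingEvent } from '@/domain/models/model.event';

// The new members type-check like any other lifecycle state.
const downloaded: ModelLoadingEvent = 'model-downloaded';
const deleted: ModelLoadingEvent = 'model-deleted';
// const invalid: ModelLoadingEvent = 'model-removed'; // would not compile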

1 change: 1 addition & 0 deletions cortex-js/src/domain/models/model.interface.ts
@@ -168,4 +168,5 @@ export interface ModelRuntimeParams {
export interface ModelArtifact {
mmproj?: string;
llama_model_path?: string;
model_path?: string;
}
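
The new optional model_path joins mmproj and llama_model_path, presumably so engines other than llama.cpp can record where their model file lives. A hedged example of conforming objects; the concrete paths are made up:

import { ModelArtifact } from '@/domain/models/model.interface';

// Hypothetical artifact entries; real values depend on the engine in use.
const ggufArtifact: ModelArtifact = {
  llama_model_path: '/models/llama3-8b/model.gguf',
};
const onnxArtifact: ModelArtifact = {
  model_path: '/models/phi-3-mini/model.onnx',
};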
@@ -8,6 +8,7 @@ import { existsSync } from 'fs';
import { join } from 'node:path';
import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
import { InitCliUsecases } from '../usecases/init.cli.usecases';
import { checkModelCompatibility } from '@/utils/model-check';

@SubCommand({
name: 'pull',
@@ -35,6 +36,8 @@ export class ModelPullCommand extends CommandRunner {
}
const modelId = passedParams[0];

checkModelCompatibility(modelId);

await this.modelsCliUsecases.pullModel(modelId).catch((e: Error) => {
if (e instanceof ModelNotFoundException)
console.error('Model does not exist.');
Expand Up @@ -14,6 +14,7 @@ import { existsSync } from 'node:fs';
import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
import { join } from 'node:path';
import { Engines } from '../types/engine.interface';
import { checkModelCompatibility } from '@/utils/model-check';

type ModelStartOptions = {
attach: boolean;
@@ -58,9 +59,14 @@ export class ModelStartCommand extends CommandRunner {
!Array.isArray(existingModel.files) ||
/^(http|https):\/\/[^/]+\/.*/.test(existingModel.files[0])
) {
console.error('Model is not available. Please pull the model first.');
console.error(
`${modelId} not found on filesystem. Please try 'cortex pull ${modelId}' first.`,
);
process.exit(1);
}

checkModelCompatibility(modelId);

const engine = existingModel.engine || 'cortex.llamacpp';
// Pull engine if not exist
if (
@@ -72,10 +78,7 @@
engine,
);
}
if (engine === Engines.onnx && process.platform !== 'win32') {
console.error('The ONNX engine does not support this OS yet.');
process.exit(1);
}

await this.cortexUsecases
.startCortex(options.attach)
.then(() => this.modelsCliUsecases.startModel(modelId, options.preset))
4 changes: 2 additions & 2 deletions cortex-js/src/infrastructure/commanders/serve.command.ts
@@ -42,8 +42,8 @@ export class ServeCommand extends CommandRunner {
console.log(
chalk.blue(`API Playground available at http://${host}:${port}/api`),
);
} catch (err) {
console.error(err.message ?? err);
} catch {
console.error(`Failed to start server. Is port ${port} in use?`);
}
}

13 changes: 7 additions & 6 deletions cortex-js/src/infrastructure/commanders/shortcuts/run.command.ts
@@ -14,6 +14,7 @@ import { join } from 'path';
import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
import { InitCliUsecases } from '../usecases/init.cli.usecases';
import { Engines } from '../types/engine.interface';
import { checkModelCompatibility } from '@/utils/model-check';

type RunOptions = {
threadId?: string;
@@ -55,7 +56,9 @@ export class RunCommand extends CommandRunner {
// If not exist
// Try Pull
if (!(await this.modelsCliUsecases.getModel(modelId))) {
console.log(`Model ${modelId} not found. Try pulling model...`);
console.log(
`${modelId} not found on filesystem. Downloading from remote: https://huggingface.co/cortexhub if possible.`,
);
await this.modelsCliUsecases.pullModel(modelId).catch((e: Error) => {
if (e instanceof ModelNotFoundException)
console.error('Model does not exist.');
@@ -71,10 +74,12 @@
!Array.isArray(existingModel.files) ||
/^(http|https):\/\/[^/]+\/.*/.test(existingModel.files[0])
) {
console.error('Model is not available. Please pull the model first.');
console.error('Model is not available.');
process.exit(1);
}

checkModelCompatibility(modelId);

const engine = existingModel.engine || 'cortex.llamacpp';
// Pull engine if not exist
if (
@@ -86,10 +91,6 @@
engine,
);
}
if (engine === Engines.onnx && process.platform !== 'win32') {
console.error('The ONNX engine does not support this OS yet.');
process.exit(1);
}

return this.cortexUsecases
.startCortex(false)
@@ -18,6 +18,7 @@ import {
} from '@/infrastructure/constants/cortex';
import { checkNvidiaGPUExist, cudaVersion } from '@/utils/cuda';
import { Engines } from '../types/engine.interface';
import { checkModelCompatibility } from '@/utils/model-check';

@Injectable()
export class InitCliUsecases {
@@ -71,11 +72,6 @@ export class InitCliUsecases {
)
await this.installLlamaCppEngine(options, version);

if (engine === Engines.onnx && process.platform !== 'win32') {
console.error('The ONNX engine does not support this OS yet.');
process.exit(1);
}

if (engine !== 'cortex.llamacpp')
await this.installAcceleratedEngine('latest', engine);
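
The per-command ONNX guard deleted here (and in the start and run commands above) appears to be centralized in the checkModelCompatibility helper imported from '@/utils/model-check'. Its implementation is not part of this diff; a plausible sketch, assuming the helper infers the engine from the model id and reproduces the old win32-only ONNX rule:

// Hypothetical sketch of @/utils/model-check; not the code shipped in this PR.
export const checkModelCompatibility = (modelId: string) => {
  if (modelId.includes('onnx') && process.platform !== 'win32') {
    console.error('The ONNX engine does not support this OS yet.');
    process.exit(1);
  }
  // Further engine/OS rules (e.g. for tensorrt-llm) could be added here.
};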

202 changes: 30 additions & 172 deletions cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
@@ -1,25 +1,19 @@
import { exit } from 'node:process';
import { ModelsUsecases } from '@/usecases/models/models.usecases';
import { Model } from '@/domain/models/model.interface';
import { CreateModelDto } from '@/infrastructure/dtos/models/create-model.dto';
import { HuggingFaceRepoData } from '@/domain/models/huggingface.interface';
import { InquirerService } from 'nest-commander';
import { Inject, Injectable } from '@nestjs/common';
import { Presets, SingleBar } from 'cli-progress';
import { LLAMA_2 } from '@/infrastructure/constants/prompt-constants';

import { HttpService } from '@nestjs/axios';
import { StartModelSuccessDto } from '@/infrastructure/dtos/models/start-model-success.dto';
import { UpdateModelDto } from '@/infrastructure/dtos/models/update-model.dto';
import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
import { join, basename } from 'path';
import { join } from 'path';
import { load } from 'js-yaml';
import { existsSync, readdirSync, readFileSync } from 'fs';
import { isLocalModel, normalizeModelId } from '@/utils/normalize-model-id';
import { fetchJanRepoData, getHFModelMetadata } from '@/utils/huggingface';
import { createWriteStream, mkdirSync, promises } from 'node:fs';
import { firstValueFrom } from 'rxjs';
import { Engines } from '../types/engine.interface';
import { isLocalModel } from '@/utils/normalize-model-id';
import { HuggingFaceRepoSibling } from '@/domain/models/huggingface.interface';

@Injectable()
export class ModelsCliUsecases {
@@ -120,170 +114,34 @@ export class ModelsCliUsecases {
console.error('Model already exists');
process.exit(1);
}

if (modelId.includes('onnx') || modelId.includes('tensorrt')) {
await this.pullEngineModelFiles(modelId);
} else {
await this.pullGGUFModel(modelId);
const bar = new SingleBar({}, Presets.shades_classic);
bar.start(100, 0);
const callback = (progress: number) => {
bar.update(progress);
};

try {
await this.modelsUsecases.downloadModel(modelId, callback);

const model = await this.modelsUsecases.findOne(modelId);
const fileUrl = join(
await this.fileService.getModelsPath(),
normalizeModelId(modelId),
basename((model?.files as string[])[0]),
);
await this.modelsUsecases.update(modelId, {
files: [fileUrl],
name: modelId.replace(':default', ''),
});
} catch (err) {
bar.stop();
throw err;
}
}
}

/**
* It's to pull engine model files from HuggingFace repository
* @param modelId
*/
private async pullEngineModelFiles(modelId: string) {
const modelsContainerDir = await this.fileService.getModelsPath();

if (!existsSync(modelsContainerDir)) {
mkdirSync(modelsContainerDir, { recursive: true });
}

const modelFolder = join(modelsContainerDir, normalizeModelId(modelId));
await promises.mkdir(modelFolder, { recursive: true }).catch(() => {});

const files = (await fetchJanRepoData(modelId)).siblings;
for (const file of files) {
console.log(`Downloading ${file.rfilename}`);
const bar = new SingleBar({}, Presets.shades_classic);
bar.start(100, 0);
const response = await firstValueFrom(
this.httpService.get(file.downloadUrl ?? '', {
responseType: 'stream',
}),
);
if (!response) {
throw new Error('Failed to download model');
}

await new Promise((resolve, reject) => {
const writer = createWriteStream(join(modelFolder, file.rfilename));
let receivedBytes = 0;
const totalBytes = response.headers['content-length'];

writer.on('finish', () => {
resolve(true);
});

writer.on('error', (error) => {
reject(error);
});

response.data.on('data', (chunk: any) => {
receivedBytes += chunk.length;
bar.update(Math.floor((receivedBytes / totalBytes) * 100));
});

response.data.pipe(writer);
await this.modelsUsecases.pullModel(modelId, true, (files) => {
return new Promise<HuggingFaceRepoSibling>(async (resolve) => {
const listChoices = files
.filter((e) => e.quantization != null)
.map((e) => {
return {
name: e.quantization,
value: e.quantization,
};
});

if (listChoices.length > 1) {
const { quantization } = await this.inquirerService.inquirer.prompt({
type: 'list',
name: 'quantization',
message: 'Select quantization',
choices: listChoices,
});
resolve(
files
.filter((e) => !!e.quantization)
.find((e: any) => e.quantization === quantization) ?? files[0],
);
} else {
resolve(files.find((e) => e.rfilename.includes('.gguf')) ?? files[0]);
}
});
bar.stop();
}

const model: CreateModelDto = load(
readFileSync(join(modelFolder, 'model.yml'), 'utf-8'),
) as CreateModelDto;
model.files = [join(modelFolder)];
model.model = modelId;

if (!(await this.modelsUsecases.findOne(modelId)))
await this.modelsUsecases.create(model);

if (model.engine === Engines.tensorrtLLM) {
if (process.platform === 'win32')
console.log(
'Please ensure that you install MPI and its SDK to use the TensorRT engine, as it also requires the Cuda Toolkit 12.3 to work. Refs:\n- https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisetup.exe\n- https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisdk.msi',
);
else if (process.platform === 'linux')
console.log(
'Please ensure that you install OpenMPI and its SDK to use the TensorRT engine, as it also requires the Cuda Toolkit 12.3 to work.\nYou can install OpenMPI by running "sudo apt update && sudo apt install openmpi-bin libopenmpi-dev"',
);
}
}
/**
* It's to pull model from HuggingFace repository
* It could be a model from Jan's repo or other authors
* @param modelId HuggingFace model id. e.g. "janhq/llama-3 or llama3:7b"
*/
private async pullGGUFModel(modelId: string) {
const data: HuggingFaceRepoData =
await this.modelsUsecases.fetchModelMetadata(modelId);

let modelVersion;

const listChoices = data.siblings
.filter((e) => e.quantization != null)
.map((e) => {
return {
name: e.quantization,
value: e.quantization,
};
});

if (listChoices.length > 1) {
const { quantization } = await this.inquirerService.inquirer.prompt({
type: 'list',
name: 'quantization',
message: 'Select quantization',
choices: listChoices,
});
modelVersion = data.siblings
.filter((e) => !!e.quantization)
.find((e: any) => e.quantization === quantization);
} else {
modelVersion = data.siblings.find((e) => e.rfilename.includes('.gguf'));
}

if (!modelVersion) throw 'No expected quantization found';
const metadata = await getHFModelMetadata(modelVersion.downloadUrl!);

const promptTemplate = metadata?.promptTemplate ?? LLAMA_2;
const stopWords: string[] = [metadata?.stopWord ?? ''];

const model: CreateModelDto = {
files: [modelVersion.downloadUrl ?? ''],
model: modelId,
name: modelId,
prompt_template: promptTemplate,
stop: stopWords,

// Default Inference Params
stream: true,
max_tokens: 4098,
frequency_penalty: 0.7,
presence_penalty: 0.7,
temperature: 0.7,
top_p: 0.7,

// Default Model Settings
ctx_len: 4096,
ngl: 100,
engine: Engines.llamaCPP,
};
if (!(await this.modelsUsecases.findOne(modelId)))
await this.modelsUsecases.create(model);
});
}
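
After the refactor, the CLI's pull flow delegates downloading and registration to ModelsUsecases.pullModel and only supplies a selector callback that receives the repo's siblings and resolves to the file to fetch, prompting for a quantization when several are offered. A minimal non-interactive selector honoring the same contract, for instance for scripted use; the preference order below is an assumption, not part of this PR:

import { HuggingFaceRepoSibling } from '@/domain/models/huggingface.interface';

// Illustrative selector: prefer a Q4_K_M build, then any GGUF file, then the first sibling.
const selectSibling = (
  files: HuggingFaceRepoSibling[],
): Promise<HuggingFaceRepoSibling> =>
  Promise.resolve(
    files.find((f) => f.rfilename.toLowerCase().includes('q4_k_m')) ??
      files.find((f) => f.rfilename.includes('.gguf')) ??
      files[0],
  );

// e.g. await modelsUsecases.pullModel(modelId, true, selectSibling);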
