-
Notifications
You must be signed in to change notification settings - Fork 2k
/
llama_cpp.ts
111 lines (102 loc) · 3.32 KB
/
llama_cpp.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import {
LlamaModel,
LlamaContext,
LlamaChatSession,
LlamaJsonSchemaGrammar,
LlamaGrammar,
GbnfJsonSchema,
} from "node-llama-cpp";
/**
* Settings accepted by the llama.cpp factory helpers in this file.
*
* `modelPath` is the only required field; every other field is optional.
* For testing, the model path can be supplied through the environment
* variable `LLAMA_PATH`.
*/
export interface LlamaBaseCppInputs {
/** Prompt processing batch size. */
batchSize?: number;
/** Text context size. */
contextSize?: number;
/** Embedding mode only. */
embedding?: boolean;
/** Use fp16 for KV cache. */
f16Kv?: boolean;
/** Number of layers to store in VRAM. */
gpuLayers?: number;
/** The llama_eval() call computes all logits, not just the last one. */
logitsAll?: boolean;
/**
* NOTE(review): this field had no description. Presumably the maximum
* number of tokens to generate — confirm against the code that consumes it.
*/
maxTokens?: number;
/** Path to the model on the filesystem. */
modelPath: string;
/** Add the beginning of sentence token. */
prependBos?: boolean;
/** If null, a random seed will be used. */
seed?: null | number;
/** The randomness of the responses, e.g. 0.1 deterministic, 1.5 creative, 0.8 balanced, 0 disables. */
temperature?: number;
/** Number of threads to use to evaluate tokens. */
threads?: number;
/** Trim whitespace from the end of the generated text. Disabled by default. */
trimWhitespaceSuffix?: boolean;
/** Consider the n most likely tokens, where n is 1 to vocabulary size, 0 disables (uses full vocabulary). Note: only applies when `temperature` > 0. */
topK?: number;
/** Selects the smallest token set whose probability exceeds P, where P is between 0 - 1, 1 disables. Note: only applies when `temperature` > 0. */
topP?: number;
/** Force system to keep model in RAM. */
useMlock?: boolean;
/** Use mmap if possible. */
useMmap?: boolean;
/** Only load the vocabulary, no weights. */
vocabOnly?: boolean;
/** JSON schema used to constrain the format of the output. Also known as `grammar`. */
jsonSchema?: object;
/** GBNF string used to constrain the format of the output. Also known as `grammar`. */
gbnf?: string;
}
/**
 * Instantiates a `LlamaModel` from the model-related fields of `inputs`.
 *
 * @param inputs - Settings bag; `modelPath` is required, the remaining
 *   fields picked here may be undefined.
 * @returns A new `LlamaModel` constructed from the selected fields.
 */
export function createLlamaModel(inputs: LlamaBaseCppInputs): LlamaModel {
  const {
    gpuLayers,
    modelPath,
    useMlock,
    useMmap,
    vocabOnly,
    jsonSchema,
    gbnf,
  } = inputs;

  // Kept as a named object rather than an inline literal so the constructor
  // is handed exactly these keys, `jsonSchema` and `gbnf` included.
  // NOTE(review): `jsonSchema` / `gbnf` look like output-grammar settings
  // rather than model-loading options — confirm `LlamaModel` actually reads
  // them before relying on this.
  const modelOptions = {
    gpuLayers,
    modelPath,
    useMlock,
    useMmap,
    vocabOnly,
    jsonSchema,
    gbnf,
  };

  return new LlamaModel(modelOptions);
}
/**
 * Instantiates a `LlamaContext` for `model` from the context-related fields
 * of `inputs`.
 *
 * @param model - The model the new context is attached to.
 * @param inputs - Settings bag; every field read here may be undefined.
 * @returns A new `LlamaContext` constructed from `model` and the selected
 *   fields.
 */
export function createLlamaContext(
  model: LlamaModel,
  inputs: LlamaBaseCppInputs
): LlamaContext {
  // A nullish `inputs` is tolerated: every setting then resolves to undefined.
  const settings: Partial<LlamaBaseCppInputs> = inputs ?? {};
  const {
    batchSize,
    contextSize,
    embedding,
    f16Kv,
    logitsAll,
    prependBos,
    seed,
    threads,
  } = settings;

  // Named object (not an inline literal) so the constructor receives exactly
  // this set of keys.
  const contextOptions = {
    batchSize,
    contextSize,
    embedding,
    f16Kv,
    logitsAll,
    model,
    prependBos,
    seed,
    threads,
  };

  return new LlamaContext(contextOptions);
}
/**
 * Instantiates a `LlamaChatSession` on top of an existing context.
 *
 * @param context - The context handed to the session as its `context` option.
 * @returns A new `LlamaChatSession` constructed with that context.
 */
export function createLlamaSession(context: LlamaContext): LlamaChatSession {
  const sessionOptions = { context };
  return new LlamaChatSession(sessionOptions);
}
/**
 * Wraps a JSON schema object in a `LlamaJsonSchemaGrammar`.
 *
 * @param schemaString - The schema as an object (not a string, despite the
 *   parameter name), or `undefined` when no schema is configured.
 * @returns `undefined` when `schemaString` is `undefined`; otherwise a new
 *   `LlamaJsonSchemaGrammar` built from the value treated as a
 *   `GbnfJsonSchema`.
 */
export function createLlamaJsonSchemaGrammar(
  schemaString: object | undefined
): LlamaJsonSchemaGrammar<GbnfJsonSchema> | undefined {
  // Only `undefined` short-circuits; any other value is forwarded unchecked.
  return schemaString === undefined
    ? undefined
    : new LlamaJsonSchemaGrammar(schemaString as GbnfJsonSchema);
}
/**
 * Wraps a grammar string in a `LlamaGrammar`.
 *
 * @param filePath - Value forwarded as the `grammar` option of
 *   `LlamaGrammar`, or `undefined` when no custom grammar is configured.
 *   NOTE(review): despite the name, this is presumably the grammar text
 *   itself rather than a filesystem path — confirm against callers.
 * @returns `undefined` when `filePath` is `undefined`; otherwise a new
 *   `LlamaGrammar`.
 */
export function createCustomGrammar(
  filePath: string | undefined
): LlamaGrammar | undefined {
  if (filePath === undefined) {
    return undefined;
  }

  const grammarOptions = { grammar: filePath };
  return new LlamaGrammar(grammarOptions);
}