-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
hnswlib.ts
354 lines (328 loc) Β· 12.1 KB
/
hnswlib.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
import type {
HierarchicalNSW as HierarchicalNSWT,
SpaceName,
} from "hnswlib-node";
import type { EmbeddingsInterface } from "@langchain/core/embeddings";
import { SaveableVectorStore } from "@langchain/core/vectorstores";
import { Document } from "@langchain/core/documents";
import { SynchronousInMemoryDocstore } from "../stores/doc/in_memory.js";
/**
 * Base configuration shared by every HNSWLib vector store: which vector
 * space the index uses and, optionally, how many dimensions its vectors have.
 */
export interface HNSWLibBase {
  /** Name of the vector space passed to the `hnswlib-node` index constructor. */
  space: SpaceName;
  /**
   * Number of dimensions of the stored vectors. May be omitted, in which
   * case the store infers it from the first vector it is asked to index.
   */
  numDimensions?: number;
}
/**
 * Constructor arguments for HNSWLib. On top of the base configuration the
 * caller may supply an existing HNSW index and/or an existing document
 * store to reuse.
 */
export interface HNSWLibArgs extends HNSWLibBase {
  /** Existing HNSW index to use instead of creating one lazily. */
  index?: HierarchicalNSWT;
  /** Document store to use; a fresh in-memory store is created when omitted. */
  docstore?: SynchronousInMemoryDocstore;
}
/**
 * Class that implements a vector store using Hierarchical Navigable Small
 * World (HNSW) graphs via `hnswlib-node`. It extends the SaveableVectorStore
 * class and provides methods for adding documents and vectors, performing
 * similarity searches, and saving, loading and deleting the persisted store.
 *
 * Vectors live in the HNSW index; the matching documents live in a
 * synchronous in-memory docstore keyed by the stringified index label.
 */
export class HNSWLib extends SaveableVectorStore {
  declare FilterType: (doc: Document) => boolean;

  /** Backing index; stays undefined until supplied or lazily created. */
  _index?: HierarchicalNSWT;

  /** Documents keyed by the (stringified) label of their vector in the index. */
  docstore: SynchronousInMemoryDocstore;

  /** Configuration (space / dimensions) the store was constructed with. */
  args: HNSWLibBase;

  _vectorstoreType(): string {
    return "hnswlib";
  }

  /**
   * @param embeddings The embeddings used to turn documents into vectors.
   * @param args Space name plus optional dimensions, docstore and index.
   */
  constructor(embeddings: EmbeddingsInterface, args: HNSWLibArgs) {
    super(embeddings, args);
    this._index = args.index;
    this.args = args;
    this.embeddings = embeddings;
    this.docstore = args.docstore ?? new SynchronousInMemoryDocstore();
  }

  /**
   * Method to add documents to the vector store. It first converts the
   * documents to vectors using the embeddings, then adds the vectors to the
   * vector store.
   * @param documents The documents to be added to the vector store.
   * @returns A Promise that resolves when the documents have been added.
   */
  async addDocuments(documents: Document[]): Promise<void> {
    const texts = documents.map(({ pageContent }) => pageContent);
    return this.addVectors(
      await this.embeddings.embedDocuments(texts),
      documents
    );
  }

  /**
   * Creates a new, not-yet-initialised `HierarchicalNSW` instance.
   * @param args Configuration providing the space name and dimension count.
   * @returns A Promise that resolves to the new index object.
   * @throws Error when `space` or `numDimensions` is missing, or when
   * `hnswlib-node` cannot be imported.
   */
  private static async getHierarchicalNSW(args: HNSWLibBase) {
    const { HierarchicalNSW } = await HNSWLib.imports();
    if (!args.space) {
      throw new Error("hnswlib-node requires a space argument");
    }
    if (args.numDimensions === undefined) {
      throw new Error("hnswlib-node requires a numDimensions argument");
    }
    return new HierarchicalNSW(args.space, args.numDimensions);
  }

  /**
   * Makes sure an index object exists and has been initialised.
   *
   * When no index is present one is created; if `numDimensions` was not
   * configured it is inferred from the first vector. When the index holds no
   * elements it is (re-)initialised with room for `vectors.length` elements.
   * @param vectors The vectors about to be indexed.
   */
  private async initIndex(vectors: number[][]) {
    if (!this._index) {
      if (this.args.numDimensions === undefined) {
        this.args.numDimensions = vectors[0].length;
      }
      this.index = await HNSWLib.getHierarchicalNSW(this.args);
    }
    if (!this.index.getCurrentCount()) {
      this.index.initIndex(vectors.length);
    }
  }

  /**
   * The backing HNSW index.
   * @throws Error when no index has been supplied or created yet.
   */
  public get index(): HierarchicalNSWT {
    if (!this._index) {
      throw new Error(
        "Vector store not initialised yet. Try calling `addTexts` first."
      );
    }
    return this._index;
  }

  private set index(index: HierarchicalNSWT) {
    this._index = index;
  }

  /**
   * Method to add vectors to the vector store. The input is validated
   * first, then the index is initialised if necessary and grown when it is
   * too small, and finally the vectors are added to the index and the
   * documents to the document store under matching labels.
   *
   * Validation happens before any state is touched, and covers every vector,
   * so invalid input cannot leave the index partially updated or out of step
   * with the document store.
   * @param vectors The vectors to be added to the vector store.
   * @param documents The documents corresponding to the vectors.
   * @returns A Promise that resolves when the vectors and documents have been added.
   * @throws Error when the two arrays differ in length or when any vector
   * does not have the expected number of dimensions.
   */
  async addVectors(vectors: number[][], documents: Document[]) {
    if (vectors.length === 0) {
      return;
    }
    if (vectors.length !== documents.length) {
      throw new Error(`Vectors and metadatas must have the same length`);
    }
    // Mirror initIndex(): the dimension count is inferred from the first
    // vector only when neither an explicit value nor an index exists yet.
    const expectedDimensions =
      this.args.numDimensions === undefined && !this._index
        ? vectors[0].length
        : this.args.numDimensions;
    if (vectors.some((vector) => vector.length !== expectedDimensions)) {
      throw new Error(
        `Vectors must have the same length as the number of dimensions (${expectedDimensions})`
      );
    }
    await this.initIndex(vectors);
    // TODO here we could optionally normalise the vectors to unit length
    // so that dot product is equivalent to cosine similarity, like this
    // https://github.com/nmslib/hnswlib/issues/384#issuecomment-1155737730
    // While we only support OpenAI embeddings this isn't necessary
    const capacity = this.index.getMaxElements();
    const needed = this.index.getCurrentCount() + vectors.length;
    if (needed > capacity) {
      this.index.resizeIndex(needed);
    }
    // New labels continue from the current element count.
    const docstoreSize = this.index.getCurrentCount();
    const toSave: Record<string, Document> = {};
    for (let i = 0; i < vectors.length; i += 1) {
      this.index.addPoint(vectors[i], docstoreSize + i);
      toSave[docstoreSize + i] = documents[i];
    }
    this.docstore.add(toSave);
  }

  /**
   * Method to perform a similarity search in the vector store using a query
   * vector. It returns the k nearest documents along with the distance the
   * index reports for each. An optional filter function can be provided to
   * restrict which documents are eligible.
   * @param query The query vector.
   * @param k The number of most similar documents to return. Clamped (with a
   * console warning) to the number of elements in the index.
   * @param filter An optional filter function to filter the documents.
   * @returns A Promise that resolves to an array of tuples, where each tuple
   * contains a document and the distance value returned by the index.
   * @throws Error when the query length differs from the configured number
   * of dimensions, or when no index is available.
   */
  async similaritySearchVectorWithScore(
    query: number[],
    k: number,
    filter?: this["FilterType"]
  ) {
    // With known dimensions but no index yet, create an empty index so the
    // search yields no results instead of failing on the `index` getter.
    if (this.args.numDimensions && !this._index) {
      await this.initIndex([[]]);
    }
    if (query.length !== this.args.numDimensions) {
      throw new Error(
        `Query vector must have the same length as the number of dimensions (${this.args.numDimensions})`
      );
    }
    if (k > this.index.getCurrentCount()) {
      const total = this.index.getCurrentCount();
      console.warn(
        `k (${k}) is greater than the number of elements in the index (${total}), setting k to ${total}`
      );
      // eslint-disable-next-line no-param-reassign
      k = total;
    }
    // Adapts the document-level filter to the label-level predicate that
    // searchKnn accepts.
    const filterFunction = (label: number): boolean => {
      if (!filter) {
        return true;
      }
      const document = this.docstore.search(String(label));
      if (typeof document !== "string") {
        return filter(document);
      }
      return false;
    };
    const result = this.index.searchKnn(
      query,
      k,
      filter ? filterFunction : undefined
    );
    return result.neighbors.map(
      (docIndex, resultIndex) =>
        [
          this.docstore.search(String(docIndex)),
          result.distances[resultIndex],
        ] as [Document, number]
    );
  }

  /**
   * Method to delete the vector store from a directory. It deletes the
   * hnswlib.index file, the docstore.json file, and the args.json file from
   * the directory. The three removals run concurrently.
   * @param params An object with a directory property that specifies the directory from which to delete the vector store.
   * @returns A Promise that resolves when the vector store has been deleted.
   * @throws Error when the directory has no accessible hnswlib.index file.
   */
  async delete(params: { directory: string }) {
    const fs = await import("node:fs/promises");
    const path = await import("node:path");
    try {
      await fs.access(path.join(params.directory, "hnswlib.index"));
    } catch {
      throw new Error(
        `Directory ${params.directory} does not contain a hnswlib.index file.`
      );
    }
    // Pass the pending promises themselves (no inner `await`) so that
    // Promise.all really runs the removals in parallel.
    await Promise.all([
      fs.rm(path.join(params.directory, "hnswlib.index"), { force: true }),
      fs.rm(path.join(params.directory, "docstore.json"), { force: true }),
      fs.rm(path.join(params.directory, "args.json"), { force: true }),
    ]);
  }

  /**
   * Method to save the vector store to a directory. It saves the HNSW
   * index, the arguments, and the document store to the directory, creating
   * the directory first if needed. The three writes run concurrently.
   * @param directory The directory to which to save the vector store.
   * @returns A Promise that resolves when the vector store has been saved.
   */
  async save(directory: string) {
    const fs = await import("node:fs/promises");
    const path = await import("node:path");
    await fs.mkdir(directory, { recursive: true });
    // `this.args` is the object handed to the constructor, so it may carry
    // the live docstore and index (always the case after `load`). Those are
    // persisted in their own files and must not leak into args.json.
    const persistedArgs: Record<string, unknown> = { ...this.args };
    delete persistedArgs.docstore;
    delete persistedArgs.index;
    await Promise.all([
      this.index.writeIndex(path.join(directory, "hnswlib.index")),
      fs.writeFile(
        path.join(directory, "args.json"),
        JSON.stringify(persistedArgs)
      ),
      fs.writeFile(
        path.join(directory, "docstore.json"),
        JSON.stringify(Array.from(this.docstore._docs.entries()))
      ),
    ]);
  }

  /**
   * Static method to load a vector store from a directory. It reads the
   * HNSW index, the arguments, and the document store from the directory,
   * then creates a new HNSWLib instance with these values.
   * @param directory The directory from which to load the vector store.
   * @param embeddings The embeddings to be used by the HNSWLib instance.
   * @returns A Promise that resolves to a new HNSWLib instance.
   */
  static async load(directory: string, embeddings: EmbeddingsInterface) {
    const fs = await import("node:fs/promises");
    const path = await import("node:path");
    const args = JSON.parse(
      await fs.readFile(path.join(directory, "args.json"), "utf8")
    );
    // Throws if the stored args lack `space` or `numDimensions`.
    const index = await HNSWLib.getHierarchicalNSW(args);
    const [docstoreFiles] = await Promise.all([
      fs
        .readFile(path.join(directory, "docstore.json"), "utf8")
        .then(JSON.parse),
      index.readIndex(path.join(directory, "hnswlib.index")),
    ]);
    args.docstore = new SynchronousInMemoryDocstore(new Map(docstoreFiles));
    args.index = index;
    return new HNSWLib(embeddings, args);
  }

  /**
   * Static method to create a new HNSWLib instance from texts and metadata.
   * It creates a new Document instance for each text and metadata, then
   * calls the fromDocuments method to create the HNSWLib instance.
   * @param texts The texts to be used to create the documents.
   * @param metadatas The metadata to be used to create the documents: either
   * one entry per text, or a single object shared by every document.
   * @param embeddings The embeddings to be used by the HNSWLib instance.
   * @param dbConfig An optional configuration object for the document store.
   * @returns A Promise that resolves to a new HNSWLib instance.
   */
  static async fromTexts(
    texts: string[],
    metadatas: object[] | object,
    embeddings: EmbeddingsInterface,
    dbConfig?: {
      docstore?: SynchronousInMemoryDocstore;
    }
  ): Promise<HNSWLib> {
    const docs: Document[] = [];
    for (let i = 0; i < texts.length; i += 1) {
      const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
      const newDoc = new Document({
        pageContent: texts[i],
        metadata,
      });
      docs.push(newDoc);
    }
    return HNSWLib.fromDocuments(docs, embeddings, dbConfig);
  }

  /**
   * Static method to create a new HNSWLib instance from documents. It
   * creates a new instance configured for the "cosine" space, adds the
   * documents to it, then returns the instance.
   * @param docs The documents to be added to the HNSWLib instance.
   * @param embeddings The embeddings to be used by the HNSWLib instance.
   * @param dbConfig An optional configuration object for the document store.
   * @returns A Promise that resolves to a new HNSWLib instance.
   */
  static async fromDocuments(
    docs: Document[],
    embeddings: EmbeddingsInterface,
    dbConfig?: {
      docstore?: SynchronousInMemoryDocstore;
    }
  ): Promise<HNSWLib> {
    const args: HNSWLibArgs = {
      docstore: dbConfig?.docstore,
      space: "cosine",
    };
    const instance = new this(embeddings, args);
    await instance.addDocuments(docs);
    return instance;
  }

  /**
   * Dynamically imports `hnswlib-node` so that it is only required when the
   * store is actually used.
   * @returns A Promise that resolves to the `HierarchicalNSW` constructor.
   * @throws Error with installation instructions when the import fails.
   */
  static async imports(): Promise<{
    HierarchicalNSW: typeof HierarchicalNSWT;
  }> {
    try {
      const {
        default: { HierarchicalNSW },
      } = await import("hnswlib-node");
      return { HierarchicalNSW };
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (err: any) {
      throw new Error(
        `Could not import hnswlib-node. Please install hnswlib-node as a dependency with, e.g. \`npm install -S hnswlib-node\`.\n\nError: ${err?.message}`
      );
    }
  }
}