-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
obsidian.ts
264 lines (229 loc) · 8.12 KB
/
obsidian.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import type { basename as BasenameT } from "node:path";
import type { readFile as ReadFileT, stat as StatT } from "node:fs/promises";
import yaml from "js-yaml";
import { Document } from "@langchain/core/documents";
import { getEnv } from "@langchain/core/utils/env";
import { DirectoryLoader, UnknownHandling } from "./directory.js";
import { BaseDocumentLoader } from "../base.js";
/**
 * Shape of the YAML front matter parsed from the top of an Obsidian note.
 * Three keys are typed explicitly; any other key is accepted and typed as
 * `unknown`, so callers must narrow before use.
 */
export type FrontMatter = {
/** Value of the `title` key, when present. */
title?: string;
/** Value of the `description` key, when present. */
description?: string;
/** Value of the `tags` key: either a list of tags or a single string. */
tags?: string[] | string;
/** Any additional front matter key. */
[key: string]: unknown;
};
/**
 * Optional settings accepted by the Obsidian loaders in this file.
 */
export interface ObsidianFileLoaderOptions {
/** Character encoding used when reading a file. The loader defaults this to "utf-8". */
encoding?: BufferEncoding;
/**
 * Whether front matter, tags and dataview fields are parsed into metadata.
 * The loader defaults this to `true`; when `false`, none of them are parsed
 * and the front matter is left in the document content.
 */
collectMetadata?: boolean;
}
/**
 * Loader for a single Obsidian markdown file. It reads the file, optionally
 * extracts metadata from three sources (YAML front matter, inline `#tags`
 * and Dataview `key:: value` fields) and returns the note as one `Document`.
 */
class ObsidianFileLoader extends BaseDocumentLoader {
  private filePath: string;

  private encoding: BufferEncoding;

  private collectMetadata: boolean;

  /**
   * Initializes a new instance of the ObsidianFileLoader class.
   * @param filePath The path to the Obsidian markdown file.
   * @param options Optional settings. `encoding` is the character encoding
   * used when reading the file (defaults to "utf-8"); `collectMetadata`
   * controls whether front matter, tags and dataview fields are parsed
   * (defaults to true).
   */
  constructor(
    filePath: string,
    {
      encoding = "utf-8",
      collectMetadata = true,
    }: ObsidianFileLoaderOptions = {}
  ) {
    super();
    this.filePath = filePath;
    this.encoding = encoding;
    this.collectMetadata = collectMetadata;
  }

  // Front matter must start on the very first line. Both LF and CRLF line
  // endings are accepted, and the closing `---` may be the last line of the
  // file without a trailing newline.
  private static FRONT_MATTER_REGEX = /^---\r?\n(.*?)\r?\n---(?:\r?\n|$)/s;

  /**
   * Parses the YAML front matter from the given content string.
   *
   * Only a YAML mapping is accepted as front matter; an empty block, a bare
   * scalar or a sequence yields `{}`. The `tags` entry is normalized so that
   * downstream code can always iterate it: a non-empty string is split on
   * commas (surrounding whitespace trimmed, empty items dropped) and any
   * other non-array, non-nullish value is wrapped in a one-element list.
   * @param content The string content of the markdown file.
   * @returns The parsed front matter, or `{}` when metadata collection is
   * disabled, no front matter is present, or it cannot be parsed.
   */
  private parseFrontMatter(content: string): FrontMatter {
    if (!this.collectMetadata) {
      return {};
    }
    const match = content.match(ObsidianFileLoader.FRONT_MATTER_REGEX);
    if (!match) {
      return {};
    }
    try {
      // NOTE(review): js-yaml 4.x `load` uses a safe schema by default; on
      // js-yaml 3.x this should be `safeLoad` — confirm the installed version.
      const parsed: unknown = yaml.load(match[1]);
      if (
        typeof parsed !== "object" ||
        parsed === null ||
        Array.isArray(parsed)
      ) {
        // Valid YAML, but not a key/value mapping: nothing to collect.
        return {};
      }
      const frontMatter = parsed as FrontMatter;
      // YAML can produce any type for `tags`, regardless of the declared type.
      const rawTags: unknown = frontMatter.tags;
      if (typeof rawTags === "string") {
        // An empty string is left untouched so it keeps being treated as
        // "no tags" by `load`.
        if (rawTags) {
          frontMatter.tags = rawTags
            .split(",")
            .map((tag) => tag.trim())
            .filter((tag) => tag.length > 0);
        }
      } else if (
        rawTags !== undefined &&
        rawTags !== null &&
        !Array.isArray(rawTags)
      ) {
        // e.g. `tags: 2024` — keep the value instead of failing later when
        // the tags are spread into a list.
        frontMatter.tags = [
          typeof rawTags === "object"
            ? JSON.stringify(rawTags)
            : String(rawTags),
        ];
      }
      return frontMatter;
    } catch (e) {
      console.warn("Encountered non-yaml frontmatter");
      return {};
    }
  }

  /**
   * Removes YAML front matter from the given content string.
   * @param content The string content of the markdown file.
   * @returns The content without its front matter block, or the content
   * unchanged when metadata collection is disabled.
   */
  private removeFrontMatter(content: string): string {
    if (!this.collectMetadata) {
      return content;
    }
    return content.replace(ObsidianFileLoader.FRONT_MATTER_REGEX, "");
  }

  // A tag is `#` preceded by whitespace or start of input, followed by a
  // letter or underscore and then word characters, `/` or `-`.
  private static TAG_REGEX = /(?:\s|^)#([a-zA-Z_][\w/-]*)/g;

  /**
   * Parses Obsidian-style tags from the given content string.
   * @param content The string content of the markdown file.
   * @returns The set of tag names found, without the leading `#`. Empty
   * when metadata collection is disabled.
   */
  private parseObsidianTags(content: string): Set<string> {
    if (!this.collectMetadata) {
      return new Set();
    }
    const tags = new Set<string>();
    for (const match of content.matchAll(ObsidianFileLoader.TAG_REGEX)) {
      tags.add(match[1]);
    }
    return tags;
  }

  // Full-line field: `key:: value`.
  private static DATAVIEW_LINE_REGEX = /^\s*(\w+)::\s*(.*)$/gm;

  // Inline field `[key:: value]`. The value stops at the matching closing
  // bracket rather than the last `]` on the line, so several inline fields
  // can share a line; up to two levels of nested brackets (e.g. a
  // `[[wikilink]]`) are allowed inside the value.
  private static DATAVIEW_INLINE_BRACKET_REGEX =
    /\[(\w+)::\s*((?:[^[\]\r\n]|\[(?:[^[\]\r\n]|\[[^[\]\r\n]*\])*\])*)\]/g;

  // Inline field `(key:: value)`. Same idea as the bracket form, allowing
  // one level of nested parentheses inside the value.
  private static DATAVIEW_INLINE_PAREN_REGEX =
    /\((\w+)::\s*((?:[^()\r\n]|\([^()\r\n]*\))*)\)/g;

  /**
   * Parses dataview fields from the given content string.
   *
   * Full-line fields are collected first, then bracketed inline fields,
   * then parenthesized inline fields; when a key occurs more than once the
   * last match wins.
   * @param content The string content of the markdown file.
   * @returns A record of dataview keys to their raw string values. Empty
   * when metadata collection is disabled.
   */
  private parseObsidianDataviewFields(content: string): Record<string, string> {
    if (!this.collectMetadata) {
      return {};
    }
    const fields: Record<string, string> = {};
    const patterns = [
      ObsidianFileLoader.DATAVIEW_LINE_REGEX,
      ObsidianFileLoader.DATAVIEW_INLINE_BRACKET_REGEX,
      ObsidianFileLoader.DATAVIEW_INLINE_PAREN_REGEX,
    ];
    for (const pattern of patterns) {
      for (const [, key, value] of content.matchAll(pattern)) {
        fields[key] = value;
      }
    }
    return fields;
  }

  /**
   * Converts metadata to a format compatible with Langchain.
   * Strings and numbers are kept as they are; every other value is replaced
   * by its `JSON.stringify` representation.
   * @param metadata The metadata object to convert.
   * @returns A new record with the same keys and the converted values.
   */
  private toLangchainCompatibleMetadata(metadata: Record<string, unknown>) {
    const result: Record<string, unknown> = {};
    for (const [key, value] of Object.entries(metadata)) {
      if (typeof value === "string" || typeof value === "number") {
        result[key] = value;
      } else {
        result[key] = JSON.stringify(value);
      }
    }
    return result;
  }

  /**
   * Loads the Obsidian file, parses it, and returns a `Document` instance.
   *
   * Metadata contains the file name (`source`), the path, the creation,
   * modification and access timestamps in milliseconds, the converted front
   * matter entries and the dataview fields (later sources override earlier
   * ones on key clashes). When any tag is found, `tags` is set to the
   * de-duplicated inline and front matter tags joined with ",".
   * @returns An array with a single `Document`, to comply with the
   * BaseDocumentLoader interface.
   * @throws If the Node.js modules cannot be imported, or if reading the
   * file or its stats fails.
   */
  public async load(): Promise<Document[]> {
    const { basename, readFile, stat } = await ObsidianFileLoader.imports();
    const fileName = basename(this.filePath);
    const stats = await stat(this.filePath);
    let content = await readFile(this.filePath, this.encoding);

    // Everything is parsed from the raw text, before the front matter is
    // stripped from the page content.
    const frontMatter = this.parseFrontMatter(content);
    const tags = this.parseObsidianTags(content);
    const dataviewFields = this.parseObsidianDataviewFields(content);
    content = this.removeFrontMatter(content);

    const metadata: Document["metadata"] = {
      source: fileName,
      path: this.filePath,
      created: stats.birthtimeMs,
      lastModified: stats.mtimeMs,
      lastAccessed: stats.atimeMs,
      ...this.toLangchainCompatibleMetadata(frontMatter),
      ...dataviewFields,
    };

    if (tags.size || frontMatter.tags) {
      // Only spread front matter tags when they are really a list; anything
      // else would throw "not iterable" here.
      const frontMatterTags = Array.isArray(frontMatter.tags)
        ? frontMatter.tags
        : [];
      metadata.tags = Array.from(
        new Set([...tags, ...frontMatterTags])
      ).join(",");
    }

    return [
      new Document({
        pageContent: content,
        metadata,
      }),
    ];
  }

  /**
   * Imports the necessary functions from the `node:path` and
   * `node:fs/promises` modules. They are imported dynamically so that the
   * modules are only requested when a file is actually loaded.
   * @returns A promise that resolves to an object containing the imported functions.
   * @throws An error naming the detected environment when either import fails.
   */
  static async imports(): Promise<{
    basename: typeof BasenameT;
    readFile: typeof ReadFileT;
    stat: typeof StatT;
  }> {
    try {
      const { basename } = await import("node:path");
      const { readFile, stat } = await import("node:fs/promises");
      return { basename, readFile, stat };
    } catch (e) {
      console.error(e);
      // NOTE(review): the docs URL in this message is still a placeholder.
      throw new Error(
        `Failed to load fs/promises. ObsidianFileLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`
      );
    }
  }
}
/**
 * Directory-level loader for Obsidian notes. It extends DirectoryLoader and
 * registers a single handler for the ".md" extension, so every markdown file
 * is read by an ObsidianFileLoader (YAML front matter, Obsidian tags and
 * Dataview fields).
 */
export class ObsidianLoader extends DirectoryLoader {
  /**
   * Creates a loader for the given directory.
   * @param directoryPath The path to the directory containing Obsidian markdown files.
   * @param options Optional per-file settings (`encoding`, `collectMetadata`),
   * passed unchanged to the ObsidianFileLoader created for each markdown file.
   */
  constructor(directoryPath: string, options?: ObsidianFileLoaderOptions) {
    super(
      directoryPath,
      {
        // One file loader per markdown file, all sharing the same options.
        ".md": (markdownPath) => {
          return new ObsidianFileLoader(markdownPath, options);
        },
      },
      // NOTE(review): presumably enables recursive traversal — confirm
      // against DirectoryLoader's signature.
      true,
      // NOTE(review): presumably skips files with unmapped extensions —
      // confirm against UnknownHandling.
      UnknownHandling.Ignore
    );
  }
}