-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
xml.ts
181 lines (152 loc) · 4.9 KB
/
xml.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import {
BaseCumulativeTransformOutputParser,
BaseCumulativeTransformOutputParserInput,
} from "./transform.js";
import { Operation, compare } from "../utils/json_patch.js";
import { sax } from "../utils/sax-js/sax.js";
import { ChatGeneration, Generation } from "../outputs.js";
/**
 * Prompt text instructing a model to answer with well-formed XML.
 *
 * The text contains a literal `{tags}` placeholder inside a fenced block;
 * `XMLOutputParser.getFormatInstructions` substitutes it with the configured
 * tag list when one is present.
 */
export const XML_FORMAT_INSTRUCTIONS = `The output should be formatted as a XML file.
1. Output should conform to the tags below.
2. If tags are not given, make them on your own.
3. Remember to always open and close all the tags.
As an example, for the tags ["foo", "bar", "baz"]:
1. String "<foo>\n <bar>\n <baz></baz>\n </bar>\n</foo>" is a well-formatted instance of the schema.
2. String "<foo>\n <bar>\n </foo>" is a badly-formatted instance.
3. String "<foo>\n <tag>\n </tag>\n</foo>" is a badly-formatted instance.
Here are the output tags:
\`\`\`
{tags}
\`\`\``;
/**
 * Constructor options for `XMLOutputParser`: everything accepted by the
 * base cumulative transform parser input, plus an optional tag list.
 */
export interface XMLOutputParserFields
extends BaseCumulativeTransformOutputParserInput {
/**
 * Optional list of tags that the output should conform to.
 * Only used in formatting of the prompt.
 */
tags?: string[];
}
/**
 * Value stored under a tag name in a parsed result: the element's text
 * (or `undefined`), or an array of single-key objects, one per child element.
 */
export type Content = string | undefined | Array<{ [key: string]: Content }>;
/**
 * Parsed XML expressed as a plain object keyed by tag name.
 */
export type XMLResult = {
[key: string]: Content;
};
/**
 * Output parser that converts XML text produced by a model into a nested
 * `XMLResult` object, delegating the actual parsing to `parseXMLMarkdown`.
 */
export class XMLOutputParser extends BaseCumulativeTransformOutputParser<XMLResult> {
  /** Tags advertised in the format instructions; not used while parsing. */
  tags?: string[];

  constructor(fields?: XMLOutputParserFields) {
    super(fields);
    this.tags = fields?.tags;
  }

  static lc_name() {
    return "XMLOutputParser";
  }

  lc_namespace = ["langchain_core", "output_parsers"];

  lc_serializable = true;

  /**
   * Describes the change between two successive parse results.
   *
   * @returns `undefined` when `next` is falsy, a single root-level
   * "replace" operation when `prev` is falsy, otherwise the result of
   * `compare(prev, next)`.
   */
  protected _diff(
    prev: unknown | undefined,
    next: unknown
  ): Operation[] | undefined {
    if (!next) {
      return undefined;
    }
    return prev
      ? compare(prev, next)
      : [{ op: "replace", path: "", value: next }];
  }

  /**
   * Parses the text of the first generation in the list.
   */
  async parsePartialResult(
    generations: ChatGeneration[] | Generation[]
  ): Promise<XMLResult | undefined> {
    const firstGeneration = generations[0];
    return parseXMLMarkdown(firstGeneration.text);
  }

  /**
   * Parses a complete text into an `XMLResult`.
   */
  async parse(text: string): Promise<XMLResult> {
    return parseXMLMarkdown(text);
  }

  /**
   * Returns the XML format instructions. When a non-empty tag list is
   * configured, the `{tags}` placeholder is replaced by the tags joined
   * with ", "; otherwise the template is returned unchanged.
   */
  getFormatInstructions(): string {
    const configuredTags = this.tags;
    if (!configuredTags || configuredTags.length === 0) {
      return XML_FORMAT_INSTRUCTIONS;
    }
    return XML_FORMAT_INSTRUCTIONS.replace("{tags}", configuredTags.join(", "));
  }
}
/**
 * Removes the leading whitespace of every line, then trims the whole string.
 */
const strip = (text: string) => {
  const dedentedLines: string[] = [];
  for (const rawLine of text.split("\n")) {
    dedentedLines.push(rawLine.replace(/^\s+/, ""));
  }
  const rejoined = dedentedLines.join("\n");
  return rejoined.trim();
};
/**
 * Intermediate tree node assembled from parser events in `parseXMLMarkdown`
 * before being flattened into an `XMLResult` by `parseParsedResult`.
 */
type ParsedResult = {
// Tag name as reported by the parser's open-tag event.
name: string;
// Attributes reported with the open-tag event; not read by parseParsedResult.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
attributes: Record<string, any>;
// Child elements in the order their open tags were seen.
children: Array<ParsedResult>;
// Text accumulated from text events while this element was the innermost open one.
text?: string;
// Copied from the open-tag event's isSelfClosing flag.
isSelfClosing: boolean;
};
/**
 * Recursively converts a `ParsedResult` node into an `XMLResult`.
 *
 * An object with no keys yields `{}`. Otherwise the result has a single
 * key, the element name, whose value is the converted children when there
 * are any (the element's own text is then ignored) or the element's text.
 */
const parseParsedResult = (input: ParsedResult): XMLResult => {
  const converted: XMLResult = {};
  if (Object.keys(input).length === 0) {
    return converted;
  }
  const hasChildren = input.children.length > 0;
  converted[input.name] = hasChildren
    ? input.children.map(parseParsedResult)
    : input.text ?? undefined;
  return converted;
};
/**
 * Parses XML text, optionally wrapped in a triple-backtick fence, into a
 * nested `XMLResult`.
 *
 * The input is first normalised with `strip`. If it contains a fenced block
 * (with or without an `xml` language tag) only the fenced content is parsed.
 * Only tag names, nesting and text end up in the result; attributes are
 * recorded on the intermediate nodes but are not part of the output, and an
 * element that has children does not expose its own text.
 *
 * NOTE(review): no `onerror` handler is installed on the strict parser, so
 * malformed input presumably surfaces as a thrown error — confirm against
 * the vendored sax implementation.
 *
 * @param s Raw text that contains the XML document.
 * @returns The parsed document, or `{}` when no element was found.
 */
export function parseXMLMarkdown(s: string): XMLResult {
  const cleanedString = strip(s);
  const parser = sax.parser(true);
  let parsedResult: ParsedResult = {} as ParsedResult;
  // Elements whose close-tag event has not been seen yet, innermost last.
  const elementStack: ParsedResult[] = [];

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  parser.onopentag = (node: any) => {
    // Attributes are taken from the open-tag event itself, so no separate
    // attribute handler is needed.
    const element: ParsedResult = {
      name: node.name,
      attributes: node.attributes,
      children: [],
      text: "",
      isSelfClosing: node.isSelfClosing,
    };
    if (elementStack.length > 0) {
      const parentElement = elementStack[elementStack.length - 1];
      parentElement.children.push(element);
    } else {
      parsedResult = element;
    }
    // sax emits a close-tag event for self-closing tags as well, so every
    // opened element must be pushed; otherwise that close event would pop
    // the parent and later siblings would be attached to the wrong node.
    elementStack.push(element);
  };

  parser.onclosetag = () => {
    const lastElement = elementStack.pop();
    // When the outermost element closes it becomes the final result.
    if (lastElement !== undefined && elementStack.length === 0) {
      parsedResult = lastElement;
    }
  };

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  parser.ontext = (text: any) => {
    // Text outside of any element is ignored.
    if (elementStack.length > 0) {
      const currentElement = elementStack[elementStack.length - 1];
      currentElement.text += text;
    }
  };

  // Try to find XML string within triple backticks.
  const match = /```(xml)?(.*)```/s.exec(cleanedString);
  const xmlString = match ? match[2] : cleanedString;
  parser.write(xmlString).close();

  // Remove the XML declaration if present; fall back to an empty result
  // when the declaration node has no child to promote.
  if (parsedResult && parsedResult.name === "?xml") {
    parsedResult = parsedResult.children[0] ?? ({} as ParsedResult);
  }
  return parseParsedResult(parsedResult);
}