@@ -1,6 +1,5 @@
 import * as ls from "langsmith/vitest";
-import { expect, test, expectTypeOf } from "vitest";
-import { evaluate } from "langsmith/evaluation";
+import { expect, expectTypeOf, beforeAll } from "vitest";
 import { OpenAI } from "openai";
 import { ChatOpenAI } from "@langchain/openai";

@@ -9,9 +8,89 @@ import * as hub from "langchain/hub";
 import { HumanMessage } from "@langchain/core/messages";

 import { z } from "zod";
-import { ChatPromptTemplate } from "@langchain/core/prompts";
+import { ChatPromptTemplate, HumanMessagePromptTemplate, StructuredPrompt } from "@langchain/core/prompts";
+import { Client } from "langsmith";

 ls.describe("llm as judge", () => {
+  beforeAll(async () => {
+    // Set up the required prompts in LangChain Hub before running tests
+    const client = new Client();
+
+    // Create test-equality prompt
+    const testEqualityPrompt = ChatPromptTemplate.fromMessages([
+      ["system", "You are an expert LLM as judge."],
+      ["human", "Are these two equal? {inputs} {outputs}"],
+    ]);
+
+    try {
+      await client.pushPrompt("test-equality", { object: testEqualityPrompt });
+      console.log("Created test-equality prompt");
+    } catch (error) {
+      console.log(`test-equality prompt may already exist: ${error}`);
+    }
+
+    // Create equality-1-message prompt
+    const equality1MessagePrompt = ChatPromptTemplate.fromMessages([
+      ["human", "Are these two equal? {inputs} {outputs}"],
+    ]);
+
+    try {
+      await client.pushPrompt("equality-1-message", {
+        object: equality1MessagePrompt,
+      });
+      console.log("Created equality-1-message prompt");
+    } catch (error) {
+      console.log(`equality-1-message prompt may already exist: ${error}`);
+    }
+
+    // Create simple-equality-structured prompt
+    const structuredEqualityPrompt = new StructuredPrompt({
+      inputVariables: ["inputs", "outputs"],
+      promptMessages: [
+        HumanMessagePromptTemplate.fromTemplate(
+          `Are these equal?
+
+<item1>
+{inputs}
+</item1>
+
+<item2>
+{outputs}
+</item2>`,
+        ),
+      ],
+      schema: {
+        title: "score",
+        description: "Get a score",
+        type: "object",
+        properties: {
+          equality: {
+            type: "boolean",
+            description: "Whether the two items are equal",
+          },
+          justification: {
+            type: "string",
+            description: "Justification for your decision above",
+          },
+        },
+        required: ["equality", "justification"],
+        strict: true,
+        additionalProperties: false,
+      },
+    });
+
+    try {
+      await client.pushPrompt("simple-equality-structured", {
+        object: structuredEqualityPrompt,
+      });
+      console.log("Created simple-equality-structured prompt");
+    } catch (error) {
+      console.log(
+        `simple-equality-structured prompt may already exist: ${error}`
+      );
+    }
+  });
+
   ls.test(
     "prompt hub prompt",
     {
@@ -21,7 +100,7 @@ ls.describe("llm as judge", () => {
       const outputs = { a: 1, b: 2 };
       const client = new OpenAI();
       const evaluator = createLLMAsJudge({
-        prompt: await hub.pull("langchain-ai/equality-1-message"),
+        prompt: await hub.pull("equality-1-message"),
         judge: client,
         model: "openai:gpt-4o-mini",
       });
@@ -42,7 +121,7 @@ ls.describe("llm as judge", () => {
     async ({ inputs }) => {
       const outputs = { a: 1, b: 2 };
       const evaluator = createLLMAsJudge({
-        prompt: await hub.pull("jacob/simple-equality-structured"),
+        prompt: await hub.pull("simple-equality-structured"),
         model: "openai:gpt-4o-mini",
       });
       const result = await evaluator({ inputs, outputs });
@@ -313,19 +392,6 @@ ls.describe("llm as judge", () => {
     }
   );

-  test("test llm as judge works with evaluate", async () => {
-    const evaluator = createLLMAsJudge({
-      prompt: "Are these two foo? {inputs} {outputs}",
-      model: "openai:o3-mini",
-    });
-    const result = await evaluate((inputs) => inputs, {
-      data: "exact match",
-      evaluators: [evaluator],
-    });
-    expect(result).toBeDefined();
-    expect(result.results.length).toBeGreaterThan(0);
-  }, 60000);
-
   ls.test(
     "llm as judge with mustache prompt",
     {
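
Note on the setup hunk: the `beforeAll` hook seeds the three prompts programmatically, so the tests no longer depend on copies living under personal hub handles (the `langchain-ai/` and `jacob/` prefixes removed in the later hunks). A minimal standalone sketch of the same push-then-pull round trip outside vitest; the prompt name `demo-equality` is hypothetical, and a `LANGSMITH_API_KEY` environment variable is assumed:

import { Client } from "langsmith";
import * as hub from "langchain/hub";
import { ChatPromptTemplate } from "@langchain/core/prompts";

const client = new Client(); // reads LANGSMITH_API_KEY from the environment

// "demo-equality" is a hypothetical prompt name used only for illustration.
const prompt = ChatPromptTemplate.fromMessages([
  ["human", "Are these two equal? {inputs} {outputs}"],
]);

// Re-pushing an existing name can throw, so mirror the try/catch used in the
// beforeAll hook rather than assuming the push is idempotent.
try {
  await client.pushPrompt("demo-equality", { object: prompt });
} catch (error) {
  console.log(`demo-equality may already exist: ${error}`);
}

// Pull it back by bare name, the same way the updated tests do.
const pulled = await hub.pull("demo-equality");
console.log(pulled);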
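On the consuming side, condensed from the context lines above: the pulled prompt is handed straight to `createLLMAsJudge`. A sketch under the assumptions that `createLLMAsJudge` here is the openevals export of the same name and that `OPENAI_API_KEY` is set:

import * as hub from "langchain/hub";
// Assumption: createLLMAsJudge is the openevals helper this file exercises.
import { createLLMAsJudge } from "openevals";

const evaluator = createLLMAsJudge({
  // Pull by bare name, relying on the beforeAll hook having seeded the
  // prompt in the caller's own workspace.
  prompt: await hub.pull("equality-1-message"),
  model: "openai:gpt-4o-mini",
});

// Evaluators are invoked with the same { inputs, outputs } shape the tests use.
const result = await evaluator({
  inputs: { a: 1, b: 2 },
  outputs: { a: 1, b: 2 },
});
console.log(result);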