In [4]:
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain_groq import ChatGroq
import json

In [8]:
import os
from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the key lookup.
load_dotenv()

# None when GROQ_API_KEY is not set; the value is handed to ChatGroq later on.
groq_api_key = os.environ.get("GROQ_API_KEY")

In [30]:
# Step 1: Define Output Schema using Pydantic
# Normalized invoice record the LLM is asked to produce from a raw webhook
# payload. Documentation is kept in `#` comments rather than a class docstring
# so that the schema text generated from this model stays unchanged.
class WebhookOutput(BaseModel):
    # All six fields are required (default=...); each description is part of
    # the field metadata attached to the model.
    invoice_id: str = Field(
        default=...,
        description="The invoice ID from the webhook",
    )
    amount: float = Field(
        default=...,
        description="The invoice amount",
    )
    currency: str = Field(
        default=...,
        description="Currency code like USD, INR",
    )
    customer_id: str = Field(
        default=...,
        description="Customer's unique ID",
    )
    timestamp: str = Field(
        default=...,
        description="ISO8601 timestamp of the invoice",
    )
    anomalies: str = Field(
        default=...,
        description="Any issues like missing fields, wrong types, or suspicious data",
    )


# Parser bound to WebhookOutput; later cells use it for format instructions.
output_parser = PydanticOutputParser(pydantic_object=WebhookOutput)


In [27]:
# Step 2: Setup Few-Shot Examples
# Each example pairs a raw webhook payload ("input") with the normalized record
# expected for it ("output"). Literal JSON braces are written doubled ({{ }});
# the rendered prompt shows them as single braces.
examples = [
    # Complete payload, but under abbreviated key names.
    dict(
        input='{{"id": "INV-9001", "total": 1200, "cur": "USD", "cust": "CUST-001", "time": "2025-06-01T15:00:00Z"}}',
        output='''{{
            "invoice_id": "INV-9001",
            "amount": 1200,
            "currency": "USD",
            "customer_id": "CUST-001",
            "timestamp": "2025-06-01T15:00:00Z",
            "anomalies": "Field names differ from expected: id→invoice_id, total→amount, cur→currency, etc."
        }}''',
    ),
    # Non-numeric amount plus two missing fields; the gaps are reported in "anomalies".
    dict(
        input='{{"invoice_id": "INV-1002", "amount": "not available", "currency": "USD"}}',
        output='''{{
            "invoice_id": "INV-1002",
            "amount": 0,
            "currency": "USD",
            "customer_id": "",
            "timestamp": "",
            "anomalies": "Amount not a number. Missing customer_id and timestamp."
        }}''',
    ),
]

# Layout used to render one few-shot example: the raw payload first, then the
# record expected for it. The template begins and ends with a newline.
example_prompt = PromptTemplate(
    template="\nInput JSON:\n{input}\n\nExpected Output:\n{output}\n",
    input_variables=["input", "output"],
)

# Full prompt: the rendered examples, followed by a suffix that carries the new
# payload ({input}) and the parser's format instructions ({format_instructions}).
few_shot_prompt = FewShotPromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
    example_separator="\n\n",
    input_variables=["input", "format_instructions"],
    suffix=(
        "Now extract the following fields from this new JSON:\n"
        "{input}\n\n"
        "{format_instructions}"
    ),
)



In [31]:

# Step 3: Setup LLM Chain
from langchain.chains import LLMChain

# Groq-hosted chat model, authenticated with the key read from the environment.
llm = ChatGroq(
    model="Gemma2-9b-It",
    groq_api_key=groq_api_key,
)

# Chain that feeds the few-shot prompt to the model.
json_agent_chain = LLMChain(prompt=few_shot_prompt, llm=llm)

In [23]:
# Step 4: Run on a New JSON Payload
# Sample webhook body; three of its keys (invoice_code, client_id, time_issued)
# differ from the WebhookOutput field names.
incoming_payload = {
    "invoice_code": "INV-8888",
    "amount": 950.0,
    "currency": "EUR",
    "client_id": "CUST-089",
    "time_issued": "2025-06-02T10:30:00Z"
}

# The format instructions are supplied as the *value* of the
# {format_instructions} prompt variable, and substituted values are not parsed
# again for braces (input_json below is passed unescaped for the same reason).
# Doubling the braces here would therefore send literal "{{" / "}}" to the
# model inside the JSON schema, so the text is passed through unchanged.
# The variable name is kept because later cells reference it.
safe_format_instructions = output_parser.get_format_instructions()

# Pretty-printed JSON text of the payload, used as the {input} prompt variable.
input_json = json.dumps(incoming_payload, indent=2)

In [28]:
# Render the complete few-shot prompt and print it for inspection.
print(
    few_shot_prompt.format(
        format_instructions=safe_format_instructions,
        input=input_json,
    )
)


Input JSON:
{"id": "INV-9001", "total": 1200, "cur": "USD", "cust": "CUST-001", "time": "2025-06-01T15:00:00Z"}

Expected Output:
{
            "invoice_id": "INV-9001",
            "amount": 1200,
            "currency": "USD",
            "customer_id": "CUST-001",
            "timestamp": "2025-06-01T15:00:00Z",
            "anomalies": "Field names differ from expected: id→invoice_id, total→amount, cur→currency, etc."
        }



Input JSON:
{"invoice_id": "INV-1002", "amount": "not available", "currency": "USD"}

Expected Output:
{
            "invoice_id": "INV-1002",
            "amount": 0,
            "currency": "USD",
            "customer_id": "",
            "timestamp": "",
            "anomalies": "Amount not a number. Missing customer_id and timestamp."
        }


Now extract the following fields from this new JSON:
{
  "invoice_code": "INV-8888",
  "amount": 950.0,
  "currency": "EUR",
  "client_id": "CUST-089",
  "time_issued": "2025-06-02T10:30:00Z"
}

The output 

In [32]:

# Run the chain on the new payload; the model's text reply is stored for
# inspection and parsing in later cells.
output_text = json_agent_chain.run(
    format_instructions=safe_format_instructions,
    input=input_json,
)


In [33]:
# Show the raw model reply as the cell's result, before any parsing.
output_text

'```json\n{\n  "invoice_id": "INV-8888",\n  "amount": 950.0,\n  "currency": "EUR",\n  "customer_id": "CUST-089",\n  "timestamp": "2025-06-02T10:30:00Z",\n  "anomalies": "" \n}\n``` \n\n\nLet me know if you\'d like to see how this process would handle a different JSON input with potential anomalies!\n'

In [35]:

# Validate the raw model reply against WebhookOutput. The parser created in
# Step 1 is named `output_parser`; the previous reference to `parser` was not
# defined anywhere in the notebook and raised NameError on a fresh kernel.
parsed_output = output_parser.parse(output_text)

# Print the validated record as indented JSON.
print("🧾 LLM Output (Parsed):")
print(json.dumps(parsed_output.model_dump(), indent=2))

🧾 LLM Output (Parsed):
{
  "invoice_id": "INV-8888",
  "amount": 950.0,
  "currency": "EUR",
  "customer_id": "CUST-089",
  "timestamp": "2025-06-02T10:30:00Z",
  "anomalies": ""
}
