In [None]:
from langchain_community.chat_models import ChatOllama
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import SystemMessage, HumanMessage
from pydantic import BaseModel, Field

# Step 1: Define your output schema
class EmailAnalysis(BaseModel):
    email_text: str = Field(..., description="The content of the email")
    reason: str = Field(..., description="Explanation of whether it shows accounting abuse or not")
    score: int = Field(..., ge=0, le=10, description="Score from 0 (no abuse) to 10 (clear fraud)")

parser = PydanticOutputParser(pydantic_object=EmailAnalysis)

# Step 2: Initialize your model
llm = ChatOllama(model="deepseek-r1:14b", temperature=1.0, max_tokens=512)

# Step 3: Construct the full prompt
system_card = SystemMessage(content="""
You are a forensic accountant and finance specialist. Your task is to review Enron emails to assess whether they suggest improper or fraudulent accounting behavior.

Focus on identifying evidence that losses were intentionally hidden using practices such as:
- Special Purpose Entities (SPEs), including those referred to as "Raptors"
- Mark-to-market accounting used to artificially boost profit

Only consider emails that reflect:
- Insider knowledge
- Intent or justification
- Concern about legality or ethics

🛑 Do not score emails that are:
- News articles or summaries
- Forwarded press content
- HR, calendar invites, or logistics

Return output in structured JSON matching this schema:
- email_text: str
- reason: str
- score: int (0 = no abuse, 10 = outright fraud)
""")

# The actual email to analyze
email = "I'm concerned that the mark to market accounting practices that we used to hide losses in the Raptor SBE could be viewed as fraud."

human_prompt = HumanMessage(content=f"""
Please analyze the following email and return your result as valid JSON.

EMAIL:
\"{email}\"

{parser.get_format_instructions()}
""")

# Step 4: Call the model and parse result
response = llm([system_card, human_prompt])
parsed = parser.parse(response.content)

# Step 5: Use result
#print(parsed.json(indent=2))

/var/folders/b5/q7jfctvs3wl6sly80nxcpg3h0000gn/T/ipykernel_67154/413316477.py:59: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print(parsed.json(indent=2))


TypeError: `dumps_kwargs` keyword arguments are no longer supported.

In [23]:
def test_email_analysis(email: str):
    # The actual email to analyze
    
    human_prompt = HumanMessage(content=f"""
    Please analyze the following email and return your result as valid JSON.

    EMAIL:
    \"{email}\"

    {parser.get_format_instructions()}
    """)

    # Step 4: Call the model and parse result
    response = llm([system_card, human_prompt])

    # Step 5: Use result
    return response.content

In [11]:
!ls -lh ../data/resultsMix8_all.csv

-rw-r--r--  1 kariato  staff   1.3M Mar 21  2024 ../data/resultsMix8_all.csv


In [36]:
import pandas as pd
df = pd.read_csv("../data/resultsMix8_all.csv")
emails=df['email'].to_list()
results=[]
for email in emails:
    raw_string = test_email_analysis(email)
    # Step 1: Extract the <think> section
    think_match = re.search(r"<think>\s*(.*?)\s*</think>", raw_string, re.DOTALL)
    thinking_text = think_match.group(1).strip() if think_match else None

    # Step 2: Extract the JSON block
    json_match = re.search(r"```json\s*(\{.*?\})\s*```", raw_string, re.DOTALL)
    json_str = json_match.group(1).strip() if json_match else None

    # Step 3: Parse JSON and combine
    if thinking_text and json_str:
        try:
            parsed_dict = json.loads(json_str)
            parsed_dict["thinking"] = thinking_text
            print("✅ Combined Result:")
            results.append(parsed_dict)
            pd.DataFrame(results).to_csv("DSR1results.csv", index=False)
            # Print the combined result
            print(json.dumps(parsed_dict, indent=2))
        except json.JSONDecodeError as e:
            print("❌ JSON Parsing Error:", e)
    else:
        print("❌ Could not extract both <think> and JSON block")


✅ Combined Result:
{
  "email_text": "Subject: PNM lossesGuys, I called PNM again today to inquire about the losses that are based on 3% of the market price that is determined by PNM's marketers. Currently, they were priced at $100 market price times 3%, or $3 per MWh. Don at PNM transmission did not respond to me by yesterday as I had requested, and of course when I called him today, he is out until Monday. So I talked to a Mark who could not give me much of an update. The last time I talked to PNM, Don was going to take my request to have July's Market price that PNM was using for losses moved down to below the cap of $91.87. As of today, they have changed only August's transmission loss factor to the current cap price of $91.87 and posted this on their website as of this week. I also asked why they let their marketers decide on a market price for losses when marketers are not supposed to be involved in transmission or transmission pricing. Mark's response was that Losses are not anc

In [33]:
import json
import re

# Your input string
raw_string = result

# Step 1: Extract the <think> section
think_match = re.search(r"<think>\s*(.*?)\s*</think>", raw_string, re.DOTALL)
thinking_text = think_match.group(1).strip() if think_match else None

# Step 2: Extract the JSON block
json_match = re.search(r"```json\s*(\{.*?\})\s*```", raw_string, re.DOTALL)
json_str = json_match.group(1).strip() if json_match else None

# Step 3: Parse JSON and combine
if thinking_text and json_str:
    try:
        parsed_dict = json.loads(json_str)
        parsed_dict["thinking"] = thinking_text
        print("✅ Combined Result:")
        print(json.dumps(parsed_dict, indent=2))
    except json.JSONDecodeError as e:
        print("❌ JSON Parsing Error:", e)
else:
    print("❌ Could not extract both <think> and JSON block")

✅ Combined Result:
{
  "email_text": "Subject: PNM losses\nGuys, I called PNM again today to inquire about the losses that are based on 3% of the market price that is determined by PNM's marketers. Currently, they were priced at $100 market price times 3%, or $3 per MWh. Don at PNM transmission did not respond to me by yesterday as I had requested, and of course when I called him today, he is out until Monday. So I talked to a Mark who could not give me much of an update.\n\nThe last time I talked to PNM, Don was going to take my request to have July's Market price that PNM was using for losses moved down to below the cap of $91.87. As of today, they have changed only August's transmission loss factor to the current cap price of $91.87 and posted this on their website as of this week.\n\nI also asked why they let their marketers decide on a market price for losses when marketers are not supposed to be involved in transmission or transmission pricing. Mark's response was that Losses are