# Loading Libraries

In [122]:
import pandas as pd
import os
import email
import email.policy
from pydantic import BaseModel, ValidationError
import requests
import json
import numpy as np
import pytesseract
from PIL import Image
import cv2
from tqdm import tqdm
try:
    # pdf2image might not be installed in all environments
    from pdf2image import convert_from_bytes
    HAVE_PDF2IMAGE = True
except ImportError:
    print("pdf2image not installed. PDF OCR will fail.")
    HAVE_PDF2IMAGE = False
from sklearn.metrics import classification_report
from typing import Any

# Extracting Email Data

In [123]:
def extract_email_data(eml_file_path: str) -> dict:
    """
    Parse a .eml file, return a dictionary containing:
      {
        "from": str or None,
        "to": str or None,
        "subject": str or None,
        "body": str or None,      # Prefer plain text if available
        "attachments": list of {
            "filename": str,
            "content_type": str,
            "data": bytes
        }
      }

    Args:
        eml_file_path (str): Full path to the .eml file.

    Returns:
        dict: A dictionary with the extracted data.
    """
    # 1) Read the raw .eml text
    with open(eml_file_path, "r", encoding="utf-8", errors="replace") as f:
        raw_email = f.read()

    # 2) Parse into an EmailMessage (new-style) using a modern policy
    msg = email.message_from_string(raw_email, policy=email.policy.default)

    # 3) Extract top-level headers
    from_ = msg.get("From", "")
    to_ = msg.get("To", "")
    subject = msg.get("Subject", "")

    # 4) Walk the parts to find the best body and attachments
    body_text = None
    attachments = []

    for part in msg.walk():
        # If it's a container (multipart/*), skip it—we want actual payload parts
        if part.get_content_maintype() == "multipart":
            continue

        # Check for attachments
        filename = part.get_filename()
        if filename:
            # It's an attachment
            attach_data = part.get_payload(decode=True)
            attachments.append({
                "filename": filename,
                "content_type": part.get_content_type(),
                "data": attach_data
            })
        else:
            # Potentially a text part (plain or html or something else)
            ctype = part.get_content_type()
            if ctype == "text/plain":
                # If we haven't chosen a body yet, or we prefer plain text,
                # decode it here
                if body_text is None:
                    payload = part.get_payload(decode=True)
                    charset = part.get_content_charset() or "utf-8"
                    body_text = payload.decode(charset, errors="replace")

            elif ctype == "text/html":
                # Optionally handle HTML if no plain text was found
                if body_text is None:
                    payload = part.get_payload(decode=True)
                    charset = part.get_content_charset() or "utf-8"
                    body_text = payload.decode(charset, errors="replace")

    # Build the result
    result = {
        "from": from_,
        "to": to_,
        "subject": subject,
        "body": body_text,
        "attachments": attachments
    }
    return result


In [124]:
extract_email_data("./emails_attachments/email_pdf_text_1.eml")

{'from': 'client.relations@abcindustries.com',
 'to': 'support@bigbank.com',
 'subject': 'Request for Payment Term Adjustment',
 'body': None,
 'attachments': [{'filename': 'body_converted.pdf',
   'content_type': 'application/pdf',
   'data': b'%PDF-1.3\n%\x93\x8c\x8b\x9e ReportLab Generated PDF document http://www.reportlab.com\n1 0 obj\n<<\n/F1 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font\n>>\nendobj\n3 0 obj\n<<\n/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources <<\n/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]\n>> /Rotate 0 /Trans <<\n\n>> \n  /Type /Page\n>>\nendobj\n4 0 obj\n<<\n/PageMode /UseNone /Pages 6 0 R /Type /Catalog\n>>\nendobj\n5 0 obj\n<<\n/Author (anonymous) /CreationDate (D:20250326085707+00\'00\') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250326085707+00\'00\') /Producer (ReportLab PDF Library - www.reportlab.com) \n  /Subjec

In [125]:
extract_email_data("./emails_image_attachments/email_pdf_text_1.eml")

{'from': 'client.relations@abcindustries.com',
 'to': 'support@bigbank.com',
 'subject': 'Request for Payment Term Adjustment',
 'body': None,
 'attachments': [{'filename': 'body_image.pdf',
   'content_type': 'application/pdf',
   'data': b'%PDF-1.3\n%\x93\x8c\x8b\x9e ReportLab Generated PDF document http://www.reportlab.com\n1 0 obj\n<<\n/F1 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font\n>>\nendobj\n3 0 obj\n<<\n/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 400 /Length 21779 /Subtype /Image \n  /Type /XObject /Width 800\n>>\nstream\nGb"0WI8!UCReZCi7Knc)#sgl)M1ukB%[=uXkWf/C#q?,]CZ5W6\'TR(j5N>EqMQW!+kU,"u<eD=3BZCut@^(1]OgmlKR<@"q7N>LM[PG3U=N*35"WA6^`3+N;S=_aLn%&0r.LL?K5CZY(\\ol;>?>f"QXJ21l&-)\\1zzzzzzzzzzzzzzzzzz!596n+sKo_;GpFlJ+Aeqr:[V7IHs#qo0<8=St7Q[B$HcE?XM]3\'E"/5gU7u>=]nm"\\$3(j\'n#Pj([l\'C!!%Pr%3%3!"`8*nBk_:RJ,]8uj[i!6\\9$hn"TS!6s06P%_B,3([i)N#m6q!TJDO`f:+fnfS=#\\r

In [126]:
extract_email_data("./email_att.eml")

{'from': 'Yagnic Chandra <yagnic40@gmail.com>',
 'to': 'Yagnic Chandra <yagnic40@gmail.com>',
 'subject': 'Request for General Adjustment to Payment Terms',
 'body': 'Dear Banking Services Team,\\n\\nI hope this message finds you well.\\n\\nI am\nwriting to request a general adjustment regarding the payment terms\nassociated with our current facility. We would like to discuss modifying\nthe existing payment schedule due to recent changes in our cash flow\nmanagement strategy. Kindly let us know the documentation or procedural\nrequirements to initiate this adjustment.\\n\\nPlease confirm receipt of this\nrequest and advise on the next steps at your earliest convenience.\\n\\nThank\nyou for your assistance.\\n\\nBest regards,\\n\\nJohn Doe  \\nCorporate Finance\nManager  \\nXYZ Corporation\n',
 'attachments': [{'filename': 'sample_bs_clear.pdf',
   'content_type': 'application/pdf',
   'data': b'%PDF-1.7\n%\xe2\xe3\xcf\xd3\n4 0 obj\n<<\n/Type /XObject\n/Subtype /Image\n/Width 900\n/Heig

# Compiling email data into a dataframe

In [127]:
emails = []

for index in range(303):
    email_dict = extract_email_data(f"./emails/email{index}.eml")
    email_dict["attachment"] = "no"
    emails.append(email_dict)
    

In [128]:
final_df = pd.DataFrame(emails)

In [129]:
emails = []
for index in range(303):
    email_dict = extract_email_data(f"./emails_attachments/email_pdf_text_{index}.eml")
    email_dict["attachment"] = "pdf_text"
    emails.append(email_dict)
    

In [130]:
final_df_text = pd.DataFrame(emails)

In [131]:
emails = []
for index in range(303):
    email_dict = extract_email_data(f"./emails_image_attachments/email_pdf_text_{index}.eml")
    email_dict["attachment"] = "pdf_image"
    emails.append(email_dict)
    

In [132]:
final_df_image = pd.DataFrame(emails)

# Target Extraction

In [133]:
target_data = pd.read_csv("emails.csv")

In [134]:
target_data.head()

Unnamed: 0,index,email,time
0,0,"{'From': 'client.relations@example.com', 'To':...",38.141095
1,0,"{'From': 'client.relations@abcindustries.com',...",40.733314
2,1,"{'From': 'client.relations@example.com', 'To':...",39.430762
3,2,"{'From': 'client.services@domain.com', 'To': '...",36.887012
4,3,{'From': 'customer.relations@xyzcorporation.co...,40.739424


In [135]:
target_data.email.values[0]

"{'From': 'client.relations@example.com', 'To': 'banking.services@bigbank.com', 'Subject': 'Request for General Adjustment to Payment Terms', 'body': 'Dear Banking Services Team,\\n\\nI hope this message finds you well.\\n\\nI am writing to request a general adjustment regarding the payment terms associated with our current facility. We would like to discuss modifying the existing payment schedule due to recent changes in our cash flow management strategy. Kindly let us know the documentation or procedural requirements to initiate this adjustment.\\n\\nPlease confirm receipt of this request and advise on the next steps at your earliest convenience.\\n\\nThank you for your assistance.\\n\\nBest regards,\\n\\nJohn Doe  \\nCorporate Finance Manager  \\nXYZ Corporation', 'request_type': 'Adjustment', 'subrequest_type': 'General Adjustment'}"

In [136]:
final_df["request_type"] = target_data.email.apply(lambda x: eval(x)['request_type'])

In [137]:
final_df["subrequest_type"] = target_data.email.apply(lambda x: eval(x)['subrequest_type'])

In [138]:
final_df_text["request_type"] = target_data.email.apply(lambda x: eval(x)['request_type'])
final_df_text["subrequest_type"] = target_data.email.apply(lambda x: eval(x)['subrequest_type'])

In [139]:
final_df_image["request_type"] = target_data.email.apply(lambda x: eval(x)['request_type'])
final_df_image["subrequest_type"] = target_data.email.apply(lambda x: eval(x)['subrequest_type'])

In [140]:
final_df = pd.concat([final_df,final_df_text, final_df_image],axis=0)

In [141]:
final_df.shape

(909, 8)

In [142]:
final_df.isnull().sum()

from                 0
to                   0
subject              0
body               606
attachments          0
attachment           0
request_type         0
subrequest_type      0
dtype: int64

# LLM Based Classification

## Validation Code

In [143]:
class ExtractCorrectAnswer(BaseModel):
    request_type: str
    subrequest_type: str


    
    

In [160]:
prompt = """Consider yourself as a customer bank request classification expert who can classify any kind of customer bank request emails. You need to classify them into below request and subrequest categories {
  "Adjustment": {
    "General Adjustment": "For modifications in payment terms or amounts.",
    "subrequest": {
      "Rate Correction Adjustment": "Adjustments targeting specific rate errors.",
      "Data Correction": "Fixes for data entry mistakes."
    }
  },
  "AU Transfer": {
    "Standard AU Transfer": "Routine authorized transfers.",
    "subrequest": {
      "Intra-AU Reallocation": "Reassigning funds within segments under the same authorization.",
      "Cross-Entity Transfer": "Transfers between subsidiaries/legal entities within the same group.",
      "Split Transfer Request": "Single transfer split among multiple destination accounts."
    }
  },
  "Closing Notice": {
    "subrequest": {
      "Reallocation Fees": "",
      "Reallocation Principal": "",
      "Amendment Fees": "",
      "Partial Settlement Notice": "For partial settlements requiring separate tracking.",
      "Overpayment Notification": "To handle overpayments distinctly.",
      "Early Closure Notification": "Facility closed before scheduled date.",
      "Deferred or Delayed Closure": "Postponed closure due to issues.",
      "Document Submission Requirement": "Closure pending due to missing documentation."
    }
  },
  "Commitment Change": {
    "subrequest": {
      "Cashless Roll": "",
      "Decrease": "",
      "Increase": "",
      "Drawdown Revision": "Adjustments to scheduled drawdown amounts.",
      "Borrowing Base Recalculation": "Triggered by changes in collateral value.",
      "Commitment Reaffirmation": "Reaffirming existing commitment without altering financials.",
      "Term Extension": "Extending the facility term.",
      "Covenant Update": "Changes in covenants like financial ratios."
    }
  },
  "Fee Payment": {
    "subrequest": {
      "Ongoing Fee": "",
      "Letter of Credit Fee": "",
      "Retroactive Fee Correction": "Adjusting fees from previous periods.",
      "Fee Allocation Across Accounts": "Splitting fees across accounts.",
      "Fee Waiver Request": "Request to waive fees.",
      "Fee Reversal": "Reversing erroneously charged fees."
    }
  },
  "Money Movement Inbound": {
    "subrequest": {
      "Principal": "",
      "Interest": "",
      "Principal + Interest": "",
      "Principal + Interest + Fee": "",
      "Consolidated Payment Notification": "Aggregating multiple incoming payments.",
      "Segregated or Partitioned Payment": "Earmarking received funds for different purposes.",
      "Currency Conversion Inbound": "Inbound payment received in foreign currency.",
      "Escrow Payment": "Funds directed into escrow.",
      "Partial Payment": "Portion of expected payment received."
    }
  },
  "Money Movement Outbound": {
    "subrequest": {
      "Timebound": "",
      "Foreign Currency": "",
      "Failed Transfer Resolution": "Corrective action for failed outbound transfers.",
      "Reversal and Reissue": "Reversing and correctly reissuing outbound payments.",
      "Scheduled Outbound Payment Notification": "Future-dated outbound payment setup.",
      "Automated Reversal": "Automated error-triggered reversal process.",
      "Fee Refund": "Returning fees as part of outbound transaction."
    }
  },
  "Additional Process/Operational Exceptions": {
    "subrequest": {
      "System-Generated Error Correction": "Correcting system-generated errors.",
      "Manual Intervention Alert": "Request for manual review due to unexpected data."
    }
  },
  "Collateral Management": {
    "subrequest": {
      "Collateral Revaluation Request": "Request for updated or additional collateral.",
      "Collateral Deficiency Notification": "Insufficient collateral notification."
    }
  },
  "Documentation & Compliance": {
    "subrequest": {
      "Missing Documentation Follow-up": "Follow-up for unreceived documentation.",
      "Digital Signature Verification Issue": "Invalid digital signature flag."
    }
  }
}


Output format:{"request_type":, "subrequest_type":}. Please don't provide anything other than the json"""

In [145]:
def find_benefits(prompt: str, data_dict: str) -> dict:
    ollama_api_url = "http://localhost:11434/api/chat"

    # Ensure the prompt is structured correctly
    full_prompt = f"{prompt}\n\n email: {data_dict}"

    # Request payload for Ollama
    payload = {
        "model": "llama3.1",
        "messages": [{"role": "user", "content": full_prompt}],
        "options": {
            "seed": 101,
            "temperature": 0
        },
        "stream": False
    }

    try:


        error = True

        while error == True:
        
            response = requests.post(ollama_api_url, json=payload)
            response.raise_for_status()

            # Extract response JSON correctly
            response_json = response.json()
            model_response = response_json.get("message", {}).get("content", "")

            
            # model_response = model_response.split("</think>")[-1][9:-3]
            
            print(model_response)

            # Convert and validate JSON response
            parsed_response = json.loads(model_response)  # Extract valid JSON
            validated_response = ExtractCorrectAnswer(**parsed_response)
            
            return validated_response.model_dump()  # Return as dictionary
        
        

    except (json.JSONDecodeError, ValidationError, ValueError) as e:
        print(f"Error parsing model response: {e}")
        return  {"request_type": "manual", "subrequest_type":None}
    except requests.exceptions.RequestException as re:
        print(f"Request Error: {re}")
        return {"request_type": "manual", "subrequest_type":None}

    except Exception as e:
        print(f"Unexpected Error: {e}")
        return {"request_type": "manual", "subrequest_type":None}


##  Cleaning data for UTF-8

In [146]:
def clean_text(text):
    if isinstance(text, str):
        return text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
    return text

final_df['subject'] = final_df['subject'].apply(clean_text)
final_df['body'] = final_df['body'].apply(clean_text)
final_df['attachments'] = final_df['attachments'].apply(clean_text)


In [147]:
# Force all fields to string type and clean them
final_df['subject'] = final_df['subject'].astype(str).str.encode('utf-8', errors='ignore').str.decode('utf-8', errors='ignore')
final_df['body'] = final_df['body'].astype(str).str.encode('utf-8', errors='ignore').str.decode('utf-8', errors='ignore')
final_df['attachments'] = final_df['attachments'].astype(str).str.encode('utf-8', errors='ignore').str.decode('utf-8', errors='ignore')


## Final JSON Preparation

In [148]:
final_df_json = json.loads(final_df[['subject', 'body', 'attachments']].to_json(orient="records"))

## Test for LLM Response

In [158]:
data_dict = final_df_json[0]

In [161]:
find_benefits(prompt,data_dict)

{"request_type": "Adjustment", "subrequest_type": "General Adjustment"}


{'request_type': 'Adjustment', 'subrequest_type': 'General Adjustment'}

# PDF / Image OCR

In [151]:

def preprocess_image_for_ocr(pil_img: Image.Image) -> np.ndarray:
    """
    Convert a PIL Image to an OpenCV (NumPy) image, apply grayscale
    and adaptive threshold for better OCR on noisy/uneven scans.
    """
    img = np.array(pil_img)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31, 2
    )
    return thresh


def run_ocr_on_pil(pil_img: Image.Image) -> str:
    """
    Apply OpenCV preprocessing to a PIL Image,
    then run Tesseract with custom config.
    """
    processed = preprocess_image_for_ocr(pil_img)
    config_str = r"--oem 3 --psm 6"
    pil_processed = Image.fromarray(processed)
    text = pytesseract.image_to_string(pil_processed, config=config_str)
    return text


def ocr_pdf(pdf_data: bytes) -> str:
    """
    Convert PDF bytes to images (pdf2image), then
    do OCR on each page. Returns full text.
    """
    if not HAVE_PDF2IMAGE:
        return "[Error: pdf2image not installed, cannot OCR PDF]"
    
    pages = convert_from_bytes(pdf_data)
    all_texts = []
    for page_img in pages:
        page_text = run_ocr_on_pil(page_img)
        all_texts.append(page_text)
    return "\n".join(all_texts)


def ocr_image(img_data: bytes) -> str:
    """
    Load the raw image (PNG/JPG/etc.) into PIL,
    do OpenCV-based preprocessing, and run Tesseract.
    """
    with io.BytesIO(img_data) as buf:
        pil_img = Image.open(buf).convert("RGB")
        text = run_ocr_on_pil(pil_img)
    return text





In [152]:
ocr_pdf(eval(final_df.tail().attachments.values[0])[0]['data'])

'Hi Team\n\nPlease proceed with initiating a standard authorized transfer from our\nOperating account ending in 8921 toour project reserve accountending\njin 4476 for the amountof AUD 1,250,000. This is part of our scheduled\nfund movement tosupport Q2 disbursements.\n\nLetme knowonce the transfer has been processed, or if you require\n\nany additional approvals or documentation.\n\nThanks,\n\nAndrew\n\x0c'

In [153]:
eval(final_df.tail().attachments.values[0])[0]['data']

b'%PDF-1.3\n%\x93\x8c\x8b\x9e ReportLab Generated PDF document http://www.reportlab.com\n1 0 obj\n<<\n/F1 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font\n>>\nendobj\n3 0 obj\n<<\n/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 256 /Length 13876 /Subtype /Image \n  /Type /XObject /Width 800\n>>\nstream\nGb"0WI8#o-T"sk5[_,/8STR=tFJGOh$]Z@@>:jff+Q1H&J0ph=0,JC_b;S*uGG8bbACI!LXdu^_*Ct:p>\'17^Z`q#YMkm[3\\kZeERB@#;E"K&-$_d\\H<C$\'A0:_[Ve)rjfH9h_an)ob$F3d#4G@$l=S0.L^MTZ/\'zzzzzzzzzzzzzzzzzz!!!!pKVtGk>(9^]?!UQ7BN1:`QH&[jnaTJ1cT_5BW$_oP!:YfKc*+cr$ig8-S\\+gB2O-$I>?b<u041<8PS=oM=&i8^l@6u--71&`,%BlhX[_@$X&i)d2Q`pV8/b9p3n8H]!!(q32Rr["c\'gL;:S*ffq=Ebo,9s%HPEU(SfWjboYD7daW&B*RIaU\\(N#FQGEj0WJc5A89!!#.$!WiG_:#r:RpRGTEL5$UP_SX09H[@h83d0rHR@BZb(`6pSQ?lV^U=IlIk:S(]!5N)VE:7..Qn[LmmFtO#`2U&J;@9D&WKLUG2)PO@"h>"0IK]]OScj\\hYEgbE[X\\=+.3(O!Gent8;A_K=CjUMU?!X#)YHPQ7Wc7+ER0UG5!!%P3@rV6D@q2:0<=Xd/0eor

In [154]:
final_df_json[0]

{'subject': 'Request for General Adjustment to Payment Terms',
 'body': 'Dear Banking Services Team,\n\nI hope this message finds you well.\n\nI am writing to request a general adjustment regarding the payment terms associated with our current facility. We would like to discuss modifying the existing payment schedule due to recent changes in our cash flow management strategy. Kindly let us know the documentation or procedural requirements to initiate this adjustment.\n\nPlease confirm receipt of this request and advise on the next steps at your earliest convenience.\n\nThank you for your assistance.\n\nBest regards,\n\nJohn Doe  \nCorporate Finance Manager  \nXYZ Corporation\n',
 'attachments': '[]'}

In [155]:
final_df.head()

Unnamed: 0,from,to,subject,body,attachments,attachment,request_type,subrequest_type
0,client.relations@example.com,banking.services@bigbank.com,Request for General Adjustment to Payment Terms,"Dear Banking Services Team,\n\nI hope this mes...",[],no,Adjustment,General Adjustment
1,client.relations@abcindustries.com,support@bigbank.com,Request for Payment Term Adjustment,"Dear Support Team,\n\nI hope this message find...",[],no,Adjustment,General Adjustment
2,client.relations@example.com,bank.operations@bankexample.com,Request for Interest Rate Correction Adjustment,"Dear Team,\n\nI hope this message finds you we...",[],no,Adjustment,Rate Correction Adjustment
3,client.services@domain.com,support@bankingcorp.com,Request for Data Correction in Transaction Rec...,"Dear Support Team,\n\nI hope this message find...",[],no,Adjustment,Data Correction
4,customer.relations@xyzcorporation.com,transfersupport@bankservices.com,Request for Standard Authorized Transfer,"Dear Team,\n\nWe would like to initiate a stan...",[],no,AU Transfer,Standard AU Transfer


In [48]:
final_df.tail().body.values[0]

'None'

In [49]:
outputs = []

In [None]:
for jso in tqdm(final_df_json):
    temp_json = eval(jso["attachments"])
    if temp_json != []:

        text = ""
        
        for index,attachment in enumerate(temp_json):
            print(attachment)
            attachment_text = ocr_pdf(attachment['data'])
            text += text + f"attachement_{index}" + attachment_text

        final_text_sent_model = text + jso["subject"] + jso["body"]
        output = find_benefits(prompt,final_text_sent_model)
        outputs.append(output)

    else:

        final_text_sent_model = jso["subject"] + jso["body"]
        output = find_benefits(prompt,final_text_sent_model)
        outputs.append(output)








In [50]:
from multiprocessing import Pool, cpu_count
from tqdm import tqdm


def process_jso(args):
    """
    Processes a single jso dictionary.

    Args:
        args (tuple): Typically (jso, prompt)

    Returns:
        The output from find_benefits(...) for this jso
    """
    jso, prompt = args

    temp_json = eval(jso["attachments"])

    if temp_json:
        text = ""
        for index, attachment in enumerate(temp_json):
            attachment_text = ocr_pdf(attachment["data"]) 
            text += f"attachement_{index}{attachment_text}"


        final_text_sent_model = text + jso["subject"] + jso["body"]

    else:
        final_text_sent_model = jso["subject"] + jso["body"]


    output = find_benefits(prompt, final_text_sent_model)
    return output


def parallel_process(final_df_json, prompt):
    """
    Runs 'process_jso' in parallel over final_df_json,
    preserving order. Returns a list of results in the
    same order as final_df_json.
    """

    args_list = [(jso, prompt) for jso in final_df_json]


    with Pool(processes=cpu_count()) as pool:

        results = list(tqdm(pool.map(process_jso, args_list),
                            total=len(args_list),
                            desc="Processing JSON items"))

    return results




 

# Llama 3.1:8b

In [162]:
outputs = parallel_process(final_df_json, prompt)
print("All done!")


{"request_type": "Money Movement Outbound", "subrequest_type": "Scheduled Outbound Payment Notification"}{"request_type": "Closing Notice", "subrequest_type": "Document Submission Requirement"}

{"request_type": "Fee Payment", "subrequest_type": "Fee Reversal"}
{"request_type": "Fee Payment", "subrequest_type": "Fee Allocation Across Accounts"}
{"request_type": "Adjustment", "subrequest_type": "General Adjustment"}
{"request_type": "Fee Payment", "subrequest_type": "Retroactive Fee Correction"}
{"request_type": "AU Transfer", "subrequest_type": "Intra-AU Reallocation"}
{"request_type": "Commitment Change", "subrequest_type": "Borrowing Base Recalculation"}
{"request_type": "Fee Payment", "subrequest_type": "Letter of Credit Fee"}
{"request_type": "Fee Payment", "subrequest_type": "Ongoing Fee"}
{"request_type": "Money Movement Inbound", "subrequest_type": "Consolidated Payment Notification"}
{"request_type": "Documentation & Compliance", "subrequest_type": "Digital Signature Verificati

Processing JSON items: 100%|██████████| 909/909 [00:00<00:00, 691694.91it/s]

All done!





In [163]:
outputs[-1]

{'request_type': 'Closing Notice', 'subrequest_type': 'Reallocation Fees'}

In [164]:
outputs_df = pd.DataFrame(outputs)

In [165]:
final_df.request_type.unique()

array(['Adjustment', 'AU Transfer', 'Closing Notice', 'Commitment Change',
       'Fee Payment', 'Money Movement Inbound', 'Money Movement Outbound',
       'Additional Process/Operational Exceptions',
       'Collateral Management', 'Documentation & Compliance'],
      dtype=object)

In [166]:
outputs_df.request_type.unique()

array(['Adjustment', 'AU Transfer', 'Closing Notice', 'Commitment Change',
       'Fee Payment', 'Money Movement Inbound', 'Money Movement Outbound',
       'Additional Process/Operational Exceptions',
       'Collateral Management', 'Documentation & Compliance'],
      dtype=object)

In [167]:
outputs_df.isnull().sum()

request_type       0
subrequest_type    0
dtype: int64

In [168]:
final_df["combined_target"] = final_df["request_type"] + "-" + final_df["subrequest_type"]

In [169]:
outputs_df["combined_target"] = outputs_df["request_type"] + "-" + outputs_df["subrequest_type"]

In [170]:
final_df["prediction"] = outputs_df["combined_target"]

In [171]:
final_df.isnull().sum()

from               0
to                 0
subject            0
body               0
attachments        0
attachment         0
request_type       0
subrequest_type    0
combined_target    0
prediction         0
dtype: int64

In [172]:
final_df["prediction"] = final_df.prediction.fillna("")

# Performance

In [174]:
print(classification_report(final_df["combined_target"],final_df["prediction"]))

                                                                             precision    recall  f1-score   support

                                          AU Transfer-Cross-Entity Transfer       1.00      1.00      1.00        21
                                          AU Transfer-Intra-AU Reallocation       1.00      1.00      1.00        21
                                         AU Transfer-Split Transfer Request       1.00      1.00      1.00        21
                                           AU Transfer-Standard AU Transfer       1.00      1.00      1.00        21
        Additional Process/Operational Exceptions-Manual Intervention Alert       1.00      1.00      1.00        18
Additional Process/Operational Exceptions-System-Generated Error Correction       1.00      1.00      1.00        18
                                                 Adjustment-Data Correction       1.00      1.00      1.00        18
                                              Adjustment-Genera

# Extraction of related data

In [176]:
class ExtractCorrectAnswer(BaseModel):

    account_number: str| None = None
    principle_amount:float| None = None
    interest:float| None = None
    fees:float| None = None
    escalation:str| None = None
    appreciation:str| None = None


    
    

In [177]:
prompt = """Consider yourself as a customer bank request classification expert who can classify any kind of customer bank request emails. You need to  identify account_number, principle_amount, interest, fees, escalation, appreciation. If any of these are not present return null. Also, just give the json output and nothing else.

.Output should only be a json with all these keys present in json. If any of the keys doesn't have value give null as value { "account_number":float, "principle_amount":float, "interest":float, "fees":float , "escalation":str ,"appreciation": str} Please don't provide anything other than the json"""

In [178]:
def find_attributes(prompt: str, data_dict: str) -> dict:
    ollama_api_url = "http://localhost:11434/api/chat"

    # Ensure the prompt is structured correctly
    full_prompt = f"{prompt}\n\n email: {data_dict}"

    # Request payload for Ollama
    payload = {
        "model": "llama3.1",
        "messages": [{"role": "user", "content": full_prompt}],
        "options": {
            "seed": 101,
            "temperature": 0
        },
        "stream": False
    }

    try:


        error = True

        while error == True:
        
            response = requests.post(ollama_api_url, json=payload)
            response.raise_for_status()

            # Extract response JSON correctly
            response_json = response.json()
            model_response = response_json.get("message", {}).get("content", "")

            
            # model_response = model_response.split("</think>")[-1][9:-3]
            
            print(model_response)

            # Convert and validate JSON response
            parsed_response = json.loads(model_response)  # Extract valid JSON
            validated_response = ExtractCorrectAnswer(**parsed_response)
            
            return validated_response.model_dump()  # Return as dictionary
        
        

    except (json.JSONDecodeError, ValidationError, ValueError) as e:
        print(f"Error parsing model response: {e}")
        return  {"request_type": "manual", "subrequest_type":None}
    except requests.exceptions.RequestException as re:
        print(f"Request Error: {re}")
        return {"request_type": "manual", "subrequest_type":None}

    except Exception as e:
        print(f"Unexpected Error: {e}")
        return {"request_type": "manual", "subrequest_type":None}


In [179]:
from multiprocessing import Pool, cpu_count
from tqdm import tqdm


def process_jso(args):
    """
    Processes a single jso dictionary.

    Args:
        args (tuple): Typically (jso, prompt)

    Returns:
        The output from find_benefits(...) for this jso
    """
    jso, prompt = args

    temp_json = eval(jso["attachments"])

    if temp_json:
        text = ""
        for index, attachment in enumerate(temp_json):
            attachment_text = ocr_pdf(attachment["data"]) 
            text += f"attachement_{index}{attachment_text}"


        final_text_sent_model = text + jso["subject"] + jso["body"]

    else:
        final_text_sent_model = jso["subject"] + jso["body"]


    output = find_attributes(prompt, final_text_sent_model)
    return output


def parallel_process(final_df_json, prompt):
    """
    Runs 'process_jso' in parallel over final_df_json,
    preserving order. Returns a list of results in the
    same order as final_df_json.
    """

    args_list = [(jso, prompt) for jso in final_df_json]


    with Pool(processes=cpu_count()) as pool:

        results = list(tqdm(pool.map(process_jso, args_list),
                            total=len(args_list),
                            desc="Processing JSON items"))

    return results




 

In [180]:
outputs_attributes = parallel_process(final_df_json, prompt)
print("All done!")


{
  "account_number": null,
  "principle_amount": null,
  "interest": null,
  "fees": null,
  "escalation": null,
  "appreciation": null
}
{
  "account_number": null,
  "principle_amount": null,
  "interest": null,
  "fees": 0.0,
  "escalation": null,
  "appreciation": null
}
{
  "account_number": null,
  "principle_amount": 3250000.0,
  "interest": null,
  "fees": null,
  "escalation": null,
  "appreciation": null
}
{
    "account_number": 9876543210,
    "principle_amount": 1200000.0,
    "interest": null,
    "fees": null,
    "escalation": null,
    "appreciation": null
}
Error parsing model response: 1 validation error for ExtractCorrectAnswer
account_number
  Input should be a valid string [type=string_type, input_value=9876543210, input_type=int]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
{
  "account_number": null,
  "principle_amount": null,
  "interest": null,
  "fees": null,
  "escalation": null,
  "appreciation": null
}
{
  "account_num

Processing JSON items: 100%|██████████| 909/909 [00:00<00:00, 1827719.24it/s]

All done!





In [181]:
outputs_df_attributes = pd.DataFrame(outputs_attributes)

In [182]:
outputs_df_attributes.shape

(909, 8)

In [190]:
final_df = final_df.reset_index(drop=True)

In [195]:
final_df_attributes = final_df.copy(deep=True)

In [196]:
for col in outputs_df_attributes.columns:
    final_df_attributes[col] = outputs_df_attributes[col]


In [197]:
final_df_attributes.tail()

Unnamed: 0,from,to,subject,body,attachments,attachment,request_type,subrequest_type,combined_target,prediction,account_number,principle_amount,interest,fees,escalation,appreciation
904,andrew.miller@sunstoneholdings.com,operations@globalfinancecorp.com,Initiation of Authorized Transfer Request,,"[{'filename': 'body_image.pdf', 'content_type'...",pdf_image,manual,,AU Transfer-Standard AU Transfer,AU Transfer-Standard AU Transfer,,,,,,
905,finops_team@clientco.com,relationship.manager@bankingpartner.com,Request for Intra-AU Reallocation of Funds,,"[{'filename': 'body_image.pdf', 'content_type'...",pdf_image,,,AU Transfer-Intra-AU Reallocation,AU Transfer-Intra-AU Reallocation,,,,,,
906,finance.operations@abcgroup.com,clientservices@bigbank.com,Request for Cross-Entity AU Transfer Processing,,"[{'filename': 'body_image.pdf', 'content_type'...",pdf_image,,,AU Transfer-Cross-Entity Transfer,AU Transfer-Cross-Entity Transfer,,2500000.0,,,,
907,client.operations@businesscorp.com,transfersupport@bankcorp.com,Request to Initiate Split Transfer Under Exist...,,"[{'filename': 'body_image.pdf', 'content_type'...",pdf_image,,,AU Transfer-Split Transfer Request,AU Transfer-Split Transfer Request,,1200000.0,,,,
908,client.notifications@domain.com,bank.operations@bankdomain.com,Clarification Needed on Reallocation Fees Duri...,,"[{'filename': 'body_image.pdf', 'content_type'...",pdf_image,,,Closing Notice-Reallocation Fees,Closing Notice-Reallocation Fees,,,,0.0,,
