In [7]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    # Open the provided PDF file
    pdf_document = fitz.open(pdf_path)
    full_text = ""

    # Extract text from all pages
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        full_text += page.get_text("text")  # Get text from each page

    return full_text

def extract_schedule_h(pdf_path, section):

    pdf_text = extract_text_from_pdf(pdf_path)
    
    schd_start = ''
    schd_end = ''
    if section == 'h1':
        schd_start = "H.1 - Corporate Loan Data Schedule"
        schd_end = "H.2 – Commercial Real Estate Schedule"
    else:
        schd_start = "H.2 – Commercial Real Estate Schedule"
        schd_end = "H.3 – Line of Business Schedule"

    # Find the start of the Schedule H section in the extracted text
    schedule_h_start = pdf_text.find(schd_start)
    
    if schedule_h_start == -1:
        return "Schedule H not found in the document."

    # Extract the text starting from "Schedule H" and onward
    schedule_h_text = pdf_text[schedule_h_start:schedule_h_start + 1000000]
    
    schedule_h_end = schedule_h_text.find(schd_end)
    
    if schedule_h_end != -1:
        # If "Schedule I" is found, assume it's the end of Schedule H section
        schedule_h_text = schedule_h_text[:schedule_h_end]
    
    # Return the extracted Schedule H section
    return schedule_h_text.strip()
        
pdf_path = 'FR_Y-14Q20240331_i.pdf'
schedule_h_data = extract_schedule_h(pdf_path,"h1")
print(schedule_h_data)

H.1 - Corporate Loan Data Schedule 
 
The Corporate Loan Data Schedule collects loan level detail on corporate loans and leases. The data 
collection has two sections: (1) Loan and Obligor Description section (Fields 1 through 51, and Fields 
83 through 108), which collects information related to the obligor and the loan itself; and (2) Obligor 
Financial Data section (Fields 52 through 82), which collects data related to the financial health of 
the obligor or the entity that is the primary source of repayment for the loan. Both sections are 
completed at a loan level detail.  
A. Loan Population 
 
The loan population includes corporate loans and leases that are held for investment (HFI) (as 
defined in the FR Y-9C, Schedule HC-C General Instructions) and held for sale (HFS) as of the report 
date.  Include HFI and HFS loans that the holding company has elected to report at fair value under 
the fair value option. Exclude all loans and leases classified as trading (reportable on the 

In [1]:
import re
import json
import pdfplumber
from transformers import pipeline
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def validate_psrlei(psr, obligor, lei):
    """
    Validate Primary Source of Repayment LEI (PSRLEI) based on FRY14Q rules using LLM.
    :param psr: Primary Source of Repayment (Field 50)
    :param obligor: Obligor (Field 2 or 4)
    :param lei: LEI provided (Field 112)
    :return: Validation result (Valid or Error Message)
    """
    generator = pipeline("text-generation", model="gpt-3.5-turbo")
    
    prompt = f"""
    Given the following inputs:
    - Primary Source of Repayment (PSR): {psr}
    - Obligor: {obligor}
    - LEI: {lei}
    
    Apply the following validation rules:
    1. If PSR = Obligor, LEI must be blank.
    2. If PSR ≠ Obligor, LEI must be a valid 20-character alphanumeric code or 'NA'.
    3. If LEI is invalid, return an appropriate error message.
    
    Return 'Valid' if all rules are met, otherwise return the specific error message.
    """
    
    result = generator(prompt, max_length=100, do_sample=False)[0]['generated_text']
    return result

def extract_tables_from_pdf(pdf_path, schedule="H.1", start_text="H.1 - Corporate Loan Data Schedule", end_text="H.2 – Commercial Real Estate Schedule"):

    """Extracts tables only from the specified schedule (H.1) between start and end markers."""
    table_data = []
    with pdfplumber.open(pdf_path) as pdf:
        extract = False
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                if start_text in text:
                    extract = True
                if extract:
                    tables = page.extract_table()
                    if tables:
                        table_data.extend(tables)  # Collect all table rows
                if end_text in text:
                    extract = False
    return table_data

def clean_text(text):
    """Cleans extracted text by removing backslashes, forward slashes, and extra spaces."""
    if text:
        return re.sub(r"[\\/]+", "", text).strip()
    return ""

def process_table_data(table_data):
    """Processes extracted table data into structured fields with complete descriptions."""
    fields = {}
    for row in table_data:
        if len(row) >= 5:  # Ensuring the row has enough columns
            field_no, field_name, technical_name, description, constraints = map(clean_text, row[:5])
            
            if field_name and technical_name:  # Ensure valid field entries
                if field_name in fields:  # If field already exists, append to description (handles multi-line cases)
                    fields[field_name]["description"] += " " + description
                else:
                    fields[field_name] = {
                        "technical_name": technical_name,
                        "description": description,
                        "constraints": constraints,
                        "mandatory": True,  # Assuming all fields are mandatory
                        "type": "string"  # Defaulting to string; needs refinement
                    }
    return fields

def generate_rule_dictionary(fields):
    """Generates validation rules dynamically based on extracted fields."""
    rule_dict = {}
    for field, properties in fields.items():
        rule_dict[field] = {
            "type": properties["type"],
            "mandatory": properties["mandatory"],
            "description": properties["description"],
            "constraints": properties["constraints"]
        }
    return rule_dict

# Path to the uploaded PDF file
pdf_path = "FR_Y-14Q20240331_i.pdf"

# Extract structured table data from Schedule H.1 within defined section
pdf_table_data = extract_tables_from_pdf(pdf_path, schedule="H.1", start_text="H.1 - Corporate Loan Data Schedule", end_text="H.2 – Commercial Real Estate Schedule")

# Process extracted table data into structured fields
extracted_fields = process_table_data(pdf_table_data)

# Generate validation rule dictionary
rule_dictionary = generate_rule_dictionary(extracted_fields)

# Output generated rule dictionary
print(json.dumps(rule_dictionary, indent=4))


{
    "Field Name;\n(Technical Field\nName)": {
        "type": "string",
        "mandatory": true,
        "description": "Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description Description",
        "constraints": "Allowable Values"
    },
    "Customer ID\n(CustomerID)": {
        "type": "string",
        "mandatory": true,
        "description": "Report the unique internal identifier for the customer relationship\nunder which the obligor's exposure is aggre

In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np
import nltk
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK sentence tokenizer
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Initialize the model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def summarize_with_embeddings(text):
    """Generate embeddings for each sentence and return the most representative ones."""
    # Tokenize sentences properly
    sentences = sent_tokenize(text)

    # Generate embeddings for each sentence
    sentence_embeddings = model.encode(sentences)

    # Compute the overall document embedding as the mean of all sentence embeddings
    doc_embedding = np.mean(sentence_embeddings, axis=0).reshape(1, -1)

    # Compute cosine similarity between each sentence embedding and the document embedding
    similarities = cosine_similarity(sentence_embeddings, doc_embedding).flatten()

    # Select the top 2 most representative sentences
    top_indices = similarities.argsort()[-2:][::-1]
    summary = ' '.join([sentences[i] for i in top_indices])

    return summary

# Example Text Input
text = """
Customer ID 
(CustomerID) 
CLCOM047 Report the unique internal identifier for the customer relationship 
under which the obligor's exposure is aggregated in the reporting 
entity's credit systems. Customer ID is a relationship concept 
under which multiple borrowers are aggregated because they have 
related risks, including, but not limited to parent/subsidiary 
relationships. For stand-alone or ultimate parent obligors, the 
Customer ID may be the same as the unique internal identifier for 
the obligor provided in Field 2. 
Must not contain a carriage 
return, line feed, comma or any 
unprintable character. 
"""

# Get the summary
summary_with_embeddings = summarize_with_embeddings(text)
print("Summary Based on Embeddings:")
print(summary_with_embeddings)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Summary Based on Embeddings:

Customer ID 
(CustomerID) 
CLCOM047 Report the unique internal identifier for the customer relationship 
under which the obligor's exposure is aggregated in the reporting 
entity's credit systems. Customer ID is a relationship concept 
under which multiple borrowers are aggregated because they have 
related risks, including, but not limited to parent/subsidiary 
relationships.
