In [3]:
import re

with open('file3.txt', 'r') as file:
    text = file.read()


legal_agreements = re.findall(r'Legal Agreement\n\n(.*?)\n\n', text, re.DOTALL)
legal_paragraphs = re.findall(r'Legal Agreement\n\n.*?\n\n(.*?)\n\n', text, re.DOTALL)

for i, (agreement, paragraph) in enumerate(zip(legal_agreements, legal_paragraphs)):
    print(f"Legal Agreement {i + 1}:\n{agreement}\n")
    print(f"Corresponding Paragraph:\n{paragraph}\n")


Legal Agreement 1:
Payments shall be made to such address as may from time to time be designated by any holder.
The undersigned and all other parties to this note, whether as endorsers, guarantors or sureties, shall remain fully bound until this note is paid and waive demand, presentment and protest and all notices thereto and further agree to remain bound, notwithstanding any extension, modification, waiver, or other indulgence or discharge or release of any obligor hereunder or exchange, substitution, or release of any collateral granted as security for this note. No modification or indulgence by any holder hereof shall be binding unless in writing; and any indulgence on any one occasion shall not be an indulgence for any other or future occasion. 

Corresponding Paragraph:

The rights of any holder hereof shall be cumulative and not necessarily successive. This note shall take effect as a sealed instrument and be governed and enforced in accordance with the laws of _________________

<b> Model-1 Existing Model

In [8]:

import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
#Initialize spaCy
nlp=spacy.load('en_core_web_sm')
#Mortgage-related training data
sentences=[
"In this Mortgage Deed, Bank of America, the Lender, provides a the loan of $500,000 to John Doe, the Borrower, secured by the property at 123 Elm St.",]
#Corresponding named entity annotations (simplified)
#0 for 'Other', 1 for 'Person', 2 for Property Address', 3 for 'Bank (Lender)', 4 for Loan Amount", 5 for 'Document Type'
labels =[ [0, 0, 5, 5, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0],]
#Tokenization
token_lists = [list(nlp(sentence)) for sentence in sentences]
#Validate token and label lengths
for idx, tokens in enumerate(token_lists):
  assert len(tokens)== len(labels[idx]),f" Mismatch in sentence {idx + 1}, Tokens: {len(tokens)}, Labels: {len(labels[idx])}"
#Convert token vectors to embeddings
embeddings = [np.array([token.vector for token in token_list]) for token_list in token_lists]
embeddings = [torch.tensor(emb) for emb in embeddings]
X_train = torch.cat(embeddings, dim=0)
Y_train = torch.tensor([label for sublist in labels for label in sublist])


In [18]:
class SimpleNERModel(nn.Module):
  def __init__(self, input_dim, num_labels):
    super(SimpleNERModel, self).__init__()
    self.fc= nn.Linear(input_dim, num_labels)
  def forward(self, x):
    return self.fc(x)


In [20]:
# Initialize and train the model
input_dim=X_train.size(1)
num_labels = 6 # 'Other', 'Person', 'Property Address'
model = SimpleNERModel(input_dim, num_labels)
criterion = nn.CrossEntropyLoss()
optimizer =  torch.optim.Adam(model.parameters(), lr=0.01)


In [22]:
# Train the model (simplified)
for epoch in range(100):
  optimizer.zero_grad()
  outputs= model(X_train)
  loss = criterion(outputs, Y_train)
  loss.backward()
  optimizer.step()
  # Predict named entities and get confidence scores for a new sentence
new_sentence = "This Deed of Trust was made between David Lee as Trustee and Michelle King as Beneficiary for 789 Oak Rd with a sum of $250,000."
new_doc = nlp(new_sentence)
new_embeddings = torch.tensor([token.vector for token in new_doc])
with torch.no_grad():
  predictions = model(new_embeddings)
  softmax_outputs = F.softmax(predictions, dim=1)
  predicted_classes = torch.argmax(softmax_outputs, dim=1)
  confidence_scores = torch.max(softmax_outputs, dim=1)[0]
label_map = {0: 'Other', 1: 'Person', 2: 'Property Address', 3:'Bank (Lender)', 4: 'Loan Amount', 5: 'Document Type'}
for token, label, confidence in zip(new_doc, predicted_classes, confidence_scores):
    print(f"Token: {token}, Label: {label_map[label.item()]}, Confidence: {confidence.item():4f}")


Token: This, Label: Other, Confidence: 0.874897
Token: Deed, Label: Other, Confidence: 0.938659
Token: of, Label: Bank (Lender), Confidence: 0.642777
Token: Trust, Label: Bank (Lender), Confidence: 0.637639
Token: was, Label: Loan Amount, Confidence: 0.401456
Token: made, Label: Other, Confidence: 0.972322
Token: between, Label: Other, Confidence: 0.963456
Token: David, Label: Person, Confidence: 0.591018
Token: Lee, Label: Bank (Lender), Confidence: 0.962334
Token: as, Label: Property Address, Confidence: 0.506807
Token: Trustee, Label: Bank (Lender), Confidence: 0.701529
Token: and, Label: Person, Confidence: 0.803405
Token: Michelle, Label: Bank (Lender), Confidence: 0.802453
Token: King, Label: Bank (Lender), Confidence: 0.867991
Token: as, Label: Bank (Lender), Confidence: 0.791308
Token: Beneficiary, Label: Other, Confidence: 0.609886
Token: for, Label: Property Address, Confidence: 0.422601
Token: 789, Label: Other, Confidence: 0.527881
Token: Oak, Label: Property Address, Confi

<b> Model2

In [23]:
import tensorflow as tf
import numpy as np

# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# Mortgage-related training data
sentences = [
    "In this Mortgage Deed, Bank of America, the Lender, provides a the loan of $500,000 to John Doe, the Borrower, secured by the property at 123 Elm St.",
]

# Corresponding named entity annotations (simplified)
# 0 for 'Other', 1 for 'Person', 2 for Property Address', 3 for 'Bank (Lender)', 4 for Loan Amount", 5 for 'Document Type'
labels = [[0, 0, 5, 5, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0],]

# Tokenization
token_lists = [list(nlp(sentence)) for sentence in sentences]

# Validate token and label lengths
for idx, tokens in enumerate(token_lists):
    assert len(tokens) == len(labels[idx]), f" Mismatch in sentence {idx + 1}, Tokens: {len(tokens)}, Labels: {len(labels[idx])}"

# Convert token vectors to embeddings
embeddings = [np.array([token.vector for token in token_list]) for token_list in token_lists]
embeddings = [tf.convert_to_tensor(emb, dtype=tf.float32) for emb in embeddings]
X_train = tf.concat(embeddings, axis=0)
Y_train = tf.convert_to_tensor([label for sublist in labels for label in sublist], dtype=tf.int32)

class SimpleNERModel(tf.keras.Model):
    def __init__(self, num_labels):
        super(SimpleNERModel, self).__init__()
        self.fc = tf.keras.layers.Dense(num_labels)

    def call(self, inputs):
        return self.fc(inputs)
# Initialize and train the model
input_dim = X_train.shape[1]
num_labels = 6  # 'Other', 'Person', 'Property Address'
model = SimpleNERModel(num_labels)
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# Train the model (simplified)
for epoch in range(100):
    with tf.GradientTape() as tape:
        outputs = model(X_train)
        loss = criterion(Y_train, outputs)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

# Predict named entities and get confidence scores for a new sentence
new_sentence = "This Deed of Trust was made between David Lee as Trustee and Michelle King as Beneficiary for 789 Oak Rd with a sum of $250,000."
new_doc = nlp(new_sentence)
new_embeddings = tf.convert_to_tensor([token.vector for token in new_doc], dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
    predictions = model(new_embeddings)
    softmax_outputs = tf.nn.softmax(predictions, axis=1)
    predicted_classes = tf.argmax(softmax_outputs, axis=1)
    confidence_scores = tf.reduce_max(softmax_outputs, axis=1)

label_map = {0: 'Other', 1: 'Person', 2: 'Property Address', 3:'Bank (Lender)', 4: 'Loan Amount', 5: 'Document Type'}
for token, label, confidence in zip(new_doc, predicted_classes, confidence_scores):
    print(f"Token: {token}, Label: {label_map[label.numpy()]}, Confidence: {confidence.numpy():.4f}")


Token: This, Label: Other, Confidence: 0.5528
Token: Deed, Label: Other, Confidence: 0.7948
Token: of, Label: Loan Amount, Confidence: 0.7387
Token: Trust, Label: Other, Confidence: 0.9359
Token: was, Label: Person, Confidence: 0.6197
Token: made, Label: Other, Confidence: 0.8588
Token: between, Label: Other, Confidence: 0.8281
Token: David, Label: Person, Confidence: 0.6068
Token: Lee, Label: Bank (Lender), Confidence: 0.9006
Token: as, Label: Property Address, Confidence: 0.9094
Token: Trustee, Label: Other, Confidence: 0.6196
Token: and, Label: Property Address, Confidence: 0.4198
Token: Michelle, Label: Bank (Lender), Confidence: 0.6985
Token: King, Label: Bank (Lender), Confidence: 0.9174
Token: as, Label: Bank (Lender), Confidence: 0.6726
Token: Beneficiary, Label: Other, Confidence: 0.7662
Token: for, Label: Other, Confidence: 0.3868
Token: 789, Label: Other, Confidence: 0.6133
Token: Oak, Label: Property Address, Confidence: 0.8091
Token: Rd, Label: Other, Confidence: 0.6973
To

<b> Model-3

In [25]:
import tensorflow as tf
import numpy as np

# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# Mortgage-related training data
sentences = [
    "In this Mortgage Deed, Bank of America, the Lender, provides a the loan of $500,000 to John Doe, the Borrower, secured by the property at 123 Elm St.",
]

# Corresponding named entity annotations (simplified)
# 0 for 'Other', 1 for 'Person', 2 for Property Address', 3 for 'Bank (Lender)', 4 for Loan Amount", 5 for 'Document Type'
labels = [[0, 0, 5, 5, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0],]

# Tokenization
token_lists = [list(nlp(sentence)) for sentence in sentences]

# Validate token and label lengths
for idx, tokens in enumerate(token_lists):
    assert len(tokens) == len(labels[idx]), f" Mismatch in sentence {idx + 1}, Tokens: {len(tokens)}, 
    Labels: {len(labels[idx])}"

# Convert token vectors to embeddings
embeddings = [np.array([token.vector for token in token_list]) for token_list in token_lists]
embeddings = [tf.convert_to_tensor(emb, dtype=tf.float32) for emb in embeddings]
X_train = tf.concat(embeddings, axis=0)
Y_train = tf.convert_to_tensor([label for sublist in labels for label in sublist], dtype=tf.int32)

class SimpleNERModel(tf.keras.Model):
    def __init__(self, num_labels):
        super(SimpleNERModel, self).__init__()
        self.fc = tf.keras.layers.Dense(num_labels)

    def call(self, inputs):
        return self.fc(inputs)
# Initialize and train the model
input_dim = X_train.shape[1]
num_labels = 6  # 'Other', 'Person', 'Property Address'
model = SimpleNERModel(num_labels)
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)  # Increase the learning rate

# Train the model (increased epochs)
num_epochs = 300
for epoch in range(num_epochs):
    with tf.GradientTape() as tape:
        outputs = model(X_train)
        loss = criterion(Y_train, outputs)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

# Predict named entities and get confidence scores for a new sentence
new_sentence = "This Deed of Trust was made between David Lee as Trustee and Michelle King as Beneficiary for 789 Oak Rd with a sum of $250,000."
new_doc = nlp(new_sentence)
new_embeddings = tf.convert_to_tensor([token.vector for token in new_doc], dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
    predictions = model(new_embeddings)
    softmax_outputs = tf.nn.softmax(predictions, axis=1)
    predicted_classes = tf.argmax(softmax_outputs, axis=1)
    confidence_scores = tf.reduce_max(softmax_outputs, axis=1)

label_map = {0: 'Other', 1: 'Person', 2: 'Property Address', 3:'Bank (Lender)', 4: 'Loan Amount', 5: 'Document Type'}
for token, label, confidence in zip(new_doc, predicted_classes, confidence_scores):
    print(f"Token: {token}, Label: {label_map[label.numpy()]}, Confidence: {confidence.numpy():.4f}")


Token: This, Label: Other, Confidence: 0.9941
Token: Deed, Label: Other, Confidence: 1.0000
Token: of, Label: Bank (Lender), Confidence: 0.5996
Token: Trust, Label: Other, Confidence: 0.9987
Token: was, Label: Person, Confidence: 0.5541
Token: made, Label: Other, Confidence: 1.0000
Token: between, Label: Other, Confidence: 0.9981
Token: David, Label: Person, Confidence: 0.6779
Token: Lee, Label: Bank (Lender), Confidence: 0.9985
Token: as, Label: Property Address, Confidence: 0.9134
Token: Trustee, Label: Bank (Lender), Confidence: 0.5764
Token: and, Label: Person, Confidence: 0.7364
Token: Michelle, Label: Bank (Lender), Confidence: 0.9999
Token: King, Label: Bank (Lender), Confidence: 0.9974
Token: as, Label: Other, Confidence: 0.9538
Token: Beneficiary, Label: Other, Confidence: 0.8750
Token: for, Label: Other, Confidence: 0.6891
Token: 789, Label: Other, Confidence: 0.8805
Token: Oak, Label: Property Address, Confidence: 1.0000
Token: Rd, Label: Other, Confidence: 0.9995
Token: wit

<b> Experimentation

In [27]:
import tensorflow as tf
import numpy as np

# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# Mortgage-related training data
sentences = [
    "In this Mortgage Deed, Bank of America, the Lender, provides a the loan of $500,000 to John Doe, the Borrower, secured by the property at 123 Elm St.",
]

# Corresponding named entity annotations (simplified)
# 0 for 'Other', 1 for 'Person', 2 for Property Address', 3 for 'Bank (Lender)', 4 for Loan Amount", 5 for 'Document Type'
labels = [[0, 0, 5, 5, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0],]

# Tokenization
token_lists = [list(nlp(sentence)) for sentence in sentences]

# Validate token and label lengths
for idx, tokens in enumerate(token_lists):
    assert len(tokens) == len(labels[idx]), f" Mismatch in sentence {idx + 1}, Tokens: {len(tokens)}, Labels: {len(labels[idx])}"

# Convert token vectors to embeddings
embeddings = [np.array([token.vector for token in token_list]) for token_list in token_lists]
embeddings = [tf.convert_to_tensor(emb, dtype=tf.float32) for emb in embeddings]
X_train = tf.concat(embeddings, axis=0)
Y_train = tf.convert_to_tensor([label for sublist in labels for label in sublist], dtype=tf.int32)

class SimpleNERModel(tf.keras.Model):
    def __init__(self, num_labels):
        super(SimpleNERModel, self).__init__()
        self.fc = tf.keras.layers.Dense(num_labels)

    def call(self, inputs):
        return self.fc(inputs)
# Initialize and train the model
input_dim = X_train.shape[1]
num_labels = 6  # 'Other', 'Person', 'Property Address'
model = SimpleNERModel(num_labels)
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)  # Increase the learning rate

# Train the model (increased epochs)
num_epochs = 300
for epoch in range(num_epochs):
    with tf.GradientTape() as tape:
        outputs = model(X_train)
        loss = criterion(Y_train, outputs)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

# Predict named entities and get confidence scores for a new sentence
new_sentence = "A loan agreement is a very complex document that can protect the two parties involved. In most cases the lender creates the loan agreement, which means the burden of including all of the terms for the agreement falls on the lending party. Unless you have created loan agreements before, you will likely want to make sure that you completely understand all of the components so you do not leave out anything that can protect you during the lifetime of the loan. This guide can help you create a solid loan agreement and understand more about the mechanics behind it."
new_doc = nlp(new_sentence)
new_embeddings = tf.convert_to_tensor([token.vector for token in new_doc], dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
    predictions = model(new_embeddings)
    softmax_outputs = tf.nn.softmax(predictions, axis=1)
    predicted_classes = tf.argmax(softmax_outputs, axis=1)
    confidence_scores = tf.reduce_max(softmax_outputs, axis=1)

label_map = {0: 'Other', 1: 'Person', 2: 'Property Address', 3:'Bank (Lender)', 4: 'Loan Amount', 5: 'Document Type'}
for token, label, confidence in zip(new_doc, predicted_classes, confidence_scores):
    print(f"Token: {token}, Label: {label_map[label.numpy()]}, Confidence: {confidence.numpy():.4f}")


Token: A, Label: Other, Confidence: 1.0000
Token: loan, Label: Property Address, Confidence: 0.7109
Token: agreement, Label: Other, Confidence: 1.0000
Token: is, Label: Other, Confidence: 0.9907
Token: a, Label: Other, Confidence: 0.9974
Token: very, Label: Document Type, Confidence: 0.9428
Token: complex, Label: Document Type, Confidence: 0.8363
Token: document, Label: Other, Confidence: 1.0000
Token: that, Label: Bank (Lender), Confidence: 0.9790
Token: can, Label: Document Type, Confidence: 0.9171
Token: protect, Label: Other, Confidence: 0.9989
Token: the, Label: Other, Confidence: 1.0000
Token: two, Label: Property Address, Confidence: 0.8184
Token: parties, Label: Other, Confidence: 0.9988
Token: involved, Label: Other, Confidence: 0.9772
Token: ., Label: Person, Confidence: 0.9160
Token: In, Label: Other, Confidence: 1.0000
Token: most, Label: Document Type, Confidence: 1.0000
Token: cases, Label: Property Address, Confidence: 0.4224
Token: the, Label: Other, Confidence: 0.9313
