In [22]:
import pandas as pd
from docx import Document
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
# Initialize the LEGAL-BERT-SMALL model and tokenizer
model_name = "nlpaueb/legal-bert-small-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [5]:
def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

In [3]:
# Function Summarise/Extract text retaining to royaltyrates

def clean_and_extract_royalty(text):
    """
    Cleans the text and extracts sentences related to 'royalty'.
    
    Parameters:
        text (str): The input legal contract text.
    
    Returns:
        str: Cleaned and extracted text related to 'royalty'.
    """
    
    # Remove newline and other extra characters
    cleaned_text = re.sub(r'[\n\t]', ' ', text)
    
    # Tokenize the document into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', cleaned_text)
    
    # Extract sentences that contain the word 'royalty'
    royalty_related_sentences = [sentence for sentence in sentences if re.search(r'\broyalty\b', sentence, re.I)]
    
    # Combine the sentences to form the summary
    royalty_summary = ' '.join(royalty_related_sentences)
    
    return royalty_summary


In [6]:
file_path = '/home/kprasath/DS/NLPInternship/txt_data/ref_1_txt/11206.txt'
text_1 = read_txt(file_path)


In [7]:
cleaned_and_extracted_text = clean_and_extract_royalty(text_1)
cleaned_and_extracted_text

'The Jury further awarded damages to Tercica and GNE of $7.5 million and a royalty of 15% of IPLEX™ sales through December\xa06, 2006 (the “Damages”);    Whereas, legal proceedings among the Parties (and also Avecia Limited, a company incorporated under the laws of England and Wales, whose registered office is at PO Box 42, Hexagon Tower, Blackley, Manchester M9 8ZS, United Kingdom (“Avecia”)) and relating to European Patent (UK) No. 0 571 417 have been commenced in the United Kingdom in the High Court of Justice under action numbers HC 04 C 03940 and HC 05 C 00415 (the “UK Proceedings”);    Whereas, the Parties have determined that it is in their mutual interest to avoid the expense, distraction, and uncertainty of further litigation and have therefore agreed to conclude and resolve all of their disputes under the Lawsuit and UK Proceedings and settle and consent to judgment of such Lawsuit and UK Proceedings pursuant to the terms and conditions of this Agreement, the Consent Judgment

In [16]:
def clean_and_extract_royalty(text):
    """
    Cleans the text and extracts sentences related to 'royalty' for legal contracts.
    
    Parameters:
        text (str): The input legal contract text.
    
    Returns:
        str: Cleaned and extracted text related to 'royalty'.
    """
    # Remove special characters like non-breaking spaces
    cleaned_text = text.replace('\xa0', ' ')
    cleaned_text = text.replace('\xa07', ' ')
    # Standardize quotation marks
    cleaned_text = cleaned_text.replace("“", '"').replace("”", '"')
    
    # Remove or replace other special characters if necessary
    cleaned_text = cleaned_text.replace("™", " ")  # Example: you can replace ™ with TM if your BERT model can't handle ™
    

    cleaned_text = re.sub(r'[\n\t]', ' ', text)

    # Remove extra white spaces
    cleaned_text = re.sub(' +', ' ', cleaned_text)
    
    # Tokenize the document into sentences using NLTK's Punkt tokenizer
    sentences = sent_tokenize(cleaned_text)
    
    # Extract sentences that contain the word 'royalty' and are likely to be legally relevant
    royalty_related_sentences = [sentence for sentence in sentences if re.search(r'\broyalty\b', sentence, re.I)]
    
    # Combine the sentences to form the summary
    royalty_summary = ' '.join(royalty_related_sentences)
    
    return royalty_summary


In [17]:
cleaned_and_extracted_text = clean_and_extract_royalty(text_1)
cleaned_and_extracted_text

'The Jury further awarded damages to Tercica and GNE of $7.5 million and a royalty of 15% of IPLEX™ sales through December\xa06, 2006 (the “Damages”); Whereas, legal proceedings among the Parties (and also Avecia Limited, a company incorporated under the laws of England and Wales, whose registered office is at PO Box 42, Hexagon Tower, Blackley, Manchester M9 8ZS, United Kingdom (“Avecia”)) and relating to European Patent (UK) No. 1.7 “ALS Royalty” has the meaning assigned to it in Section\xa07.1(a)(iii). Sublicensee shall also include any Third Party who purchases its supply of Product, in finished form from Insmed, its Affiliates or Sublicensee for resale into the market, where, as a partial or full consideration for such purchase, such Third Party has a payment obligation to Insmed, its Affiliates or Sublicensee that is a percentage of its net sales, including without limitation a royalty obligation. Subject to the terms and conditions of this Agreement, including without limitation

## Running BERT

In [18]:
%%time
inputs = tokenizer(cleaned_and_extracted_text, padding=True, truncation=True, return_tensors="pt")


CPU times: user 14.1 ms, sys: 948 µs, total: 15.1 ms
Wall time: 20 ms


In [21]:
%%time
# Get embeddings
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state


CPU times: user 981 ms, sys: 10.4 ms, total: 992 ms
Wall time: 143 ms


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Create a query embedding for "royalty rates"
query_input = tokenizer("royalty rates", return_tensors="pt")
with torch.no_grad():
    query_output = model(**query_input)
    query_embedding = query_output.last_hidden_state.mean(dim=1)

# Calculate similarity scores
similarity_scores = cosine_similarity(embeddings.mean(dim=1), query_embedding)

# Extract top sentences (This is a simplified example; you may need more complex logic)
top_sentences = [sentences[i] for i in similarity_scores.argsort()[::-1][:5]]


NameError: name 'sentences' is not defined