# Legal Document Analyze
- Reading PDF, DOCX, and TXT contract files
- Cleaning and segmenting the extracted text
- Preparing data for NLP tasks like clause extraction

In [12]:
# Install required libraries
#!pip install pypdf python-docx

In [1]:
# Import libraries
import os
import re
import docx
import PyPDF2

import spacy
from collections import Counter



In [2]:
file_path = '/Users/jeanstibel/Documents/AI Classes/Projects/03 Project/Resources/SampleContract-Shuttle.pdf'

In [3]:
# Extract text from PDF, DOCX, or TXT
def extract_text(file_path):
    if file_path.endswith('.pdf'):
        reader = PyPDF2.PdfReader(file_path)
        return "\n".join([page.extract_text() for page in reader.pages])
    elif file_path.endswith('.docx'):
        doc = docx.Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])
    elif file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        raise ValueError("Unsupported file format")

In [4]:
print(extract_text(file_path))

 
Page 1 Sample Contract 
 
Contract No.___________ 
PROFESSIONAL SERVICES AGREEMENT  
 
 THIS AGREEMENT made and entered into this _______day of                       , 20    
 by and between the SANTA 
CRUZ COUNTY REGIONAL TRANSPORTATION COMMISSION, hereinafter called COMMISSION, and ________    
____, hereinafter called CONSULTANT for __________________ (services/project name).    
 
1. DUTIES .  
A. CONSULTANT agrees to exercise special skill to  accomplish the following results in a manner 
reasonably satisfactory to COMMISSION: ______________________________, as specified in Exhibit 
A: Scope of Services , which by this reference is incorporated herein. 
 
B. CONSULTANT shall provide the personnel listed belo w to perform the above-specified services, which 
persons are hereby designated as key personnel under this Agreement.   
 Name     F i r m     F u n c t i o n  
         P r i n c i p a l  i n  C h a r g e  
         P r o j e c t  M a n a g e r  
 
C. No person named in pa

In [5]:
contract_text = extract_text(file_path)
contract_text

' \nPage 1 Sample Contract \n \nContract No.___________ \nPROFESSIONAL SERVICES AGREEMENT  \n \n THIS AGREEMENT made and entered into this _______day of                       , 20    \n by and between the SANTA \nCRUZ COUNTY REGIONAL TRANSPORTATION COMMISSION, hereinafter called COMMISSION, and ________    \n____, hereinafter called CONSULTANT for __________________ (services/project name).    \n \n1. DUTIES .  \nA. CONSULTANT agrees to exercise special skill to  accomplish the following results in a manner \nreasonably satisfactory to COMMISSION: ______________________________, as specified in Exhibit \nA: Scope of Services , which by this reference is incorporated herein. \n \nB. CONSULTANT shall provide the personnel listed belo w to perform the above-specified services, which \npersons are hereby designated as key personnel under this Agreement.   \n Name     F i r m     F u n c t i o n  \n         P r i n c i p a l  i n  C h a r g e  \n         P r o j e c t  M a n a g e r  \n \nC

In [14]:
# Remove numbers and non-alphabetic characters from the document text
cleaned_text = re.sub(r'[^a-zA-Z\s]', '', contract_text)


print(cleaned_text)

 
Page  Sample Contract 
 
Contract No 
PROFESSIONAL SERVICES AGREEMENT  
 
 THIS AGREEMENT made and entered into this day of                            
 by and between the SANTA 
CRUZ COUNTY REGIONAL TRANSPORTATION COMMISSION hereinafter called COMMISSION and     
 hereinafter called CONSULTANT for  servicesproject name    
 
 DUTIES   
A CONSULTANT agrees to exercise special skill to  accomplish the following results in a manner 
reasonably satisfactory to COMMISSION  as specified in Exhibit 
A Scope of Services  which by this reference is incorporated herein 
 
B CONSULTANT shall provide the personnel listed belo w to perform the abovespecified services which 
persons are hereby designated as key personnel under this Agreement   
 Name     F i r m     F u n c t i o n  
         P r i n c i p a l  i n  C h a r g e  
         P r o j e c t  M a n a g e r  
 
C No person named in paragraph B of th is Section or his or her successor shall be removed or replaced by 
CONSULTANT nor shall

In [15]:
# Load the small English language model for spacy
nlp = spacy.load("en_core_web_sm")

In [16]:
from spacy import displacy

# Analyze the article with spacy
doc = nlp(contract_text)

# Render NER visualization with displacy to determine entities for extraction
displacy.render(doc, style='ent')

In [17]:
import pandas as pd

# Extract entities from the doc object
entities = [ent.text for ent in doc.ents]

# Create a variable, most_freq_entities, that stores the most frequent entities 
# using the most_common() function from the Counter module.
most_freq_entities = Counter(entities).most_common()

# Print the first 10 most frequent entities
print(most_freq_entities[:10])

# Use list comprehensions to retrieve each entity and the number of occurrences for each entity in separate lists.
entity = [most_freq_entities[i][0] for i, _ in enumerate(most_freq_entities)]
frequency = [most_freq_entities[i][1] for i, _ in enumerate(most_freq_entities)]

# Create a DataFrame that has columns to hold each entity and the number of times each entity appears.
common_entities_df = pd.DataFrame(
    {
        'entity':entity,
        'frequency':frequency
    }
)

# Sort the DataFrame
common_entities_df.sort_values(by=['frequency'], ascending=False).reset_index(drop=True)

# Display the first ten rows. 
common_entities_df.head(10)

[('COMMISSION', 60), ('Agreement', 18), ('2', 6), ('1', 5), ('3', 5), ('1523', 4), ('Santa Cruz', 4), ('4', 4), ('CONSULTANT', 4), ('third', 4)]


Unnamed: 0,entity,frequency
0,COMMISSION,60
1,Agreement,18
2,2,6
3,1,5
4,3,5
5,1523,4
6,Santa Cruz,4
7,4,4
8,CONSULTANT,4
9,third,4


In [50]:
# Display the last ten rows. 
common_entities_df.tail(10)

Unnamed: 0,entity,frequency
133,NONASSIGNMENT,1
134,22,1
135,KICKBACKS OR,1
136,NAME,1
137,Luis Mendez,1
138,CA 95060 \n24,1
139,COMPLETE AGREEMENT\n \nA. AGREEMENT :,1
140,C. COMPLETE,1
141,SANTA CRUZ,1
142,RTC Fiscal & Contract,1


In [18]:
# Use the separate_punc function to remove the puncutation. 
def separate_punc(holmes_text):
    """
    Retrieves all the words in the text without punctuation 

    Args:
        text (str): The input text from which words are extracted witout punctuation.

    Returns:
        list: A list of words

    """
    # Create a list comprehension to get only the tokens, i.e., words in the text.
    return [token.text.lower() for token in nlp(holmes_text) \
            if token.text not in '\n\n \n\n\n!"“”-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n']

In [19]:
# Clean and tokenize the text using the separate_punc function.
tokens = separate_punc(contract_text)

In [20]:
# Get the length of the tokens list.
len(tokens)

5086

In [21]:
# Look over the tokems to make sure all theo punctuation has been reomved.  
# Some punctuation wasn't removed
print(tokens[:300])

['page', '1', 'sample', 'contract', 'contract', 'no', 'professional', 'services', 'agreement', ' \n \n ', 'this', 'agreement', 'made', 'and', 'entered', 'into', 'this', 'day', 'of', '                      ', '20', '   \n ', 'by', 'and', 'between', 'the', 'santa', 'cruz', 'county', 'regional', 'transportation', 'commission', 'hereinafter', 'called', 'commission', 'and', '   \n', 'hereinafter', 'called', 'consultant', 'for', 'services', 'project', 'name', '   \n \n', '1', 'duties', 'a.', 'consultant', 'agrees', 'to', 'exercise', 'special', 'skill', 'to', 'accomplish', 'the', 'following', 'results', 'in', 'a', 'manner', 'reasonably', 'satisfactory', 'to', 'commission', 'as', 'specified', 'in', 'exhibit', 'a', 'scope', 'of', 'services', 'which', 'by', 'this', 'reference', 'is', 'incorporated', 'herein', 'b.', 'consultant', 'shall', 'provide', 'the', 'personnel', 'listed', 'belo', 'w', 'to', 'perform', 'the', 'above', 'specified', 'services', 'which', 'persons', 'are', 'hereby', 'designated