In [1]:
from datasets import load_dataset
import pdfplumber
from tqdm import tqdm


In [2]:
dataset = load_dataset("TheAtticusProject/cuad")
print(dataset)

Resolving data files:   0%|          | 0/714 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['pdf'],
        num_rows: 511
    })
})


In [3]:
sample = dataset['train'][0].keys()
print(sample) # This prints the text of the contract

dict_keys(['pdf'])


In [4]:
first_pdf = dataset['train'][0]['pdf']
print(type(first_pdf))
print(first_pdf)

<class 'pdfplumber.pdf.PDF'>
<pdfplumber.pdf.PDF object at 0x000001777F5861D0>


In [5]:
all_contract_texts = []

for i in tqdm(range(100), desc="Extracting contracts"):
    pdf_obj = dataset['train'][i]['pdf']  # Get the PDF object for the i-th contract
    contract_text = ""                    # Start with an empty string for this contract
    for page in pdf_obj.pages:            # Loop through each page in the PDF
        page_text = page.extract_text()   # Try to extract text from the page
        if page_text:                     # Only add if there is text
            contract_text += page_text + "\n"
    all_contract_texts.append(contract_text) 

Extracting contracts: 100%|██████████| 100/100 [05:36<00:00,  3.36s/it]


In [15]:
print(all_contract_texts[1]) 
"\n"



Exhibit 10.33
Last Updated: April 6, 2007
CHASE AFFILIATE AGREEMENT
THIS AGREEMENT sets forth the terms and conditions agreed to between Chase Bank USA, N.A. (?Chase?) and you as an “Affiliate” in the Chase
Affiliate Program (the “Affiliate Program”). Once accepted into the Affiliate Program, an Affiliate can establish links from the Affiliate’s Website to
[Chase.com]. Chase will pay Affiliate a fee for each approved credit card account that originates from a link in Affiliate’s Website.
THIS IS A LEGAL AND CONTRACTUALLY BINDING AGREEMENT BETWEEN AFFILIATE AND CHASE. TO APPLY TO THE AFFILIATE
PROGRAM, YOU MUST COMPLETE AND SUBMIT THE AFFILIATE REGISTRATION FORM AND CLICK ON THE “AGREE” BUTTON BELOW
TO INDICATE YOUR WILLINGNESS TO BE BOUND TO CHASE BY THIS AGREEMENT. THIS AGREEMENT WILL TAKE EFFECT IF AND
WHEN CHASE REVIEWS AND ACCEPTS YOUR REGISTRATION FORM AND PROVIDES YOU NOTICE OF ACCEPTANCE. BY SUBMITTING
YOUR REGISTRATION FORM, AFFILIATE CERTIFIES THAT YOU HAVE READ AND UNDERSTAND

'\n'

In [7]:
print(len(all_contract_texts))

100


In [8]:
#Importing the Libraries that will be Helping us Preprocess the Dataset
import string 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [9]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ishaan.narayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishaan.narayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ishaan.narayan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
#Creating Objects to work on with 
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [11]:
preprocessed_Data = []

for text in all_contract_texts:
    text = text.lower()
    text = text.translate(str.maketrans('','',string.punctuation))
    
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    preprocessed_Data.append(tokens)
  

In [12]:
print(preprocessed_Data[0])

['datasheet', 'contract', 'understanding', 'atticus', 'dataset', 'cuad', 'imotivation', 'created', 'dataset', 'eg', 'team', 'research', 'group', 'behalf', 'entity', 'eg', 'company', 'institution', 'organization', 'atticus', 'project', 'nonprofit', 'organization', 'whose', 'mission', 'harness', 'power', 'ai', 'accelerate', 'accurate', 'efficient', 'contract', 'review', 'atticus', 'project', 'started', 'grassroots', 'movement', 'experienced', 'lawyer', 'public', 'company', 'leading', 'law', 'firm', 'aiming', 'achieve', 'highquality', 'lowcost', 'accurate', 'timely', 'contract', 'review', 'using', 'ai', 'officially', 'incorporated', 'california', 'nonprofit', 'public', 'benefit', 'corporation', 'january', '2020', 'b', 'fund', 'associated', 'grant', 'please', 'provide', 'name', 'grantor', 'grant', 'name', 'number', 'atticus', 'project', 'relies', '100', 'unpaid', 'volunteer', 'organized', 'around', 'single', 'mission', 'changing', 'legal', 'industry', 'leveraging', 'ai', 'c', 'purpose', 'd

In [13]:
import pickle

with open("preprocessed_contracts.pkl", "wb") as f:
    pickle.dump(preprocessed_Data, f)