# Part 1 - Corpus processing (legal text): tokenization and word counting

## Import Relevant Modules

In [1]:
import word_tokenizer
import zipfile
import os
import shutil
import re
from collections import Counter

## Extract Corpus Data

In [2]:
# Define the path to the zip file and the extraction directory
parent_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
zip_file_path = os.path.join(parent_path, 'CUAD_v1.zip')
extraction_dir = 'extracted_txt_files'

# Check if the zip file exists
if not os.path.exists(zip_file_path):
    print(f"Error: The file {zip_file_path} does not exist.")
else:
    # Delete the extraction directory if it exists
    if os.path.exists(extraction_dir):
        shutil.rmtree(extraction_dir)

    # Create the extraction directory
    os.makedirs(extraction_dir, exist_ok=True)

    # Extract only .txt files
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for file in zip_ref.namelist():
            if file.endswith('.txt'):
                zip_ref.extract(file, extraction_dir)

    # Verify the number of extracted .txt files, excluding README files
    extracted_files = []
    readme_files = []
    for root, dirs, files in os.walk(extraction_dir):
        for file in files:
            if file.endswith('.txt'):
                if "README" in file:
                    readme_files.append(os.path.join(root, file))
                else:
                    extracted_files.append(os.path.join(root, file))
    
    print(f"Number of extracted .txt files (excluding README): {len(extracted_files)}")
    print(f"Number of extracted README files: {len(readme_files)}")
    assert len(extracted_files) == 510, f"Expected 510 text files, but found {len(extracted_files)}"
    assert len(readme_files) == 1, f"Expected 1 README file, but found {len(readme_files)}"

Number of extracted .txt files (excluding README): 510
Number of extracted README files: 1


## Concatenate Files to form a Corpus

In [3]:
corpus = []
tokenizer = word_tokenizer.WordTokenizer()

for file_path in extracted_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        corpus.extend(tokenizer.tokenize(file.read()))

# update the tokenizer with the corpus
tokenizer.corpus = ' '.join(corpus)

# total words
total_words = len(corpus)

print(f"Tokens Found: Corpus created with {total_words} words.")

Tokens Found: Corpus created with 4087261 words.


In [4]:
word_counts: dict = tokenizer.countOccurrences(text=tokenizer.corpus)

# Number of Unique Words
num_types = len(word_counts)
print(f"Number of Types (i.e., unique words): {num_types}")

Number of Types (i.e., unique words): 45597


In [5]:
print(f"Type/Token Ratio: {num_types/total_words}")

Type/Token Ratio: 0.011155881652774315


## Record the Frequency of each Token in the Corpus

In [6]:
# Convert word_counts to a Counter object
word_counts_counter = Counter(word_counts)

# Delete the file if it exists
tokens_file_path = 'tokens.txt'
if os.path.exists(tokens_file_path):
    os.remove(tokens_file_path)

# Write the tokens and their frequencies to tokens.txt
with open(tokens_file_path, 'w', encoding='utf-8') as tokens_file:
    for token, frequency in word_counts_counter.most_common(): # desc order frequency
        tokens_file.write(f"{token}: {frequency}\n")

print("Tokens and their frequencies have been written to tokens.txt")

Tokens and their frequencies have been written to tokens.txt


## Analyze statistics about Tokens in the Corpus

### Extract First 20 Tokens (Words) onto a Text File

In [7]:
print(f"First 20 Sample word counts: {dict(list(word_counts_counter.items())[:20])}")

First 20 Sample word counts: {'CO': 202, 'BRANDING': 53, 'AND': 3761, 'ADVERTISING': 72, 'AGREEMENT': 2055, 'THIS': 1235, 'the': 239999, 'Agreement': 37020, 'is': 21544, 'made': 3865, 'as': 31637, 'of': 151815, 'June': 283, '21': 1275, '1999': 297, 'Effective': 2423, 'Date': 5406, 'by': 42050, 'and': 128998, 'between': 3492}


In [None]:
# Delete the file if it exists
output_file_path = 'output.txt'
if os.path.exists(output_file_path):
    os.remove(output_file_path)

# Write the entire tokenized corpus to output.txt
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write('\n'.join(corpus))

print("Entire tokenized corpus has been written to output.txt")


Entire tokenized corpus has been written to output.txt


### Number of Tokens Appearing Once Only

In [9]:
tokens_once = [word for word, count in word_counts_counter.items() if count == 1]
num_tokens_once = len(tokens_once)

print(f"Number of Tokens Appearing Once Only: {num_tokens_once}")

Number of Tokens Appearing Once Only: 15297


### Extract Only Words (Without Punctuations)

In [10]:
# Define a regular expression pattern to match words
pattern = re.compile(r'\b\w+\b')

# Filter the corpus to include only words
filtered_corpus = [word for word in tokenizer.corpus if pattern.match(word)]

print(f"Filtered Corpus Length (excluding punctuations): {len(filtered_corpus)}")

Filtered Corpus Length (excluding punctuations): 20710584


In [11]:
# List the top 20 most frequent words
word_counts: dict = tokenizer.countOccurrences(text=' '.join(filtered_corpus))
word_counts_counter = Counter(word_counts)
top_20_words = word_counts_counter.most_common(20)
print(f"Top 20 Most Frequent Words: {top_20_words}")

Top 20 Most Frequent Words: [('e', 2386519), ('t', 1838526), ('i', 1492772), ('n', 1476250), ('a', 1454829), ('o', 1436395), ('r', 1372297), ('s', 1135258), ('h', 757883), ('l', 746439), ('c', 690049), ('d', 672764), ('u', 497405), ('m', 446608), ('f', 432631), ('p', 423939), ('y', 343846), ('g', 327485), ('b', 257295), ('v', 185444)]


In [12]:
print(f"Type/Token Ratio: {len(filtered_corpus)/len(tokenizer.corpus)}")

Type/Token Ratio: 0.8345288397539471


### Exclude Stopwords

In [14]:
stopwordsFilePath = 'StopWords.txt'
stopwordsFile = open(stopwordsFilePath, 'r')
stopwords:list = [line.strip() for line in stopwordsFile.readlines()]
print(f"Number of Stopwords considered: {len(stopwords)}")

Number of Stopwords considered: 779


In [15]:
filtered_corpus = [word for word in corpus if word not in stopwords]
print(f"Filtered Corpus Length (excluding stopwords): {len(filtered_corpus)}")

Filtered Corpus Length (excluding stopwords): 2206679


In [16]:
# Top 20 Frequent Words after removing stopwords
word_counts_no_stopwords: dict = tokenizer.countOccurrences(text=' '.join(filtered_corpus))
word_counts_no_stopwords_counter = Counter(word_counts_no_stopwords)
top_20_words_no_stopwords = word_counts_no_stopwords_counter.most_common(20)
print(f"Top 20 Most Frequent Words (excluding stopwords): {top_20_words_no_stopwords}")

Top 20 Most Frequent Words (excluding stopwords): [('Agreement', 37020), ('1', 23056), ('Party', 19216), ('2', 18105), ('3', 14705), ('The', 13566), ('Section', 12406), ('party', 11045), ('4', 10626), ('Company', 9941), ('5', 9434), ('Product', 8852), ('Parties', 7685), ('6', 7362), ('10', 6915), ('set', 6873), ('A', 6779), ('written', 6735), ('8', 6528), ('applicable', 6477)]


In [17]:
print(f"Type/Token Ratio: {len(filtered_corpus)/len(corpus)}")

Type/Token Ratio: 0.5398918738000827


### Bigrams (excluding Punctuations and Stopwords)

In [18]:
# filtered corpus should have no punctuations AND stopwords
filtered_corpus = [word for word in corpus if pattern.match(word) and word not in stopwords]

# Compute bigrams from the filtered corpus
bigrams = [(filtered_corpus[i], filtered_corpus[i+1]) for i in range(len(filtered_corpus)-1)]

print(f"Number of Bigrams: {len(bigrams)}")
print(f"First 20 Bigrams: {bigrams[:20]}")

Number of Bigrams: 2206678
First 20 Bigrams: [('CO', 'BRANDING'), ('BRANDING', 'AND'), ('AND', 'ADVERTISING'), ('ADVERTISING', 'AGREEMENT'), ('AGREEMENT', 'THIS'), ('THIS', 'CO'), ('CO', 'BRANDING'), ('BRANDING', 'AND'), ('AND', 'ADVERTISING'), ('ADVERTISING', 'AGREEMENT'), ('AGREEMENT', 'Agreement'), ('Agreement', 'June'), ('June', '21'), ('21', '1999'), ('1999', 'Effective'), ('Effective', 'Date'), ('Date', 'I'), ('I', 'ESCROW'), ('ESCROW', 'INC'), ('INC', 'principal')]


In [20]:
# Top 20 Frequent Bigrams from the filtered corpus
bigram_strings = [' '.join(bigram) for bigram in bigrams]
bigram_counts = Counter(bigram_strings)
top_20_bigrams = bigram_counts.most_common(20)
print(f"Top 20 Most Frequent Bigrams (excluding punctuations and stopwords): {top_20_bigrams}")

Top 20 Most Frequent Bigrams (excluding punctuations and stopwords): [('Confidential Information', 2869), ('written notice', 2369), ('Effective Date', 2264), ('This Agreement', 2250), ('In event', 2141), ('Third Party', 2012), ('terms conditions', 1902), ('prior written', 1807), ('set Section', 1749), ('1 1', 1682), ('Intellectual Property', 1636), ('2 1', 1499), ('Section 2', 1436), ('written consent', 1323), ('pursuant Section', 1307), ('30 days', 1285), ('United States', 1256), ('U S', 1254), ('2 2', 1240), ('Section 3', 1197)]
