In [20]:
import os
import re
import spacy
import nltk 

# Download the NLTK "punkt" data so NLTK tokenization is available
nltk.download("punkt")

# Load the spaCy English pipeline with the parser, NER and tagger disabled;
# only token texts are read from the resulting docs.
# !python -m spacy download en_core_web_sm  # run once to install the model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "tagger"])
# Input text file to tokenize (legal contract text) and the token output file
INPUT_FILE = "concatenated_text.txt"
# INPUT_FILE = "sample.txt"
OUTPUT_FILE = "output.txt"

[nltk_data] Downloading package punkt to /Users/devanshk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Tokenizer backend handed to tokenize_text: "regex", "nltk" or "spacy"
TOKENIZER_TYPE = "spacy"
# Number of characters requested from the input file per read
CHUNK_SIZE = 500_000  # Process 500,000 characters at a time for large text

In [10]:
def read_file_in_chunks(file_path, chunk_size=CHUNK_SIZE):
    """Yield successive chunks of text from a file without loading it whole.

    The file is read ``chunk_size`` characters at a time, but every chunk is
    cut just after its last whitespace character so that a word is never
    split across two chunks; the trailing partial word is held back and
    prepended to the next read. Because the held-back text never contains
    whitespace, a yielded chunk is always shorter than ``2 * chunk_size``.
    Concatenating the yielded chunks reproduces the text that was read.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.
        chunk_size: Number of characters requested per read.

    Yields:
        str: Consecutive, non-empty pieces of the file's text.

    If the file does not exist, an error message is printed and nothing is
    yielded.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            carry = ""  # partial word held back from the previous read
            while True:
                piece = f.read(chunk_size)
                if not piece:
                    break
                buffer = carry + piece
                # Walk back from the end to just past the last whitespace char.
                cut = len(buffer)
                while cut > 0 and not buffer[cut - 1].isspace():
                    cut -= 1
                if cut == 0:
                    # No whitespace anywhere: emit the buffer as-is instead of
                    # accumulating an unbounded amount of text in memory.
                    cut = len(buffer)
                yield buffer[:cut]
                carry = buffer[cut:]
            if carry:
                # Final partial word (the file did not end with whitespace).
                yield carry
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")

In [11]:
def regex_tokenizer(text):
    """
    Split text into word and punctuation tokens with a regular expression.
    - A word, together with any hyphen-joined parts, is a single token.
    - Each listed punctuation mark or quote character is its own token.
    - Characters matching neither alternative (whitespace included) are dropped.
    """
    # NOTE(review): the same function is defined again in a later cell, which
    # replaces this definition once both cells have run.
    pattern = re.compile(r"\w+(?:-\w+)*|[.,!?;()\[\]{}:\"\'“”‘’]")
    return pattern.findall(text)

In [12]:
def regex_tokenizer(text):
    """Return the word and punctuation tokens found in text by a regex scan."""
    token_regex = r"\w+(?:-\w+)*|[.,!?;()\[\]{}:\"\'“”‘’]"
    # The pattern has no capturing groups, so each match is one whole token.
    return [match.group() for match in re.finditer(token_regex, text)]

def nltk_tokenizer(text):
    """Return the tokens that NLTK's word_tokenize produces for the text."""
    tokens = nltk.word_tokenize(text)
    return tokens

def spacy_tokenizer(text_chunks):
    """Tokenize each text chunk with spaCy and return all token texts in order.

    Only token texts are needed, so each chunk goes through ``nlp.make_doc``,
    which runs just the tokenizer, instead of ``nlp(chunk)``, which would also
    run every pipeline component that is still enabled.

    Args:
        text_chunks: Iterable of strings; each one is tokenized on its own.

    Returns:
        list[str]: Token texts of all chunks, concatenated in chunk order.
    """
    tokens = []
    for chunk in text_chunks:
        doc = nlp.make_doc(chunk)  # tokenizer only; no other components run
        tokens.extend(token.text for token in doc)
    return tokens

In [13]:
def tokenize_text(text_chunks, tokenizer_type="spacy"):
    """
    Tokenizes text using the selected tokenizer.
    - For regex & NLTK: Concatenates the chunks and tokenizes once.
    - For spaCy: Hands the chunks over to be processed separately.

    Args:
        text_chunks: Iterable of strings that are consecutive pieces of one text.
        tokenizer_type: One of 'regex', 'nltk' or 'spacy'.

    Returns:
        The list of tokens produced by the selected tokenizer.

    Raises:
        ValueError: If tokenizer_type is not one of the supported names.
    """
    if tokenizer_type == "regex":
        # Chunks are contiguous slices of the text, so they are joined with no
        # separator; a space here would split any word that straddles a
        # chunk boundary into two tokens.
        return regex_tokenizer("".join(text_chunks))
    elif tokenizer_type == "nltk":
        return nltk_tokenizer("".join(text_chunks))
    elif tokenizer_type == "spacy":
        return spacy_tokenizer(text_chunks)
    else:
        raise ValueError("Invalid tokenizer type. Use 'regex', 'nltk', or 'spacy'.")


In [14]:
def write_tokens_to_file(tokens, output_file):
    """Write every token to output_file (UTF-8), each followed by a newline.

    The file is created or overwritten. A token that itself contains newline
    characters is written unchanged, so it spans more than one line.
    """
    with open(output_file, "w", encoding="utf-8") as out:
        out.writelines(token + "\n" for token in tokens)

In [15]:
"""Text of length 26807133 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the nlp.max_length limit. The limit is in number of characters, so you can check whether your inputs are too long by checking len(text).
"""

"Text of length 26807133 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the nlp.max_length limit. The limit is in number of characters, so you can check whether your inputs are too long by checking len(text).\n"

In [16]:
def process_text_file():
    """Run the pipeline: read INPUT_FILE in chunks, tokenize, save and report.

    Returns early, without tokenizing or writing anything, when no chunks
    were read. Otherwise the tokens are written to OUTPUT_FILE and a short
    summary (tokenizer used, token count, first 20 tokens) is printed.
    """
    chunks = [chunk for chunk in read_file_in_chunks(INPUT_FILE)]
    if len(chunks) == 0:
        return

    print(f"Processing {len(chunks)} chunks...")

    all_tokens = tokenize_text(chunks, TOKENIZER_TYPE)
    write_tokens_to_file(all_tokens, OUTPUT_FILE)

    token_total = len(all_tokens)
    preview = all_tokens[:20]
    print(f"Tokenization complete using '{TOKENIZER_TYPE}'. Output written to {OUTPUT_FILE}")
    print(f"Total tokens: {token_total:,d}")
    print(f"First 20 tokens: {preview}")

In [21]:
# Run the tokenization pipeline when this code executes as the main module.
if __name__ == "__main__":
    process_text_file()

Processing 54 chunks...
Tokenization complete using 'spacy'. Output written to output.txt
Total tokens: 4,997,689
First 20 tokens: ['CO', '-', 'BRANDING', 'AND', 'ADVERTISING', 'AGREEMENT', '\n\n', 'THIS', 'CO', '-', 'BRANDING', 'AND', 'ADVERTISING', 'AGREEMENT', '(', 'the', '"', 'Agreement', '"', ')']
