# Part 1 - Corpus processing (legal text): tokenization and word counting

## Import Relevant Modules

In [7]:
import word_tokenizer
import zipfile
import os

## Extract Corpus Data

In [20]:
# The CUAD archive is looked up one directory above the current working
# directory; its .txt members are unpacked into a folder relative to the
# working directory.
parent_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
zip_file_path = os.path.join(parent_path, 'CUAD_v1.zip')
extraction_dir = 'extracted_txt_files'

# Counts the verification step below expects to find on disk after extraction.
EXPECTED_TXT_FILE_COUNT = 510
EXPECTED_README_COUNT = 1


def extract_txt_members(zip_path, dest_dir):
    """Extract every archive member whose name ends with '.txt' into dest_dir.

    Args:
        zip_path: Path of the zip archive to read.
        dest_dir: Directory to extract into; created if it does not exist.

    Returns:
        list[str]: Names of the extracted members, in archive order.
    """
    os.makedirs(dest_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as archive:
        txt_members = [name for name in archive.namelist() if name.endswith('.txt')]
        archive.extractall(dest_dir, members=txt_members)
    return txt_members


def partition_txt_files(root_dir):
    """Walk root_dir and split its .txt files into corpus files and READMEs.

    A file is treated as a README when its file name contains "README".

    Args:
        root_dir: Directory to search recursively.

    Returns:
        tuple[list[str], list[str]]: (corpus_paths, readme_paths). Both lists
        are sorted so the order does not depend on the directory listing
        order returned by the filesystem.
    """
    corpus_paths = []
    readme_paths = []
    for root, _dirs, files in os.walk(root_dir):
        for file_name in files:
            if not file_name.endswith('.txt'):
                continue
            bucket = readme_paths if "README" in file_name else corpus_paths
            bucket.append(os.path.join(root, file_name))
    return sorted(corpus_paths), sorted(readme_paths)


if not os.path.exists(zip_file_path):
    # NOTE(review): only a message is printed here, so `extracted_files` and
    # `readme_files` stay undefined when the archive is missing -- confirm
    # whether cells that use them should instead fail fast at this point.
    print(f"Error: The file {zip_file_path} does not exist.")
else:
    extract_txt_members(zip_file_path, extraction_dir)

    # Verify what is actually on disk, keeping README files separate.
    extracted_files, readme_files = partition_txt_files(extraction_dir)

    print(f"Number of extracted .txt files (excluding README): {len(extracted_files)}")
    print(f"Number of extracted README files: {len(readme_files)}")
    assert len(extracted_files) == EXPECTED_TXT_FILE_COUNT, (
        f"Expected {EXPECTED_TXT_FILE_COUNT} text files, but found {len(extracted_files)}"
    )
    assert len(readme_files) == EXPECTED_README_COUNT, (
        f"Expected {EXPECTED_README_COUNT} README file, but found {len(readme_files)}"
    )

Number of extracted .txt files (excluding README): 510
Number of extracted README files: 1
