In [1]:
import os 
import regex as re
from bs4 import BeautifulSoup
import PyPDF2
import pandas as pd
from collections import Counter
import logging
import time

# Load the web-scraped proposals dataset (path is relative to the notebook's working directory)
data = pd.read_csv('../Data/proposals_processed.csv')

# Absolute path of the folder containing the regulation documents,
# resolved as ../Data/Regulations relative to the current working directory
doc_folder = os.path.abspath(os.path.join(os.getcwd(),'..', 'Data', 'Regulations'))


In [2]:
def extract_section_from_text(text, start_pattern, end_pattern=None, include_start_pattern=True, include_end_pattern=False, exclude_cod=False, case_sensitive=True):
    """Return the slice of ``text`` between a start and an (optional) end regex.

    Parameters
    ----------
    text : str
        Text to search.
    start_pattern : str
        Regex marking where the section begins. If it never matches, an empty
        string is returned.
    end_pattern : str or None
        Regex marking where the section stops. It is searched only in the text
        that follows the section start. When it is ``None`` or never matches,
        the section runs to the end of ``text``.
    include_start_pattern : bool
        Keep (True) or drop (False) the text matched by ``start_pattern``.
    include_end_pattern : bool
        Keep (True) or drop (False) the text matched by ``end_pattern``.
    exclude_cod : bool
        When True and the stripped section is longer than 11 characters, the
        last 11 characters are cut off.
    case_sensitive : bool
        When False both patterns are compiled with ``re.IGNORECASE``.

    Returns
    -------
    str
        The extracted section with surrounding whitespace stripped.
    """
    # One flags value serves both patterns
    flags = 0 if case_sensitive else re.IGNORECASE

    opening = re.compile(start_pattern, flags).search(text)
    if opening is None:
        return ""

    begin = opening.start() if include_start_pattern else opening.end()

    # Default: the section runs to the end of the text
    finish = len(text)
    if end_pattern:
        closing = re.compile(end_pattern, flags).search(text[begin:])
        if closing is not None:
            # Match offsets are relative to the slice, so shift them by `begin`
            relative_end = closing.end() if include_end_pattern else closing.start()
            finish = begin + relative_end

    section = text[begin:finish].strip()

    # Drop the trailing 11 characters (brackets + 9 digits) when requested
    if exclude_cod and len(section) > 11:
        section = section[:-11]

    return section


In [3]:
# Function to process PDF files
def process_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Parameters
    ----------
    file_path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The page texts joined with a newline. The separator keeps the last
        word of one page from fusing with the first word of the next, which
        would otherwise distort downstream word counts.
    """
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` guards against a page for which no text is returned
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

In [4]:
# Function to count words
def word_count(text):
    """Return the number of words in ``text``.

    A word is a run of word characters; hyphen-joined runs (e.g.
    "state-of-the-art") count as a single word. Non-breaking spaces are
    treated as ordinary spaces before counting.
    """
    normalised = text.replace('\u00A0', ' ')
    return sum(1 for _ in re.finditer(r'\b\w+(?:-\w+)*\b', normalised))

In [5]:
def clean_metadata(text):
    """Strip known metadata fragments and non-ASCII characters from ``text``.

    Processing order:
    1. remove every known metadata pattern (matched with ``re.DOTALL``);
    2. collapse all whitespace, including Unicode whitespace such as the
       non-breaking space, to a single ASCII space;
    3. remove the remaining non-ASCII characters;
    4. collapse the whitespace left behind and strip the ends.

    Step 2 happens before step 3 on purpose: deleting a non-breaking space
    outright would glue the neighbouring words together (e.g. "Article\u00A05"
    would become "Article5") and distort word counts.

    Parameters
    ----------
    text : str
        Raw text extracted from a document.

    Returns
    -------
    str
        The cleaned, single-spaced text.
    """
    # Define patterns for known metadata (adjust the patterns as needed)
    metadata_patterns = [
        r'docVars.*?\]',  # Remove docVars metadata
        r'DQCResult_[\w]+;?\s*\d+;\d+',  # Remove DQCResult related metadata
        r'\[\w+=.*?\]',  # Remove key-value metadata in brackets, e.g. [name=...]
        r'LW_[\w]+;?\s*\d+;\d+',  # Remove LW_ prefixed metadata
        r'DQCWithWarnings.*',  # Remove DQC warnings (with DOTALL: everything up to the end of the text)
        r'DocumentContent.*',  # Remove document content markers (with DOTALL: everything up to the end of the text)
        r'@\w+',  # Remove any usernames or email references (if applicable)
    ]

    # Iterate over the patterns and remove them from the text
    for pattern in metadata_patterns:
        text = re.sub(pattern, '', text, flags=re.DOTALL)

    # Turn every whitespace run (including non-breaking spaces) into one ASCII
    # space so that the non-ASCII removal below cannot merge adjacent words
    text = re.sub(r'\s+', ' ', text)

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]', '', text)

    # Clean up extra spaces left after the removals
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [6]:
# Section markers shared by the HTML and PDF code paths
_PREAMBLE_START_REGEX = r'THE EUROPEAN PARLIAMENT AND THE COUNCIL|THE COUNCIL OF THE EUROPEAN UNION'
_PREAMBLE_END_REGEX = r'HAVE ADOPTED|HAS DECIDED|HAVE DECIDED|HAS ADOPTED|HAVE ADOPED|HEREBY RECOMMEND|HA VE ADOPTED|HEREBY INVITE|RECOMMEND that the Member States'
_FINANCIAL_STATEMENT_MARKER = 'LEGISLATIVE FINANCIAL STATEMENT'


def _find_annex_marker(cleaned_text):
    """Return 'ANNEXES' if it occurs in the text, else 'ANNEX' if it occurs, else None."""
    for marker in ('ANNEXES', 'ANNEX'):
        if marker in cleaned_text:
            return marker
    return None


def _split_into_sections(cleaned_text, external_annexes=None):
    """Split a cleaned document into its five sections and return their texts by name.

    ``external_annexes`` is the text of a separate annex file. When it is
    given, the main document is not searched for an annex heading and the
    external text is used as the annex section.
    """
    has_financial_statement = _FINANCIAL_STATEMENT_MARKER in cleaned_text

    # Only look for an annex heading in the main text when no separate annex file exists
    annex_marker = _find_annex_marker(cleaned_text) if external_annexes is None else None

    preamble = extract_section_from_text(cleaned_text, _PREAMBLE_START_REGEX, _PREAMBLE_END_REGEX)

    explanatory_memorandum = extract_section_from_text(
        cleaned_text, 'EXPLANATORY MEMORANDUM', _PREAMBLE_START_REGEX,
        exclude_cod=True, case_sensitive=False)

    # Articles stop at the financial statement if there is one, otherwise at the
    # annex heading, otherwise at the end of the document (end pattern None)
    articles_end = _FINANCIAL_STATEMENT_MARKER if has_financial_statement else annex_marker
    articles = extract_section_from_text(cleaned_text, _PREAMBLE_END_REGEX, articles_end)

    # The financial statement stops at the annex heading that was actually
    # detected ('ANNEXES' or 'ANNEX'), so the annex is not counted twice
    financial_statement = ""
    if has_financial_statement:
        financial_statement = extract_section_from_text(
            cleaned_text, _FINANCIAL_STATEMENT_MARKER, annex_marker,
            include_start_pattern=True, include_end_pattern=False)

    if external_annexes is not None:
        annexes = external_annexes
    elif annex_marker:
        annexes = extract_section_from_text(
            cleaned_text, annex_marker, None,
            include_start_pattern=True, include_end_pattern=False)
    else:
        annexes = ""

    return {
        'Explanatory_Memorandum': explanatory_memorandum,
        'Preamble': preamble,
        'Articles': articles,
        'Financial_Statement': financial_statement,
        'Annexes': annexes,
    }


def process_celex(celex_number, doc_folder, start_chars=100, end_chars=100):
    """Extract the sections of one CELEX document and count their words.

    The document is read from ``<doc_folder>/<celex_number>.html`` if that
    file exists, otherwise from ``<doc_folder>/<celex_number>.pdf``. For a PDF,
    a companion ``<celex_number>_annex.pdf`` (if present) supplies the annex
    section; its text is passed through ``clean_metadata`` like the main
    document so that the word counts are comparable.

    Parameters
    ----------
    celex_number : str
        CELEX identifier, used as the file stem.
    doc_folder : str
        Folder containing the documents.
    start_chars, end_chars : int
        Number of leading / trailing characters of each section text to keep
        in the output. Longer texts are shortened to
        ``first start_chars + '...' + last end_chars``.

    Returns
    -------
    dict or None
        ``None`` when neither an HTML nor a PDF file exists. Otherwise a dict
        with ``<Section>_Word_Count`` and ``<Section>_Text`` entries for the
        explanatory memorandum, preamble, articles, financial statement and
        annexes, plus ``Total_Word_Count`` (sum of the five counts). Word
        counts are computed on the full section texts, before shortening.
    """
    pdf_file = os.path.join(doc_folder, f"{celex_number}.pdf")
    annex_pdf_file = os.path.join(doc_folder, f"{celex_number}_annex.pdf")  # Adjust annex filename pattern
    html_file = os.path.join(doc_folder, f"{celex_number}.html")

    external_annexes = None

    if os.path.exists(html_file):
        print(f"Processing HTML file for CELEX {celex_number}")  # Debugging print
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
        # Extract the raw text from the HTML document
        text = soup.get_text(separator=' ', strip=True)

    elif os.path.exists(pdf_file):
        print(f"Processing PDF file for CELEX {celex_number}")  # Debugging print
        text = process_pdf(pdf_file)
        # A separate annex file, when present, provides the annex section
        if os.path.exists(annex_pdf_file):
            external_annexes = clean_metadata(process_pdf(annex_pdf_file))

    else:
        print(f"No valid document format found for CELEX {celex_number}")
        return None

    section_texts = _split_into_sections(clean_metadata(text), external_annexes)

    # Word counts first, then texts, to keep the established key/column order
    sections = {}
    for section_name, section_text in section_texts.items():
        sections[f'{section_name}_Word_Count'] = word_count(section_text)

    for section_name, section_text in section_texts.items():
        if len(section_text) > (start_chars + end_chars):
            # Slice from an explicit start index: `text[-0:]` would return the
            # whole text when end_chars is 0
            tail = section_text[len(section_text) - end_chars:]
            section_text = section_text[:start_chars] + '...' + tail
        sections[f'{section_name}_Text'] = section_text

    sections['Total_Word_Count'] = sum(
        sections[f'{section_name}_Word_Count'] for section_name in section_texts)

    return sections

In [7]:
def apply_to_dataframe(data, doc_folder):
    """Run ``process_celex`` for every CELEX number in ``data`` and append the results.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; must contain a ``CELEX`` column.
    doc_folder : str
        Folder containing the ``<CELEX>.html`` / ``<CELEX>.pdf`` documents.

    Returns
    -------
    pandas.DataFrame
        ``data`` with ``CELEX`` as the first column, followed by the result
        columns: ``Format_Used`` ('html', 'pdf' or 'none'), the section word
        counts and texts, and ``Error`` for rows that could not be processed.
        An empty ``data`` is returned without result columns.

    Notes
    -----
    Errors raised while processing a single document are logged and recorded
    in that row's ``Error`` value; processing continues with the next row.
    """
    results = []

    # Loop through each CELEX number in the DataFrame
    for celex in data['CELEX']:
        # Holds the results for this CELEX number; created outside the `try`
        # so the error handler can always record into it
        row_result = {'CELEX': celex}
        try:
            # Process the CELEX document based on its available formats
            if os.path.exists(os.path.join(doc_folder, f"{celex}.html")):
                sections = process_celex(celex, doc_folder)
                row_result['Format_Used'] = 'html'
            elif os.path.exists(os.path.join(doc_folder, f"{celex}.pdf")):
                sections = process_celex(celex, doc_folder)
                row_result['Format_Used'] = 'pdf'
            else:
                row_result['Format_Used'] = 'none'
                row_result['Error'] = 'No valid format found'
                logging.warning(f"Failed to process CELEX {celex} due to {row_result['Error']}")
                results.append(row_result)
                continue  # Skip to the next CELEX if no valid document is found

            if sections:
                row_result.update(sections)  # Merge the sections dictionary into the row_result
                logging.info(f"Successfully processed CELEX {celex} as {row_result['Format_Used']}")
            else:
                # Sections could not be extracted: fill the same keys a
                # successful row has, with empty values
                row_result['Total_Word_Count'] = None
                row_result['Explanatory_Memorandum_Word_Count'] = None
                row_result['Preamble_Word_Count'] = None
                row_result['Articles_Word_Count'] = None
                row_result['Financial_Statement_Word_Count'] = None
                row_result['Annexes_Word_Count'] = None
                row_result['Explanatory_Memorandum_Text'] = ''
                row_result['Preamble_Text'] = ''
                row_result['Articles_Text'] = ''
                row_result['Financial_Statement_Text'] = ''
                row_result['Annexes_Text'] = ''
                logging.warning(f"Sections not found for CELEX {celex}")

            results.append(row_result)

        except Exception as e:
            # Keep going with the next CELEX even if this one fails
            logging.error(f"Error processing CELEX {celex}: {e}")
            row_result['Error'] = str(e)
            results.append(row_result)

    # Nothing was processed: an empty result list has no CELEX column to index on
    if not results:
        return data.set_index('CELEX').reset_index()

    # Convert the list of results into a DataFrame
    results_df = pd.DataFrame(results)

    # Concatenate the original data with the new results (using CELEX as the key)
    final_data = pd.concat([data.set_index('CELEX'), results_df.set_index('CELEX')], axis=1).reset_index()

    return final_data


In [8]:
# Smoke test: process a single CELEX document, keeping the first and last 1000 characters of each section text
process_celex("52000PC0126", doc_folder, start_chars= 1000, end_chars= 1000)

Processing HTML file for CELEX 52000PC0126


{'Explanatory_Memorandum_Word_Count': 616,
 'Preamble_Word_Count': 2740,
 'Articles_Word_Count': 0,
 'Financial_Statement_Word_Count': 0,
 'Annexes_Word_Count': 0,
 'Explanatory_Memorandum_Text': 'EXPLANATORY MEMORANDUM 1. In the context of a people\'s Europe, the Commission attaches great importance to simplifying and clarifying Community law so as to make it clearer and more accessible to the ordinary citizen, thus giving him new opportunities and the chance to make use of the specific rights it gives him. This aim cannot be achieved so long as numerous provisions that have been amended several times, often quite substantially, remain scattered, so that they must be sought partly in the original instrument and partly in later amending ones. Considerable research work, comparing many different instruments, is thus needed to identify the current rules. For this reason a codification of rules that have frequently been amended is also essential if Community law is to be clear and transpa

In [9]:
# Start timing
start_time = time.time()

# Process every CELEX document listed in `data`
processed_data = apply_to_dataframe(data, doc_folder)

# Save the processed data to a CSV file
processed_data.to_csv('../Data/proposals_word_count_processed.csv', index=False)

# End timing
end_time = time.time()

elapsed_time_seconds = end_time - start_time
# divmod on a float returns floats; cast the minutes so the message reads "3 minutes", not "3.0 minutes"
minutes, seconds = divmod(elapsed_time_seconds, 60)
minutes = int(minutes)

print(f"Elapsed time: {minutes} minutes and {seconds:.2f} seconds")

Processing HTML file for CELEX 52023PC0637
Processing HTML file for CELEX 52023PC0636
Processing HTML file for CELEX 52023PC0770
Processing HTML file for CELEX 52023PC0769
Processing HTML file for CELEX 52023PC0783
Processing HTML file for CELEX 52023PC0779
Processing HTML file for CELEX 52023PC0781
Processing HTML file for CELEX 52023PC0771
Processing HTML file for CELEX 52023PC0752
Processing HTML file for CELEX 52023PC0905
Processing HTML file for CELEX 52023PC0753
Processing HTML file for CELEX 52023PC0777
Processing HTML file for CELEX 52023PC0755
Processing HTML file for CELEX 52023PC0754
Processing HTML file for CELEX 52023PC0738
Processing HTML file for CELEX 52023PC0728
Processing HTML file for CELEX 52023PC0727
Processing HTML file for CELEX 52023PC0733
Processing HTML file for CELEX 52023PC0716
Processing HTML file for CELEX 52023PC0692
Processing HTML file for CELEX 52023PC0702
Processing HTML file for CELEX 52023PC0698
Processing HTML file for CELEX 52023PC0642
Processing 



Processing HTML file for CELEX 52021PC0483
Processing HTML file for CELEX 52021PC0477
Processing HTML file for CELEX 52021PC0434
Processing HTML file for CELEX 52021PC0424
Processing HTML file for CELEX 52021PC0551
Processing PDF file for CELEX 52021PC0556
Processing HTML file for CELEX 52021PC0421
Processing HTML file for CELEX 52021PC0420
Processing HTML file for CELEX 52021PC0423
Processing HTML file for CELEX 52021PC0422
Processing HTML file for CELEX 52021PC0429
Processing HTML file for CELEX 52021PC0564
Processing HTML file for CELEX 52021PC0552
Processing HTML file for CELEX 52021PC0567
Processing HTML file for CELEX 52021PC0568
Processing HTML file for CELEX 52021PC0562
Processing HTML file for CELEX 52021PC0571
Processing HTML file for CELEX 52021PC0558
Processing HTML file for CELEX 52021PC0397
Processing HTML file for CELEX 52021PC0559
Processing HTML file for CELEX 52021PC0557
Processing HTML file for CELEX 52021PC0399
Processing HTML file for CELEX 52021PC0561
Processing H



Processing HTML file for CELEX 52008PC0450
Processing HTML file for CELEX 52008PC0419
Processing HTML file for CELEX 52008PC0390
Processing HTML file for CELEX 52008PC0344
Processing HTML file for CELEX 52008PC0388
Processing HTML file for CELEX 52008PC0351
Processing HTML file for CELEX 52008PC0318
Processing HTML file for CELEX 52008PC0380
Processing HTML file for CELEX 52008PC0369
Processing HTML file for CELEX 52008PC0357
Processing HTML file for CELEX 52008PC0345
Processing HTML file for CELEX 52008PC0316
Processing HTML file for CELEX 52008PC0104
Processing HTML file for CELEX 52008PC0229
Processing HTML file for CELEX 52008PC0213
Processing HTML file for CELEX 52008PC0210
Processing HTML file for CELEX 52008PC0194
Processing HTML file for CELEX 52008PC0202
Processing HTML file for CELEX 52008PC0195
Processing HTML file for CELEX 52008PC0211
Processing HTML file for CELEX 52008PC0179
Processing HTML file for CELEX 52008PC0180
Processing HTML file for CELEX 52008PC0154
Processing 



Processing HTML file for CELEX 52005PC0048
Processing HTML file for CELEX 52005PC0028
Processing HTML file for CELEX 52004PC0835
Processing HTML file for CELEX 52004PC0830
Processing HTML file for CELEX 52004PC0781
Processing HTML file for CELEX 52004PC0775
Processing HTML file for CELEX 52004PC0730
Processing HTML file for CELEX 52004PC0737
Processing HTML file for CELEX 52004PC0725
Processing PDF file for CELEX 52004PC0708
Processing PDF file for CELEX 52004PC0599
Processing PDF file for CELEX 52004PC0718
Processing HTML file for CELEX 52004PC0654
Processing HTML file for CELEX 52004PC0642
Processing HTML file for CELEX 52004PC0448
Processing HTML file for CELEX 52004PC0650
Processing HTML file for CELEX 52004PC0634
Processing HTML file for CELEX 52004PC0638
Processing PDF file for CELEX 52004PC0629
Processing HTML file for CELEX 52004PC0630
Processing HTML file for CELEX 52004PC0628
Processing HTML file for CELEX 52004PC0621
Processing HTML file for CELEX 52004PC0509
Processing HTML



Processing HTML file for CELEX 52000PC0302
Processing HTML file for CELEX 52000PC0260
Processing HTML file for CELEX 52000PC0314
Processing HTML file for CELEX 52000PC0285
Processing HTML file for CELEX 52000PC0368
Processing HTML file for CELEX 52000PC0338
Processing HTML file for CELEX 52000PC0340
Processing HTML file for CELEX 52000PC0279
Processing HTML file for CELEX 52000PC0179
Processing HTML file for CELEX 52000PC0222
Processing HTML file for CELEX 52000PC0186
Processing HTML file for CELEX 52000PC0139
Processing HTML file for CELEX 52000PC0189
Processing HTML file for CELEX 52000PC0121
Processing HTML file for CELEX 52000PC0142(01)
Processing HTML file for CELEX 52000PC0137
Processing HTML file for CELEX 52000PC0162
Processing HTML file for CELEX 52000PC0142(03)
Processing HTML file for CELEX 52000PC0142(02)
Processing HTML file for CELEX 52000PC0080
Processing HTML file for CELEX 52000PC0106
Processing HTML file for CELEX 52000PC0096(02)
Processing HTML file for CELEX 52000PC