# Preprocessing

Test code that extracts the text from PDF files, converts it to lowercase, and removes punctuation and stopwords.

In [25]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import os
import pdfplumber

In [26]:
def extract_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped; the text of each remaining page is followed by a
    newline.
    """
    with pdfplumber.open(pdf_path) as document:
        # Pull the text of each page while the file is still open.
        page_texts = [page.extract_text() for page in document.pages]

    # Keep only pages that produced text, terminating each with a newline.
    return ''.join(content + '\n' for content in page_texts if content)

def preprocess_text(text):
    """Normalise raw text into a space-separated string of non-stopword tokens.

    Steps, in order: lowercase the text; delete every character that is not a
    word character, whitespace or a straight apostrophe; turn newlines into
    spaces; tokenize with ``word_tokenize``; drop tokens that appear in the
    English stopword list; join the remaining tokens with single spaces.
    """
    lowered = text.lower()

    # Punctuation is deleted outright (not replaced by a space), so pieces that
    # were separated only by punctuation end up fused together.
    without_punctuation = re.sub(r"[^\w\s']", '', lowered)
    single_line = without_punctuation.replace('\n', ' ')

    words = word_tokenize(single_line)

    # A set gives constant-time membership checks while filtering.
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for word in words:
        if word not in english_stopwords:
            kept.append(word)

    return ' '.join(kept)

def process_pdfs(folder_path, output_folder, limit=2):
    """Extract, preprocess and save the text of the PDFs found in ``folder_path``.

    For each selected PDF, the text returned by ``extract_text`` is passed
    through ``preprocess_text`` and written as UTF-8 to
    ``<output_folder>/<pdf name without extension>.txt``.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.pdf`` files.
        output_folder: Directory for the ``.txt`` results; created if missing.
        limit: Maximum number of PDFs to process. Defaults to 2, the sample
            size that used to be hard-coded here; pass ``None`` to process
            every PDF in the folder.
    """
    # Filter to PDFs *before* applying the limit, and sort the names: the order
    # of os.listdir() is arbitrary, so slicing the raw listing first could pick
    # non-PDF entries and silently process fewer (or zero) documents.
    pdf_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, name))
    )
    if limit is not None:
        pdf_files = pdf_files[:max(limit, 0)]

    # Make sure the destination exists so the writes below cannot fail on a
    # missing directory.
    os.makedirs(output_folder, exist_ok=True)

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)

        # Extract the raw text from the PDF
        extracted_text = extract_text(pdf_path)

        # Preprocess extracted text
        preprocessed_text = preprocess_text(extracted_text)

        # Save preprocessed text to a new text file named after the PDF
        output_file_path = os.path.join(output_folder, os.path.splitext(pdf_file)[0] + '.txt')
        with open(output_file_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(preprocessed_text)

        print('Preprocessing for doc done')

In [27]:
# Example usage: preprocess the raw PDFs into plain-text files.
folder_path = '../data/raw_data'
output_folder = '../data/preprocessed_data'
# Create the output directory from the variable rather than a repeated literal,
# so the two cannot drift apart if the path is changed.
os.makedirs(output_folder, exist_ok=True)

process_pdfs(folder_path, output_folder)

Preprocessing for doc done
Preprocessing for doc done


In [23]:
# Extract a single sample document for manual inspection.
# The path is built from the raw-data folder relative to the notebook instead of
# an absolute, machine-specific path, and extract_text() is reused so that each
# page's text is followed by a newline (pages are not fused together).
sample_pdf = os.path.join(
    folder_path,
    '2023.11.16_frontex-general-industry-days-innovation-for-border-and-coast-guard-functions.pdf',
)
text = extract_text(sample_pdf)

In [24]:
# Display the raw extracted text; the repr shows the embedded line breaks as \n.
text

'Frontex General Industry Days: Innovation for\nborder and coast guard functions\n2023-11-16\nJoin us on 6 and 7 December to contribute to innovative solutions for border and coast guard\nfunctions. Frontex’s next general Industry Days will put innovation in the spotlight to reflect the core\nrole of technology in European Integrated Border Management.\nFrontex would like to invite industry representatives to demonstrate how innovation could support\nborder and coast guard functions. Over the course of a two-day programme, 16 industry\nrepresentatives will present their latest approaches, technologies, and solutions (whether already\navailable on the market or under development), which can benefit border management activities at\nthe EU’s external borders and within the EU area, in respect of EU regulations.\nThe first day of the event will have a broad scope, it will be dedicated to innovative solutions in\nsupport of law enforcement activities regarding border management.\nThe second

In [16]:
# Show the preprocessed version of the sample text.
# NOTE(review): the recorded output below appears to come from an earlier run
# (lower execution count, and it starts with text not present in the raw
# output above) - re-run this cell to refresh it.
preprocess_text(text)

'frontex european border coast guard agency wwwfrontexeuropaeu pl europejski 6 00 844 warsaw poland tel 48 22 205 95 00 fax 48 22 205 95 01 frontex general industry days innovation border coast guard functions 2023 1116 join us 6 7 december contribute innovative solutions border coast guard functions frontexs next general industry days put innovation th e spotlight reflect core role technology european integrated border management frontex would like invite industry representatives demonstrate innovation could support border coast guard functions course two day pr ogramme 16 industry representatives present latest approaches technologies solutions whether already available market development benefit border management activities eus external borders within th e eu area respect eu regulations first day event broad scope dedicated innovative solutions support law enforcement activities regarding border management second day focus remote sensing tec hnologies electromagnetic signatures reco