# Preprocessing

Testing code to extract text from pdfs, make all lowercase, remove stopwords and punctuation.

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import os
import pdfplumber

In [2]:
def extract_text(pdf_path):
    text = ''
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text content from the page
            page_text = page.extract_text()
            if page_text:
                text += page_text + '\n'  # Append extracted text with a newline
    return text

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    #remove puncuation
    pattern1 = r"[^\w\s']"
    pattern2 = '\n'
    text = re.sub(pattern1, '', text)
    text = re.sub(pattern2, ' ', text)

    #tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Convert tokens back to text
    preprocessed_text = ' '.join(filtered_tokens)
    
    return preprocessed_text

def process_pdfs(folder_path, output_folder):
    pdf_files = [f for f in os.listdir(folder_path)[:2] if f.endswith('.pdf')]
    
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        
        # Extract text from PDF using PyPDF2
        extracted_text = extract_text(pdf_path)
        
        # Preprocess extracted text
        preprocessed_text = preprocess_text(extracted_text)
        
        # Save preprocessed text to a new text file
        output_file_path = os.path.join(output_folder, os.path.splitext(pdf_file)[0] + '.txt')
        with open(output_file_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(preprocessed_text)
        
        print('Preprocessing for doc done')

In [3]:
txt = extract_text('../data/raw_data/annual_risk_analysis_2011.pdf')
txt

'(cid:36)(cid:81)(cid:81)(cid:88)(cid:68)(cid:79)(cid:3)(cid:53)(cid:76)(cid:86)(cid:78)(cid:3)(cid:36)(cid:81)(cid:68)(cid:79)(cid:92)(cid:86)(cid:76)(cid:86)(cid:3)\n(cid:21)(cid:19)(cid:20)(cid:20)(cid:3)\n(cid:40)(cid:88)(cid:85)(cid:82)(cid:83)(cid:72)(cid:68)(cid:81)(cid:3)(cid:36)(cid:74)(cid:72)(cid:81)(cid:70)(cid:92)(cid:3)(cid:73)(cid:82)(cid:85)(cid:3)(cid:87)(cid:75)(cid:72)(cid:3)(cid:48)(cid:68)(cid:81)(cid:68)(cid:74)(cid:72)(cid:80)(cid:72)(cid:81)(cid:87)(cid:3)(cid:82)(cid:73)(cid:3)(cid:50)(cid:83)(cid:72)(cid:85)(cid:68)(cid:87)(cid:76)(cid:82)(cid:81)(cid:68)(cid:79)(cid:3)(cid:38)(cid:82)(cid:82)(cid:83)(cid:72)(cid:85)(cid:68)(cid:87)(cid:76)(cid:82)(cid:81)(cid:3)(cid:68)(cid:87)(cid:3)(cid:87)(cid:75)(cid:72)(cid:3)(cid:40)(cid:91)(cid:87)(cid:72)(cid:85)(cid:81)(cid:68)(cid:79)(cid:3)(cid:37)(cid:82)(cid:85)(cid:71)(cid:72)(cid:85)(cid:86)(cid:3)(cid:82)(cid:73)(cid:3)(cid:87)(cid:75)(cid:72)(cid:3)(cid:48)(cid:72)(cid:80)(cid:69)(cid:72)(cid:85)(cid:3)\n(cid

In [27]:
# Example usage
folder_path = '../data/raw_data'
output_folder = '../data/preprocessed_data'
os.makedirs('../data/preprocessed_data', exist_ok=True)

process_pdfs(folder_path, output_folder)

Preprocessing for doc done
Preprocessing for doc done


In [23]:
text = ''
with pdfplumber.open('/Users/emilykruger/Documents/GitHub/frontex_analysis/data/raw_data/2023.11.16_frontex-general-industry-days-innovation-for-border-and-coast-guard-functions.pdf') as pdf:
    for page in pdf.pages:
        # Extract text content from the page
        page_text = page.extract_text()
        if page_text:
            text += page_text  # Append extracted text with a newline

In [24]:
text

'Frontex General Industry Days: Innovation for\nborder and coast guard functions\n2023-11-16\nJoin us on 6 and 7 December to contribute to innovative solutions for border and coast guard\nfunctions. Frontex’s next general Industry Days will put innovation in the spotlight to reflect the core\nrole of technology in European Integrated Border Management.\nFrontex would like to invite industry representatives to demonstrate how innovation could support\nborder and coast guard functions. Over the course of a two-day programme, 16 industry\nrepresentatives will present their latest approaches, technologies, and solutions (whether already\navailable on the market or under development), which can benefit border management activities at\nthe EU’s external borders and within the EU area, in respect of EU regulations.\nThe first day of the event will have a broad scope, it will be dedicated to innovative solutions in\nsupport of law enforcement activities regarding border management.\nThe second

In [16]:
preprocess_text(text)

'frontex european border coast guard agency wwwfrontexeuropaeu pl europejski 6 00 844 warsaw poland tel 48 22 205 95 00 fax 48 22 205 95 01 frontex general industry days innovation border coast guard functions 2023 1116 join us 6 7 december contribute innovative solutions border coast guard functions frontexs next general industry days put innovation th e spotlight reflect core role technology european integrated border management frontex would like invite industry representatives demonstrate innovation could support border coast guard functions course two day pr ogramme 16 industry representatives present latest approaches technologies solutions whether already available market development benefit border management activities eus external borders within th e eu area respect eu regulations first day event broad scope dedicated innovative solutions support law enforcement activities regarding border management second day focus remote sensing tec hnologies electromagnetic signatures reco

# NLP Test

In [28]:
import pandas as pd
from nltk.tokenize import word_tokenize

## Create Corupus

In [29]:
# #create corupus
# def create_corpus(folder_path):
#     files = os.listdir(folder_path)
#     corpus = []

#     for file in files[:2]:
#         if file.endswith('.txt'):
#             with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
#                 text = f.read()
#                 corpus.append((file, text))  # Store file name and text content as tuple
    
#     return corpus


# corpus = create_corpus(folder_path)

In [50]:
def load_and_tokenize_documents(folder_path):
    tokenized_corpus = []

    files = os.listdir(folder_path)
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
                text = f.read()
                tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
                tokenized_corpus.append(tokens)
    
    return tokenized_corpus

folder_path = '../data/preprocessed_data'
corpus = load_and_tokenize_documents(folder_path)

In [67]:
len(corpus)

104

## TF_IDF

In [78]:
#tf-idf analysis
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf(tokenized_corpus, folder_path):
    # Prepare documents and file names from the tokenized corpus
    documents = [' '.join(doc) for doc in tokenized_corpus]
    file_names = [filename for filename in os.listdir(folder_path) if filename.endswith('.txt')]

    
    # Calculate TF-IDF matrix
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    print(type(tfidf_matrix))
    print(tfidf_matrix.shape)
    feature_names = vectorizer.get_feature_names_out()

    # Create a list of dictionaries containing filename and top 5 TF-IDF words
    result = []
    for i, filename in enumerate(file_names):
        tfidf_scores = list(zip(feature_names, tfidf_matrix[i].toarray().flatten()))
        tfidf_scores.sort(key=lambda x: x[1], reverse=True)
        top_tfidf_words = [word for word, score in tfidf_scores]
        result.append({'Filename': filename, 'Top 5 TF-IDF Words': top_tfidf_words})

    return result


# Calculate TF-IDF matrix, feature names, and file names
result = calculate_tfidf(corpus, output_folder)

# Create DataFrame from the result list of dictionaries
results = pd.DataFrame(result)

<class 'scipy.sparse._csr.csr_matrix'>
(104, 32460)


In [77]:
os.listdir(folder_path)

['fran_q1_2011.txt',
 'afic_2017.txt',
 '2023.11.16_frontex-general-industry-days-innovation-for-border-and-coast-guard-functions.txt',
 'ara-2022-public-web.txt',
 'afic_2016.txt',
 'fran_q1_2010.txt',
 'fran_q1_2012.txt',
 'fran_q2_2015_final.txt',
 'fran_q1_2013.txt',
 'fran_q1_2017.txt',
 '.DS_Store',
 'wb_q2_2018.txt',
 'wb_q3_2018.txt',
 'fran_q1_2014.txt',
 'risk_analysis_for_2019ws.txt',
 'eap-ran_q3_2016.txt',
 'eap-ran_q2_2017.txt',
 'fran_q1_2015.txt',
 'eb_ara_2014.txt',
 'wb_ara_2012.txt',
 'situational_overview_on_trafficking_in_human_beings.txt',
 'wb_ara_2013.txt',
 'eb_ara_2015.txt',
 'wb_q4_2018.txt',
 'annual_risk_analysis_2017ws.txt',
 'eap-ran_q4_2016.txt',
 'wb_ara_2011.txt',
 'fran_q2_2012_.txt',
 'wb_ara_2010.txt',
 'eap-ran_q1_2017-1.txt',
 'wb_q1_2015.txt',
 'eap-ran__q4_2017-1.txt',
 'risk_analysis_for_2018ws.txt',
 'wb_ara_2014.txt',
 'eap-ran_q1_2018.txt',
 'wb_ara_2015.txt',
 'eb_ara_2013.txt',
 'afic_report_2015.txt',
 'wb_q1_2016.txt',
 'fran_q2_2018.txt

In [45]:
results

Unnamed: 0,Filename,Top 5 TF-IDF Words
0,fran_q1_2011.txt,"[2011, q1, detections, eu, 2010]"
1,afic_2017.txt,"[afic, 2017, niger, libya, migrants]"


## Sentiment Analysis

In [47]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download NLTK resources (if not already downloaded)
nltk.download('vader_lexicon')

def analyze_sentiment(text):
    if isinstance(text, list):
        text = ' '.join(text)
    # Analyze sentiment using NLTK Vader
    sid = SentimentIntensityAnalyzer()
    sentiment_score = sid.polarity_scores(text)['compound']
    return sentiment_score

# Example usage (for each document in the corpus):
sentiment_scores = [analyze_sentiment(text) for text in corpus]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/emilykruger/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [49]:
len(corpus)

104

In [52]:
results['sentiment score'] = sentiment_scores[:2]

In [55]:
max(sentiment_scores)

0.9999

## Word2Vec

In [62]:
from gensim.models import Word2Vec

In [63]:
model = Word2Vec(sentences=corpus, vector_size=300, window=5, min_count=1, sg=1, epochs=10)

In [65]:
# Find similar words to a keyword using the fine-tuned model
similar_words_migration = model.wv.most_similar('migration', topn=5)
print("Similar words to 'migration':", similar_words_migration)
# Find similar words to a keyword using the fine-tuned model
similar_words_migrat = model.wv.most_similar('migrant', topn=5)
print("Similar words to 'migrat':", similar_words_migrat)
# Find similar words to a keyword using the fine-tuned model
similar_words_refugee = model.wv.most_similar('refugee', topn=5)
print("Similar words to 'refugee':", similar_words_refugee)

Similar words to 'migration': [('irregular', 0.6413043141365051), ('migra', 0.5688313841819763), ('migratory', 0.556184709072113), ('gration', 0.5402964949607849), ('irregu', 0.5318630337715149)]
Similar words to 'migrat': [('overcrowded', 0.6400548815727234), ('room', 0.6280505061149597), ('rently', 0.6272342205047607), ('boss', 0.6161612868309021), ('parallel', 0.6138434410095215)]
Similar words to 'refugee': [('automatic', 0.6764104962348938), ('ognised', 0.6716601252555847), ('transregional', 0.6708460450172424), ('70migrants', 0.6699479222297668), ('strictions', 0.6603562235832214)]


In [66]:
model.save('../model/word2vec_model.model')