# Text analysis and RAG for EU document analysis

The following code was used to understand the role of EU's documents in either facilitating or hindering the role of data exchange in the defence sector

# 1. RAG 

For deveoping the RAG a Youtube video by Thu Vu was taken as the base code which was modified to fit the current context. https://www.youtube.com/watch?v=EFUE4DHiAPM&list=LL&index=1&t=1875s

In [91]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv
from collections import defaultdict


In [92]:
load_dotenv()

True

In [93]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") 

## Define the LLM

In [94]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

## 1.1 Process PDF documents

### Load all PDF documents



In [134]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])

In [135]:
# Load and split documents
all_chunks = []
chunks_by_doc = defaultdict(list)

pdf_dir = "data"
for filename in os.listdir(pdf_dir):
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(pdf_dir, filename)
        loader = PyPDFLoader(file_path)
        pages = loader.load()
        chunks = text_splitter.split_documents(pages)
        all_chunks.extend(chunks)
        chunks_by_doc[filename] = chunks

In [136]:
# Count total pages
total_pages = sum(len(PyPDFLoader(os.path.join(pdf_dir, f)).load()) 
                  for f in os.listdir(pdf_dir) if f.endswith(".pdf"))

# Count total chunks
total_chunks = len(all_chunks)

print(f"Total pages: {total_pages}")
print(f"Total chunks: {total_chunks}")


Total pages: 1818
Total chunks: 8547


### Create embeddings

In [137]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()

In [138]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

### Create vector database

In [139]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    
    # Create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    # Ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = [] 
    for chunk, id in zip(chunks, ids):     
        if id not in unique_ids:       
            unique_ids.add(id)
            unique_chunks.append(chunk) 

    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks, 
                                        ids=list(unique_ids),
                                        embedding=embedding_function, 
                                        persist_directory = vectorstore_path)

    vectorstore.persist()
    
    return vectorstore


In [140]:
# Create vectorstore
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_chroma")

## 1.2. QUERY FOR RELEVANT DATA

In [141]:
# Load vectorstore
vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [142]:
print(f"Number of documents in vectorstore: {len(vectorstore.get()['documents'])}")

Number of documents in vectorstore: 1916


In [143]:
# Create retriever and get relevant chunks
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("Is data exchange in defence mentioned in the document?")
relevant_chunks


[Document(metadata={'creationdate': '2025-03-19T11:46:08+01:00', 'creator': 'PyPDF', 'moddate': '2025-03-19T11:46:08+01:00', 'msip_label_6bd9ddd1-4d20-43f6-abfa-fc3c07406f94_actionid': '3eba1a90-0142-4259-9e27-9a4503ee2670', 'msip_label_6bd9ddd1-4d20-43f6-abfa-fc3c07406f94_contentbits': '0', 'msip_label_6bd9ddd1-4d20-43f6-abfa-fc3c07406f94_enabled': 'true', 'msip_label_6bd9ddd1-4d20-43f6-abfa-fc3c07406f94_method': 'Standard', 'msip_label_6bd9ddd1-4d20-43f6-abfa-fc3c07406f94_name': 'Commission Use', 'msip_label_6bd9ddd1-4d20-43f6-abfa-fc3c07406f94_setdate': '2025-03-17T15:45:54Z', 'msip_label_6bd9ddd1-4d20-43f6-abfa-fc3c07406f94_siteid': 'b24c8b06-522c-46fe-9080-70926f8dddb1', 'page': 20, 'page_label': '21', 'producer': 'PyPDF', 'source': 'data/29-european-defence-readiness2030.pdf', 'total_pages': 23}, page_content='20 \non the deep and extensive transatlantic supply chain, which should be mutually beneficial. The \nbilateral dialogue on Security and Defence can be enhanced to further 

In [144]:
for doc in relevant_chunks:
    print(doc.page_content[:1000])  # Preview first 500 chars

20 
on the deep and extensive transatlantic supply chain, which should be mutually beneficial. The 
bilateral dialogue on Security and Defence can be enhanced to further strengthen cooperation 
in fields such as cyber, maritime security and space, discuss procurement issues and tackle any 
other matters of mutual concern. 
The United Kingdom is an essential European ally with which cooperation on security and 
defence should be enhanced in mutual interest, starting with a potential Security and Defence 
partnership. Building on the set of solid agreements in place, bilateral security and defenc e 
cooperation can expand, ranging from external crisis management to defence industrial 
policies.  
Norway is a full partner in EU defence programmes through its contribution to the EU budget. 
The recently launched Security and Defence Partnership provides a comprehensive and 
structured political framework for strengthening further dialogue and cooperation.
significant effects in the area of

## 1.3. Generating answers

In [145]:
# Prompt template
PROMPT_TEMPLATE = """
You are an assistant for structured question answering based on retrieved document context.

Given the following context, answer the question clearly, and provide:
- the direct source text used
- your reasoning for the answer

{context}

---

Respond in the following JSON structure:
{{
  "answer": "...",
  "sources": "...",
  "reasoning": "..."
}}

Question: {question}
"""

In [150]:
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Pydantic model for output
class AnswerWithSources(BaseModel):
    answer: str = Field(description="Answer to the question")
    sources: str = Field(description="Exact source text and location used from the context")
    reasoning: str = Field(description="Explanation of how the answer is supported by the context")

# Format context
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Questions
questions = [
    "Is the document applicable to data exchange in defence sector or is this out of scope for this document?"
]


In [151]:
# Run RAG pipeline per document
rows = []
for doc_name, chunks in chunks_by_doc.items():
    retriever = create_vectorstore(
        chunks, embedding_function, f"vectorstore_chroma/{doc_name}"
    ).as_retriever()

    row = {"document": doc_name}
    for i, question in enumerate(questions, 1):
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(AnswerWithSources, strict=True)
        )
        try:
            result = rag_chain.invoke(question)
            row[f"Q{i}_Answer"] = result.answer
            row[f"Q{i}_Source"] = result.sources
            row[f"Q{i}_Reasoning"] = result.reasoning
        except Exception as e:
            row[f"Q{i}_Answer"] = "Error"
            row[f"Q{i}_Source"] = ""
            row[f"Q{i}_Reasoning"] = str(e)
    rows.append(row)



In [152]:
# Build final table
df = pd.DataFrame(rows)
df.set_index("document", inplace=True)
df


Unnamed: 0_level_0,Q1_Answer,Q1_Source,Q1_Reasoning
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2-shaping-europes-digital-future.pdf,"Yes, the document is applicable to data exchan...","""building synergies between civilian cyber res...",The text explicitly mentions the need to build...
15-EU-ideas-data-driven-economy.pdf,The document is primarily focused on the devel...,The document discusses 'facilitating the use o...,The explicit focus on areas such as smart grid...
23-european-single-access-point.pdf,The document does not specifically indicate ap...,"""validations should not relate to the content ...",The text discusses the structures and responsi...
19-digital-decade-policy-programme-2030.pdf,The document is out of scope for data exchange...,"""it is necessary for the Union to identify sys...",The content of the document focuses on digital...
7-edf-work-programme-2025.pdf,"Yes, the document is applicable to data exchan...",Activities that aim to increase interoperabili...,The context explicitly mentions activities tha...
1-european-strategy-for-data.pdf,The document is out of scope for data exchange...,The context discusses data interoperability is...,The document focuses on sectors like health an...
8-edf-2025-call-topic-descriptions.pdf,"Yes, the document is applicable to data exchan...",Activities that aim to increase interoperabili...,The context explicitly states that activities ...
30-SAFE-regulation.pdf,The document is applicable to data exchange in...,"""The proposed Regulation is an emergency measu...",The document explicitly discusses financial as...
13-EDIRPA-implementing-decision.pdf,The document is applicable to data exchange in...,Those funding priorities shall aim to ensure t...,The context discusses funding priorities and t...
17-digital-markets-act.pdf,The document does not specifically mention app...,"""This Regulation respects the fundamental righ...",The context discusses the general principles a...


In [153]:
import openpyxl
df.to_excel("defence_data_exchange_analysis2.xlsx")

# 2. TEXT ANALYSIS

In [None]:
#import necessary libraries

import fitz
import spacy
from textacy import text_stats
import pandas as pd
import os
from PyPDF2 import PdfReader
from collections import defaultdict
import re

# Load spaCy language model
nlp = spacy.load('en_core_web_sm')

In [None]:
# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Function to get most common n-grams
def get_top_ngrams(doc, ngram_size=2, top_n=10):
    ngrams = extract.ngrams(doc, n=ngram_size, filter_stops=True, filter_punct=True, min_freq=2)
    ngram_freq = Counter([str(ngram) for ngram in ngrams])
    return ngram_freq.most_common(top_n)

# Function to extract key terms using TextRank
def extract_key_terms(doc, top_n=10):
    key_terms = extract.keyterms.textrank(doc, topn=top_n)
    return [term for term, score in key_terms]

In [129]:
pdf_dir = "data"
results = []

for filename in os.listdir(pdf_dir):
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(pdf_dir, filename)
        
        try:
            # Extract text
            text = extract_text_from_pdf(file_path)
            
            # Process text with spaCy
            doc = nlp(text)

            # Compute metrics
            num_sents = text_stats.basics.n_sents(doc)
            num_words = text_stats.basics.n_words(doc)
            num_unique_words = text_stats.basics.n_unique_words(doc)
            lexical_density = text_stats.diversity.ttr(doc)
            readability_score = text_stats.readability.flesch_reading_ease(doc)

            # Extract top bigrams and trigrams
            top_bigrams = get_top_ngrams(doc, ngram_size=2)
            top_trigrams = get_top_ngrams(doc, ngram_size=3)

            # Extract key terms
            key_terms = extract_key_terms(doc)

            # Append results
            results.append({
                'Filename': filename,
                'Number of Sentences': num_sents,
                'Number of Words': num_words,
                'Number of Unique Words': num_unique_words,
                'Lexical Density (TTR)': lexical_density,
                'Flesch Reading Ease Score': readability_score,
                'Top 10 Bigrams': top_bigrams,
                'Top 10 Trigrams': top_trigrams,
                'Top 10 Key Terms': key_terms
            })

            print(f"Processed: {filename}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Create DataFrame and export to Excel
df = pd.DataFrame(results)
df.to_excel("output.xlsx", index=False)

print("All documents processed. Results saved to 'output.xlsx'.")

Processed: 2-shaping-europes-digital-future.pdf
Processed: 15-EU-ideas-data-driven-economy.pdf
Processed: 23-european-single-access-point.pdf
Processed: 19-digital-decade-policy-programme-2030.pdf
Processed: 7-edf-work-programme-2025.pdf
Processed: 1-european-strategy-for-data.pdf
Processed: 8-edf-2025-call-topic-descriptions.pdf
Processed: 30-SAFE-regulation.pdf
Processed: 13-EDIRPA-implementing-decision.pdf
Processed: 17-digital-markets-act.pdf
Processed: 28-free-flow-of-nonpersonal-data.pdf
Processed: 18-digital-europe-programme.pdf
Processed: 11EDIP.pdf
Processed: 20-chips-act.pdf
Processed: 27-defence-investment-gap-analysis.pdf
Processed: 6-european-defence-industrial-strategy.pdf
Processed: 10-AI-act.pdf
Processed: 26-ASAP-implementation-report.pdf
Processed: 31-accommodating-defence-expenditure.pdf
Processed: 9-edf-indicative-multiannual-perspective-2025-2027.pdf
Processed: 25-ASAP-communication-doc.pdf
Processed: 14-EDIRPA-work-programme-annex.pdf
Processed: 5-gdpr.pdf
Process

### Keywords

In [133]:
# Canonical keyword groups with data exchange in defence-specific terms
keyword_map = {
    "defence": ["defence", "defense"],
    "military": ["military", "armed forces"],
    "security": ["security", "secure", "securing"],
    "data exchange": ["data exchange", "information exchange"],
    "data sharing": ["data sharing", "information sharing"],
    "interoperability": ["interoperability", "inter-operability", "interoperable"],
    "sovereignty": ["sovereignty", "sovereign"],
    "trust": ["trust", "trusted", "trustworthy"],
    "privacy": ["privacy", "private", "confidentiality"],
    "data exchange in defence": [
        "military data exchange", "defence data exchange", "interoperability of defence systems",
        "defence data sharing", "military interoperability", "secure communication systems",
        "tactical data link", "coalition data exchange", "battlefield information sharing",
        "command and control interoperability", "defence information sharing", "military information sharing",
        "defence information exchange"
    ]
}

# Context-based exclusion for "private"
exclude_if_near = {"sector", "company", "organisation", "organization", "entity", "enterprise", "partner"}

results = []

for filename in os.listdir("data"):
    if filename.endswith(".pdf"):
        reader = PdfReader(os.path.join("data", filename))
        full_text = "\n".join([page.extract_text() or "" for page in reader.pages]).lower()

        # Process with spaCy
        doc = nlp(full_text)
        lemmatized_tokens = [token.lemma_ for token in doc if token.is_alpha]
        text_for_phrases = doc.text.lower()

        doc_result = {"document": filename}

        for canon, variants in keyword_map.items():
            if canon == "privacy":
                # Special handling for "private"
                privacy_count = 0
                for i, token in enumerate(doc):
                    if token.lemma_ == "private":
                        context = doc[max(0, i - 3): i + 4]
                        context_words = {tok.lemma_ for tok in context}
                        if not context_words & exclude_if_near:
                            privacy_count += 1

                # Add additional phrases like "privacy", "confidentiality"
                phrase_matches = sum(
                    len(re.findall(re.escape(phrase), text_for_phrases))
                    for phrase in variants if phrase != "private"
                )

                total = privacy_count + phrase_matches
                doc_result[canon] = total

            else:
                # Count lemmatized token matches
                token_matches = sum(1 for lemma in lemmatized_tokens if lemma in variants)
                # Count phrase matches
                phrase_matches = sum(
                    len(re.findall(re.escape(phrase), text_for_phrases))
                    for phrase in variants if " " in phrase
                )
                total = token_matches + phrase_matches
                doc_result[canon] = total

        results.append(doc_result)

# Output as DataFrame
df_fuzzy_keywords = pd.DataFrame(results)
df_fuzzy_keywords.set_index("document", inplace=True)
df_fuzzy_keywords

Unnamed: 0_level_0,defence,military,security,data exchange,data sharing,interoperability,sovereignty,trust,privacy,data exchange in defence
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2-shaping-europes-digital-future.pdf,0,0,14,0,1,3,2,15,6,0
15-EU-ideas-data-driven-economy.pdf,0,0,17,1,1,6,1,14,9,0
23-european-single-access-point.pdf,0,0,10,0,0,2,0,2,3,0
19-digital-decade-policy-programme-2030.pdf,2,0,19,0,0,2,4,3,16,0
7-edf-work-programme-2025.pdf,188,28,19,0,1,12,1,0,7,0
1-european-strategy-for-data.pdf,0,0,26,0,25,24,2,16,17,0
8-edf-2025-call-topic-descriptions.pdf,459,127,116,14,7,116,17,9,14,5
30-SAFE-regulation.pdf,198,13,68,0,0,8,1,0,5,0
13-EDIRPA-implementing-decision.pdf,5,0,0,0,0,0,0,0,0,0
17-digital-markets-act.pdf,1,0,16,1,0,32,0,0,9,0


In [154]:
df_fuzzy_keywords.to_excel("keywords_documents1.xlsx")
