In [11]:
# import libraries and packages
import pandas as pd
import numpy as np
import gensim
from docx import Document
import google.cloud.aiplatform
import faiss
import numpy

In [12]:
from google.cloud import storage
import pdfplumber

def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a single object from a Cloud Storage bucket to a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Name of the object (blob) inside the bucket.
        destination_file_name: Local path the object's content is written to.

    Returns:
        None. The only effect is the file written at ``destination_file_name``.
    """
    # A client is created per call with no explicit arguments.
    gcs_client = storage.Client()
    gcs_client.bucket(bucket_name).blob(source_blob_name).download_to_filename(
        destination_file_name
    )

# Fetch the source PDF from the bucket into the current working directory.
# NOTE(review): the recorded output of this cell ends with
# "PermissionError: [Errno 13] Permission denied: 'EU AI Act.pdf'" -- confirm the
# working directory is writable and the file is not held open by another program.
download_blob(
    bucket_name='dxc_tech_1a',
    source_blob_name='EU AI Act.pdf',
    destination_file_name='EU AI Act.pdf',
)

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        str: The extracted text of each page in page order, each page followed
        by a newline. A page for which ``extract_text()`` yields nothing
        contributes only that newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (e.g. a scanned image); `or ""` keeps the concatenation below from
        # raising TypeError on such pages.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join once instead of growing a string with += inside the loop.
    return "".join(f"{page_text}\n" for page_text in page_texts)

# Read the downloaded PDF into a single string; later cells clean and tokenize `text`.
text = extract_text_from_pdf(pdf_path='EU AI Act.pdf')

PermissionError: [Errno 13] Permission denied: 'EU AI Act.pdf'

In [3]:
# remove unwanted text #
import re

# remove emails
# The top-level-domain class is [A-Za-z]: inside a character class '|' is a
# literal pipe, not an alternation, so it must not appear there.
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

# remove URLs
# 'http\S+' already matches anything starting with 'https', and the pattern has
# no ^/$ anchors, so no extra alternative or re.MULTILINE flag is needed.
text = re.sub(r'http\S+|www\S+', '', text)

# normalize white space
# Every whitespace run (including newlines) collapses to one space, so `text`
# is a single line after this statement.
text = re.sub(r'\s+', ' ', text).strip()

In [4]:
# text tokenization #
from gensim.utils import simple_preprocess

# tokenize the text
# Input is the module-level `text`; the result is bound to `tokens`, which the
# following cells filter and lemmatize.
# NOTE(review): simple_preprocess is understood to lowercase its input and to
# discard numeric and very short/long tokens -- confirm that losing article and
# chapter numbers here is acceptable for the later section extraction.
tokens = simple_preprocess(text)

In [5]:
# stop words removal #
from nltk.corpus import stopwords
import nltk

# make sure the stop-word corpus is available, then load the English list as a set
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# keep only the tokens that are not in the stop-word set (order is preserved)
tokens = list(filter(lambda token: token not in stop_words, tokens))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jenni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# lemmatization #
import spacy
from spacy.cli import download as download_spacy_model

# load spaCy model
# The model package is fetched only when loading fails, instead of running a
# shell download on every execution (which re-fetches the ~43 MB wheel and can
# target a different interpreter than the one the kernel runs on).
try:
    nlp = spacy.load('en_core_web_md')
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    # NOTE(review): a kernel restart may still be needed right after a fresh
    # install before the package becomes importable -- confirm on first run.
    download_spacy_model('en_core_web_md')
    nlp = spacy.load('en_core_web_md')

# lemmatize using spaCy
# `text` is overwritten here with the space-joined tokens, so from this point it
# no longer holds the cleaned document text.
text = ' '.join(tokens)
# NOTE(review): nlp() rejects inputs longer than nlp.max_length characters --
# confirm the joined text stays below that limit for this document.
doc = nlp(text)
tokens = [token.lemma_ for token in doc]

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     --------------------------------------- 0.0/42.8 MB 330.3 kB/s eta 0:02:10
     --------------------------------------- 0.1/42.8 MB 409.6 kB/s eta 0:01:45
     --------------------------------------- 0.1/42.8 MB 654.9 kB/s eta 0:01:06
     ---------------------------------------- 0.5/42.8 MB 2.2 MB/s eta 0:00:20
      --------------------------------------- 1.0/42.8 MB 4.0 MB/s eta 0:00:11
     - -------------------------------------- 2.1/42.8 MB 6.7 MB/s eta 0:00:07
     --- ------------------------------------ 3.3/42.8 MB 9.6 MB/s eta 0:00:05
     --- ------------------------------------ 4.0/42

In [None]:
import re
from datetime import datetime

def extract_and_categorize_sections(text, keywords):
    """Split ``text`` into header/section pairs grouped by header keyword.

    A header is a line that starts with ``<keyword> `` followed by optional
    digits, an optional ':' or '.', one whitespace character and the rest of
    that line. Matching is case-sensitive. Because the whitespace character
    may itself be a line break, a header such as "article 5" that ends its
    line also takes in the following line.

    Headers of all keywords are processed in document order. Each header is
    paired with the text that follows it, up to the next header of any
    keyword (or the end of ``text``), so ``headers[i]`` and ``sections[i]``
    always belong together. Text before the first header is not attached to
    any section.

    Args:
        text: The document text to scan.
        keywords: Iterable of header keywords, e.g. ["chapter", "article"].

    Returns:
        dict: ``{keyword: {"headers": [...], "sections": [...]}}`` for every
        keyword, plus a ``"created_at"`` entry holding a naive UTC timestamp
        in ISO 8601 format.
    """
    # Local import: the surrounding cell only imports `datetime` itself.
    from datetime import timezone

    # Initialize containers for different sections
    data = {keyword: {"headers": [], "sections": []} for keyword in keywords}

    # Collect the header matches of every keyword first; they are ordered by
    # position afterwards so that section boundaries follow the document, not
    # the order of `keywords`.
    found = []
    for keyword in keywords:
        pattern = re.compile(rf'^({re.escape(keyword)} \d*[:.]?\s.*)', re.MULTILINE)
        for match in pattern.finditer(text):
            found.append((match.start(), match.end(), keyword, match.group(1)))
    found.sort(key=lambda entry: entry[0])

    # Drop a match that begins inside the previously accepted header (possible
    # when a keyword is listed twice, or a header spans into the next line).
    headers = []
    for entry in found:
        if headers and entry[0] < headers[-1][1]:
            continue
        headers.append(entry)

    # A section runs from the end of its header to the start of the next one.
    for index, (_, header_end, keyword, header) in enumerate(headers):
        if index + 1 < len(headers):
            section_end = headers[index + 1][0]
        else:
            section_end = len(text)
        data[keyword]["headers"].append(header)
        data[keyword]["sections"].append(text[header_end:section_end].strip())

    # Add a timestamp (same naive-UTC format as datetime.utcnow().isoformat(),
    # without relying on the deprecated utcnow()).
    data["created_at"] = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return data

# Keywords that introduce a header line (matched case-sensitively)
keywords = ["chapter", "article", "annex"]

# NOTE(review): earlier cells collapse all whitespace in `text` to single spaces
# and later rebuild it by joining tokens with ' ', so `text` holds no line
# breaks here and a line-anchored header can only match at position 0 --
# confirm this is the intended input.
categorized_data = extract_and_categorize_sections(text, keywords)

# Show each header with the first 500 characters of its section for a visual check
for keyword in keywords:
    print(f"\n{keyword}s:")
    for header, section in zip(
        categorized_data[keyword]["headers"],
        categorized_data[keyword]["sections"],
    ):
        print(f"Header: {header}\nSection: {section[:500]}...")

In [None]:
from getpass import getpass
from elasticsearch import Elasticsearch

# Create a connection to Elasticsearch
# Both values are read interactively with getpass, so neither the Cloud ID nor
# the password is stored in the notebook source or echoed while typing.
CLOUD_ID = getpass("Elastic deployment Cloud ID")

CLOUD_PASSWORD = getpass("Elastic deployment Password")

# Authenticates as the "elastic" user with the password entered above.
client = Elasticsearch(
  cloud_id = CLOUD_ID,
  basic_auth=("elastic", CLOUD_PASSWORD) # Alternatively use `api_key` instead of `basic_auth`
)

# Test connection to Elasticsearch
# NOTE(review): the printed info is kept in the cell output -- check it for
# deployment details before sharing the notebook.
print(client.info())