# Data preprocessing

## Download PDFs

In [11]:
# Publication year to process; it is interpolated into the index URL and the ./files/<year>/ paths.
files_year = 2005

In [12]:
import os
import json

# Root output folder; exist_ok=True makes re-running this cell harmless.
directory = './files/'
os.makedirs(directory, exist_ok=True)

In [13]:
# JSON file read and written by init_file() and append_to_json_file().
data_file_path = './files/' + str(files_year) + '/data.json'

In [14]:
def init_file():
    """Create the JSON data file holding an empty list if it is missing.

    Reads the module-level ``data_file_path``. The parent directory is
    created first, so the call also works before the per-year folder has
    been made. An existing file is left untouched.
    """
    if os.path.exists(data_file_path):
        return

    parent_dir = os.path.dirname(data_file_path)
    if parent_dir:
        # Without this, open(..., 'w') raises FileNotFoundError on a fresh checkout
        os.makedirs(parent_dir, exist_ok=True)

    with open(data_file_path, 'w') as f:
        json.dump([], f)

def append_to_json_file(nex_data):
    """Append one entry to the JSON list stored at ``data_file_path``.

    ``init_file()`` creates the file when it is missing. The whole list is
    then loaded, extended with ``nex_data`` and written back with a
    two-space indent.
    """
    init_file()

    with open(data_file_path, 'r') as reader:
        entries = json.load(reader)

    entries.append(nex_data)

    with open(data_file_path, 'w') as writer:
        json.dump(entries, writer, indent=2)
    

In [15]:
import requests
from bs4 import BeautifulSoup
import re

# PDF URLs discovered by get_pdfs(); the download cells iterate over this list.
links = []


def get_pdfs(year):
    """Collect the PDF URL of every journal issue listed on the index page of ``year``.

    The index page is fetched from joradp.dz and each anchor whose ``href``
    contains ``MaxWin('<digits>')`` is turned into a PDF URL built from the
    year and those digits. New URLs are appended to the module-level
    ``links`` list; URLs already present are skipped so that re-running the
    cell does not queue the same PDF twice.

    Args:
        year: Year inserted into the index URL and into each PDF URL.

    Raises:
        requests.HTTPError: If the index page answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://www.joradp.dz/JRN/ZF{year}.htm"
    headers = {"User-Agent": "Mozilla/5.0"}
    issue_pattern = re.compile(r"MaxWin\('(\d+)'\)")

    # NOTE(review): verify=False turns off TLS certificate validation. It is kept
    # because the original request relied on it - confirm whether it is still needed.
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    # Fail loudly instead of silently producing an empty list from an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for anchor in soup.find_all('a', href=issue_pattern):
        match = issue_pattern.search(anchor['href'])
        if match is None:
            continue
        pdf_url = f"https://www.joradp.dz/FTP/JO-FRANCAIS/{year}/F{year}{match.group(1)}.pdf"
        if pdf_url not in links:
            links.append(pdf_url)

In [16]:
# Scrape the index page of the configured year; discovered PDF URLs are appended to `links`.
get_pdfs(files_year)



In [17]:
# Folder that receives the PDFs of the configured year.
os.makedirs(f'./files/{files_year}', exist_ok=True)

In [18]:
from pathlib import Path

# Download every URL in `links` into ./files/<year>/ and record the local paths in `files`.
files = []

for link in links:
    # Built once; double quotes inside the f-string keep this valid before Python 3.12
    local_path = f'./files/{files_year}/{link.split("/")[-1]}'

    response = requests.get(link, timeout=60)
    if not response.ok:
        # Do not store an HTTP error page under a .pdf name
        print(f"Skipping {link}: HTTP {response.status_code}")
        continue

    file_name = Path(local_path)
    file_name.write_bytes(response.content)
    files.append(local_path)

In [19]:
from pathlib import Path

# Download every URL in `links` into ./files/<year>/ and record the local paths in `files`.
files = []

for link in links:
    file_path = f'./files/{files_year}/{link.split("/")[-1]}'

    response = requests.get(link, timeout=60)
    if not response.ok:
        # Do not store an HTTP error page under a .pdf name
        print(f"Skipping {link}: HTTP {response.status_code}")
        continue

    # The with-block closes the file as soon as the PDF bytes are written
    with open(file_path, "wb") as f:
        f.write(response.content)
    files.append(file_path)


In [20]:
# Show the local paths recorded by the download loop.
files

['./files/2005/F2005001.pdf',
 './files/2005/F2005002.pdf',
 './files/2005/F2005003.pdf',
 './files/2005/F2005004.pdf',
 './files/2005/F2005005.pdf',
 './files/2005/F2005006.pdf',
 './files/2005/F2005007.pdf',
 './files/2005/F2005008.pdf',
 './files/2005/F2005009.pdf',
 './files/2005/F2005010.pdf',
 './files/2005/F2005011.pdf',
 './files/2005/F2005012.pdf',
 './files/2005/F2005013.pdf',
 './files/2005/F2005014.pdf',
 './files/2005/F2005015.pdf',
 './files/2005/F2005016.pdf',
 './files/2005/F2005017.pdf',
 './files/2005/F2005018.pdf',
 './files/2005/F2005019.pdf',
 './files/2005/F2005020.pdf',
 './files/2005/F2005021.pdf',
 './files/2005/F2005022.pdf',
 './files/2005/F2005023.pdf',
 './files/2005/F2005024.pdf',
 './files/2005/F2005025.pdf',
 './files/2005/F2005026.pdf',
 './files/2005/F2005027.pdf',
 './files/2005/F2005028.pdf',
 './files/2005/F2005029.pdf',
 './files/2005/F2005030.pdf',
 './files/2005/F2005031.pdf',
 './files/2005/F2005032.pdf',
 './files/2005/F2005033.pdf',
 './files/

## Parsing

### Create metadata

In [27]:
import re
from os.path import splitext


def create_metadata(result, doc, link, doc_id):
    """Populate ``result["metadata"]`` from the first page of a journal PDF.

    The first block (after the very first one) whose text contains
    "Correspondant" or "correspondant" supplies the Gregorian date; the block
    right before it is stored as ``hijri_date``. When no such block exists,
    including first pages with fewer than two blocks, ``hijri_date`` is the
    placeholder "UNKOWN" and ``gregorian_date`` is ``None``.

    Args:
        result: Dict that receives the ``"metadata"`` entry (mutated in place).
        doc: Opened document; ``doc[0].get_text("blocks")`` must return the
            blocks of the first page, each carrying its text at index 4.
        link: URL stored as ``document_link``.
        doc_id: PDF file name such as ``F2005001.pdf``. The three digits right
            before ``.pdf`` become ``journal_number`` (``None`` when absent)
            and the name without extension becomes ``document_id``.

    The ``year`` field is read from the module-level ``files_year``.
    """
    first_page_blocks = doc[0].get_text("blocks")

    number_match = re.search(r'(\d{3})\.pdf$', doc_id)
    journal_number = number_match.group(1) if number_match else None

    # Placeholder spelling is kept as-is because it ends up in the exported data
    correspondant_block = "UNKOWN"
    previous_block = "UNKOWN"

    # Walk consecutive (previous, current) pairs, starting with the second block
    for previous, current in zip(first_page_blocks, first_page_blocks[1:]):
        text = current[4]
        if "Correspondant" in text or "correspondant" in text:
            correspondant_block = text.strip()
            previous_block = previous[4].strip()
            break

    # Day (optionally "1er"), month name, four-digit year
    gregorian_pattern = r'(\d{1,2}(?:er)?\s+\w+\s+\d{4})'
    gregorian_match = re.search(gregorian_pattern, correspondant_block)
    gregorian_date = gregorian_match.group(1) if gregorian_match else None

    result["metadata"] = {
        "document_id": splitext(doc_id)[0],
        "domain" : "",
        "year": files_year,
        "journal_number": journal_number,
        "hijri_date": previous_block,
        "gregorian_date": gregorian_date,
        "document_link": link
    }


### Extract data

In [28]:
import fitz 
import os
import re 

# Exact block texts that verify_title_in_beginning() treats as section headings.
type_dictionary = {
    "CONVENTIONS ET ACCORDS INTERNATIONAUX",
    "LOIS",
    "DECRETS",
    "DECISIONS",
    "ARRETES, DECISIONS ET AVIS",
    "DECISIONS INDIVIDUELLES",
    "DECISIONS ET AVIS",
    "COMMUNICATIONS",
    "ANNONCES",
    "AVIS",
    "AVIS ET LOIS",
    "REGLEMENTS INTERIEURS",
}

# Matches the standalone upper-case word ANNEXE anywhere in a block.
annex_pattern = re.compile(r'\bANNEXE\b')

def verify_title_in_beginning(blocks):
    """Move a trailing heading block to the front of a page's block list.

    A block counts as a heading when its stripped text is an entry of
    ``type_dictionary`` or matches ``annex_pattern``. The last block is
    checked first, then the one before it; the first hit is moved to
    position 0 and the relative order of the other blocks is kept. When
    neither qualifies, or the list is empty, ``blocks`` is returned as is.
    """
    def looks_like_heading(block):
        text = block[4].strip()
        return (text and text in type_dictionary) or annex_pattern.search(text)

    if not blocks:
        return blocks

    if looks_like_heading(blocks[-1]):
        return [blocks[-1]] + blocks[:-1]

    if len(blocks) >= 2 and looks_like_heading(blocks[-2]):
        return [blocks[-2]] + blocks[:-2] + blocks[-1:]

    return blocks
def check_sommaire(page_blocks):
    """Return True when any block's text, with spaces removed, contains "SOMMAIRE"."""
    return any(
        "SOMMAIRE" in entry[4].replace(" ", "").strip()
        for entry in page_blocks
    )
            
def extract_pdf_blocks(result, doc):
    """Store the text of a journal PDF, minus cover and summary pages, in ``result["content"]``.

    Every page after the first is read with ``page.get_text("blocks")``:

    * pages with no blocks, or for which ``check_sommaire`` is true, are skipped;
    * the first block of each page is dropped (treated as the page header) and
      the rest is reordered by ``verify_title_in_beginning``;
    * blocks are collected until the first one matching ``annex_pattern``; the
      remainder of that page is ignored, later pages are processed normally.

    The last collected block is then removed and the stripped, non-empty block
    texts are joined with newlines.

    Args:
        result: Dict that receives the ``"content"`` string (mutated in place).
        doc: Opened document supporting ``doc[1:]`` iteration over pages.
    """
    global_blocks = []
    for page in doc[1:]:
        page_blocks = page.get_text("blocks")
        if not page_blocks or check_sommaire(page_blocks):
            continue

        # Exclude the header in each page and verify/reorder blocks
        page_blocks_ordered = verify_title_in_beginning(page_blocks[1:])
        for block in page_blocks_ordered:
            if annex_pattern.search(block[4]):
                break
            global_blocks.append(block)

    # Remove the last block of the last page (footer)
    # NOTE(review): if the last page was cut short at ANNEXE or skipped, the
    # block removed here is not a footer - confirm this is acceptable.
    global_blocks = global_blocks[:-1]

    # Convert blocks to text
    stripped_texts = (block[4].strip() for block in global_blocks)
    result["content"] = "\n".join(text for text in stripped_texts if text)

In [31]:
# Parse every downloaded PDF into {"metadata": ..., "content": ...} and collect the dicts in `result`.
result = []
for file in files:
    file_content = {
        "metadata": {},
        "content": ""
    }
    pdf_path = file
    print(file)
    doc_id = file.split('/')[-1]
    link = f"https://www.joradp.dz/FTP/JO-FRANCAIS/{files_year}/{doc_id}"
    # Open the document once; the with-block closes it after parsing
    # (the previous extra fitz.open() call left a handle open per file).
    with fitz.open(pdf_path) as doc:
        create_metadata(file_content, doc, link, doc_id)
        extract_pdf_blocks(file_content, doc)
    result.append(file_content)


./files/2005/F2005001.pdf
./files/2005/F2005002.pdf
./files/2005/F2005003.pdf
./files/2005/F2005004.pdf
./files/2005/F2005005.pdf
./files/2005/F2005006.pdf
./files/2005/F2005007.pdf
./files/2005/F2005008.pdf
./files/2005/F2005009.pdf
./files/2005/F2005010.pdf
./files/2005/F2005011.pdf
./files/2005/F2005012.pdf
./files/2005/F2005013.pdf
./files/2005/F2005014.pdf
./files/2005/F2005015.pdf
./files/2005/F2005016.pdf
./files/2005/F2005017.pdf
./files/2005/F2005018.pdf
./files/2005/F2005019.pdf
./files/2005/F2005020.pdf
./files/2005/F2005021.pdf
./files/2005/F2005022.pdf
./files/2005/F2005023.pdf
./files/2005/F2005024.pdf
./files/2005/F2005025.pdf
./files/2005/F2005026.pdf
./files/2005/F2005027.pdf
./files/2005/F2005028.pdf
./files/2005/F2005029.pdf
./files/2005/F2005030.pdf
./files/2005/F2005031.pdf
./files/2005/F2005032.pdf
./files/2005/F2005033.pdf
./files/2005/F2005034.pdf
./files/2005/F2005035.pdf
./files/2005/F2005036.pdf
./files/2005/F2005037.pdf
./files/2005/F2005038.pdf
./files/2005

In [32]:
import os
import json

# Write all parsed documents of the year to a single UTF-8 JSON file.
directory = f"files/{files_year}/"
os.makedirs(directory, exist_ok=True)

# `directory` already ends with "/", so join it instead of adding another slash
output_path = os.path.join(directory, f"output_{files_year}.json")
with open(output_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps accented characters readable in the file
    json.dump(result, f, indent=2, ensure_ascii=False)