In [12]:
# Year of the publications to process; used below to build the index URL,
# the download folder and the JSON output path.
files_year = 2006

In [13]:
import os
import json

# Root folder for everything this notebook writes; created up front so later
# cells can assume it exists.
directory = './files/'
os.makedirs(directory, exist_ok=True)

In [14]:
# JSON file that accumulates the extracted text for the selected year.
data_file_path = f'./files/{files_year}/data.json'

In [15]:
def init_file():
    """Create the JSON data file containing an empty list if it is missing.

    The target is the module-level ``data_file_path``. Missing parent
    directories are created first, so the function also works before the
    per-year folder has been made. An existing file is left untouched.

    Raises:
        OSError: if the directory or the file cannot be created.
    """
    if os.path.exists(data_file_path):
        return

    parent_dir = os.path.dirname(data_file_path)
    if parent_dir:  # os.makedirs('') raises, so skip it for bare file names
        os.makedirs(parent_dir, exist_ok=True)

    with open(data_file_path, 'w') as f:
        json.dump([], f)

def append_to_json_file(nex_data):
    """Append one record to the JSON list stored at ``data_file_path``.

    The file is created (as an empty list) by ``init_file`` when missing,
    then loaded, extended with ``nex_data`` and written back with a
    two-space indent.

    The new content is written to a sibling ``.tmp`` file and swapped in with
    ``os.replace``, so a failure while serialising (for example a value that
    is not JSON-serialisable, or an interrupted kernel) leaves the previously
    stored records intact instead of a truncated file.

    Args:
        nex_data: JSON-serialisable record to append.

    Raises:
        TypeError: if ``nex_data`` cannot be serialised to JSON.
        json.JSONDecodeError: if the existing file does not hold valid JSON.
        OSError: if the file cannot be read or written.
    """
    init_file()
    with open(data_file_path, 'r') as f:
        existing_data = json.load(f)
    existing_data.append(nex_data)

    tmp_path = f'{data_file_path}.tmp'
    try:
        with open(tmp_path, 'w') as f:
            json.dump(existing_data, f, indent=2)
        os.replace(tmp_path, data_file_path)
    finally:
        # Only present if the write or the swap failed; do not leave it behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    

## Web scraping (collect the PDF links)

In [16]:
import requests
from bs4 import BeautifulSoup
import re

# PDF URLs collected by get_pdfs(); reset whenever this cell is re-run.
links = []

def get_pdfs(year):
    """Collect the PDF URLs listed on the joradp.dz index page for ``year``.

    Fetches ``https://www.joradp.dz/JRN/ZF{year}.htm``, looks for every
    ``<a>`` whose ``href`` contains ``MaxWin('<digits>')`` and appends the
    corresponding PDF URL to the module-level ``links`` list. URLs already in
    ``links`` are not added again, so calling the function twice for the same
    year does not create duplicates.

    Args:
        year: Year to look up; interpolated into the index and PDF URLs.

    Returns:
        None. Results are accumulated in the global ``links``.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            response from the index page.
    """
    url = f"https://www.joradp.dz/JRN/ZF{year}.htm"
    headers = {"User-Agent": "Mozilla/5.0"}
    issue_pattern = re.compile(r"MaxWin\('(\d+)'\)")

    # NOTE(review): verify=False turns off TLS certificate validation. It is
    # kept because the reason it was added is not visible here; confirm whether
    # the site's certificate now validates and drop it if so.
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    # Fail loudly instead of parsing an error page and collecting nothing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for anchor in soup.find_all('a', href=issue_pattern):
        match = issue_pattern.search(anchor['href'])
        if not match:
            continue
        value = match.group(1)
        pdf_url = f"https://www.joradp.dz/FTP/JO-FRANCAIS/{year}/F{year}{value}.pdf"
        if pdf_url not in links:
            links.append(pdf_url)

In [17]:
# Fill `links` with the PDF URLs published for the configured year.
get_pdfs(files_year)



In [18]:
# Show the collected URLs as the cell output.
links

['https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006001.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006002.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006003.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006004.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006005.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006006.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006007.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006008.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006009.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006010.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006011.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006012.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006013.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006014.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006015.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/2006/F2006016.pdf',
 'https://www.joradp.dz/FTP/JO-FRANCAIS/

## Download PDFs

In [19]:
# Make sure the per-year download folder exists before anything is saved.
os.makedirs(f'./files/{files_year}', exist_ok=True)

In [20]:
from pathlib import Path

# Local paths of the PDFs that were downloaded successfully.
files = []

for link in links:
    # Build the target path once, outside the f-string: reusing the same quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    pdf_name = link.split('/')[-1]
    pdf_path = f'./files/{files_year}/{pdf_name}'

    try:
        response = requests.get(link, timeout=60)
        # Do not save an HTTP error page under a .pdf name.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'skipping {link}: {exc}')
        continue

    file_name = Path(pdf_path)
    file_name.write_bytes(response.content)
    files.append(pdf_path)

In [21]:
# Show the local paths of the downloaded PDFs as the cell output.
files

['./files/2006/F2006001.pdf',
 './files/2006/F2006002.pdf',
 './files/2006/F2006003.pdf',
 './files/2006/F2006004.pdf',
 './files/2006/F2006005.pdf',
 './files/2006/F2006006.pdf',
 './files/2006/F2006007.pdf',
 './files/2006/F2006008.pdf',
 './files/2006/F2006009.pdf',
 './files/2006/F2006010.pdf',
 './files/2006/F2006011.pdf',
 './files/2006/F2006012.pdf',
 './files/2006/F2006013.pdf',
 './files/2006/F2006014.pdf',
 './files/2006/F2006015.pdf',
 './files/2006/F2006016.pdf',
 './files/2006/F2006017.pdf',
 './files/2006/F2006018.pdf',
 './files/2006/F2006019.pdf',
 './files/2006/F2006020.pdf',
 './files/2006/F2006021.pdf',
 './files/2006/F2006022.pdf',
 './files/2006/F2006023.pdf',
 './files/2006/F2006024.pdf',
 './files/2006/F2006025.pdf',
 './files/2006/F2006026.pdf',
 './files/2006/F2006027.pdf',
 './files/2006/F2006028.pdf',
 './files/2006/F2006029.pdf',
 './files/2006/F2006030.pdf',
 './files/2006/F2006031.pdf',
 './files/2006/F2006032.pdf',
 './files/2006/F2006033.pdf',
 './files/

In [None]:
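# Per-year entries of `files` that needed special handling.
# NOTE(review): presumably PDFs that could not be converted/OCR'd and were
# skipped by restarting the loop after them (2006 lists files[4] and the OCR
# loop starts at files[5:]) - confirm.
# 2001: files[6], files[38], files[61]
# 2002: files[2]
# 2003: files[2]
# 2004: files[10]
# 2005: files[8], files[30], files[63]
# 2006: files[4]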

In [24]:
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# OCR each downloaded PDF and append one record per PDF to the JSON data file.
# NOTE(review): the hard-coded start index 5 skips the first five entries of
# `files`; presumably they were already processed (or deliberately skipped) in
# an earlier run - confirm before a fresh Restart & Run All.
for pdf_file in files[5:]:
    # Render every page of the PDF to an image at 300 dpi.
    images = convert_from_path(pdf_file, dpi=300)

    # Progress marker: printed before the slow OCR step starts.
    print(pdf_file)

    # NOTE(review): the record does not say which PDF it came from, so it can
    # only be matched back to a file by its position in the JSON list.
    tmp_data = {
        # Page count taken from the rendered pages, so a PDF that yields no
        # pages is stored as 0 rather than an empty string.
        'pages': len(images),
        # One French-language OCR result per page, in page order.
        'content': [pytesseract.image_to_string(img, lang="fra") for img in images],
    }
    append_to_json_file(tmp_data)

./files/2006/F2006006.pdf
./files/2006/F2006007.pdf
./files/2006/F2006008.pdf
./files/2006/F2006009.pdf
./files/2006/F2006010.pdf
./files/2006/F2006011.pdf
./files/2006/F2006012.pdf
./files/2006/F2006013.pdf
./files/2006/F2006014.pdf
./files/2006/F2006015.pdf
./files/2006/F2006016.pdf
./files/2006/F2006017.pdf
./files/2006/F2006018.pdf
./files/2006/F2006019.pdf
./files/2006/F2006020.pdf
./files/2006/F2006021.pdf
./files/2006/F2006022.pdf
./files/2006/F2006023.pdf
./files/2006/F2006024.pdf
./files/2006/F2006025.pdf
./files/2006/F2006026.pdf
./files/2006/F2006027.pdf
./files/2006/F2006028.pdf
./files/2006/F2006029.pdf
./files/2006/F2006030.pdf
./files/2006/F2006031.pdf
./files/2006/F2006032.pdf
./files/2006/F2006033.pdf
./files/2006/F2006034.pdf
./files/2006/F2006035.pdf
./files/2006/F2006036.pdf
./files/2006/F2006037.pdf
./files/2006/F2006038.pdf
./files/2006/F2006039.pdf
./files/2006/F2006040.pdf
./files/2006/F2006041.pdf
./files/2006/F2006042.pdf
./files/2006/F2006043.pdf
./files/2006