In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import unicodedata
from functools import reduce

import pandas as pd
from camelot.core import Table
from camelot.io import read_pdf


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [None]:
# NOTE(review): scratch cell — it issues a GET with an empty URL and discards
# the result, and nothing visible elsewhere in the notebook uses `r`.
# Presumably leftover debugging; confirm and remove.
import requests as r

r.get("")

In [3]:
def replace_newlines(text: str | None, replace_with: str = "") -> str:
    return re.sub(r"\n", replace_with, text) if text else ""


def normalize_header_names(text: str | None) -> str:
    if text is None:
        return ""

    text = text.strip().lower()
    parts = text.split("/")
    text = parts[-1].strip()
    text = re.sub(r"\s+", "_", text)

    normalized = unicodedata.normalize("NFKD", text)
    text = "".join(c for c in normalized if not unicodedata.combining(c))

    return text


def parse_header_table(table: Table) -> pd.Index:
    """Build the column index from the first row of a header table.

    Every cell of ``table.df`` has its newlines removed and is then passed
    through ``normalize_header_names``; the first row of the result is
    returned as a ``pd.Index``.
    """
    cleaned = table.df.map(
        lambda cell: normalize_header_names(replace_newlines(cell))
    )
    return pd.Index(cleaned.iloc[0])


def extract_reg_number(
    df_subset: pd.DataFrame, pattern: str = r".*OM\ *-\ *(.*)"
) -> pd.Series:
    """Extract a registration number from the first column in which it appears.

    Each column is cast to ``str``, upper-cased, has runs of whitespace
    collapsed to a single space, and is matched against ``pattern``. The
    pattern's capture group is the extracted value. Columns are combined left
    to right, so for every row the first column with a match wins.

    Args:
        df_subset: Frame whose columns are searched, in column order.
        pattern: Regular expression with one capture group.

    Returns:
        A Series aligned with ``df_subset.index``; rows with no match in any
        column are missing (NaN/NA).
    """
    # Without columns there is nothing to reduce; return an all-missing result
    # instead of letting ``reduce`` raise on an empty sequence.
    if df_subset.shape[1] == 0:
        return pd.Series(pd.NA, index=df_subset.index, dtype="object")

    extracted = [
        df_subset[col]
        .astype(str)
        .str.upper()
        # regex=True is required: since pandas 2.0 ``str.replace`` treats the
        # pattern as a literal string by default, which made this step a no-op.
        .str.replace(r"\s+", " ", regex=True)
        .str.extract(pattern, expand=False)
        for col in df_subset.columns
    ]

    # combine_first keeps the left value and fills its gaps from the right.
    return reduce(lambda a, b: a.combine_first(b), extracted)


def process_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean every cell and add the ``reg_number`` and ``country`` columns.

    Newlines are removed from each cell and surrounding whitespace is
    stripped. ``reg_number`` is extracted from ``registration_marks`` and
    ``aircraft_type``; ``country`` is set to ``"SK"`` for every row. The
    input frame is not modified; a new frame is returned.
    """
    without_newlines = df.map(replace_newlines)
    cleaned = without_newlines.map(str.strip)

    reg_source = cleaned[["registration_marks", "aircraft_type"]]
    cleaned["reg_number"] = extract_reg_number(reg_source)
    cleaned["country"] = "SK"

    return cleaned


def pdf_to_df(pdf_path: str, pages: str = "all", line_scale: int = 30) -> pd.DataFrame:
    """Read the lattice tables of a PDF and return one processed DataFrame.

    The first extracted table supplies the column names (via
    ``parse_header_table``). Every second table starting at index 1 is treated
    as page content, relabelled with those names and stacked. The stacked
    frame is then passed through ``process_df``.

    Args:
        pdf_path: Path handed to ``read_pdf``.
        pages: Page selection handed to ``read_pdf``.
        line_scale: ``line_scale`` option handed to ``read_pdf``.

    Returns:
        The processed frame; it has no rows if no content tables were found.
    """
    tables = read_pdf(pdf_path, pages=pages, flavor="lattice", line_scale=line_scale)

    # NOTE(review): the tables skipped by the [1::2] stride are presumably
    # repeated per-page headers — verify against the PDF layout.
    header_table, content_tables = tables[0], tables[1::2]
    header = parse_header_table(header_table)

    # Relabel copies (set_axis does not touch the table's own frame) and
    # concatenate once instead of growing a frame inside the loop.
    page_frames = [table.df.set_axis(header, axis=1) for table in content_tables]
    if page_frames:
        df = pd.concat(page_frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=header)

    return process_df(df)

In [4]:
# Build the SK registrations frame from the PDF, using pdf_to_df's default
# page selection and line_scale.
# NOTE(review): the output recorded under this cell is a FileNotFoundError for
# this path, while a later cell passes "registrations/registrations_sk.pdf" —
# confirm which location is intended.
df = pdf_to_df(
    "registrations_sk.pdf",
    # pages="5,6,7,8,9"
)

FileNotFoundError: [Errno 2] No such file or directory: 'registrations_sk.pdf'

In [96]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,aircraft_type,registration_marks,serial_number,owner,operator,zalozne_pravo,reg_number,country
0,"101 A ""Pégase""",OM - 0101,101046,Peter Hupka,Peter Hupka,,101,SK
1,"101 A ""Pégase""",OM - 0329,101A0329,Juraj Knoško,Juraj Knoško,,329,SK
2,"101 A ""Pégase""",OM - 0339,101A0339,RNDr. Peter Manka,RNDr. Peter Manka,,339,SK
3,"101 A ""Pégase""",OM - 2906,101A0215,"Aeroklub Martin, o.z.","Aeroklub Martin, o.z.",,2906,SK
4,"101 A ""Pégase""",OM - 6811,101A0109,Aeroklub Trnava,Aeroklub Trnava,,6811,SK


In [8]:
# Load the CZ registrations CSV. This rebinds `df`, so the frame built from
# the PDF in the earlier cell is no longer reachable under that name.
df = pd.read_csv("registrations/registrations_cz.csv")

In [None]:
# Show only the rows with no deletion_date value (presumably aircraft still
# on the register — verify against the data source).
df[df["deletion_date"].isna()]

Unnamed: 0,id,category,type,registration_number,deletion_date,registration,country
0,858384,AVREG_DATA.CATEGORIES.GLIDER,"MDM-1 ""Fox""",1213,,OK-1213,CZ
1,858389,AVREG_DATA.CATEGORIES.GLIDER,"MDM-1 ""Fox""",7801,,OK-7801,CZ
2,858395,AVREG_DATA.CATEGORIES.GLIDER,A 15,7906,,OK-7906,CZ
3,858401,AVREG_DATA.CATEGORIES.POWERED_GLIDER,SZD-45A,6902,,OK-6902,CZ
4,858407,AVREG_DATA.CATEGORIES.POWERED_GLIDER,SZD-45A,8903,,OK-8903,CZ
...,...,...,...,...,...,...,...
5301,1876280,AVREG_DATA.CATEGORIES.HOT_AIR_BALLOON,BB,2580,,OK-2580,CZ
5302,1876417,AVREG_DATA.CATEGORIES.AIRPLANE,SR22,VTP,,OK-VTP,CZ
5303,1877935,AVREG_DATA.CATEGORIES.AIRPLANE,Cessna 150,ELM,,OK-ELM,CZ
5304,1882587,AVREG_DATA.CATEGORIES.HOT_AIR_BALLOON,CAMERON HOT AIR BALLOONS,1265,,OK-1265,CZ


In [5]:
from lib import Registrations

In [14]:
# Start from the base registrations CSV, then chain the CZ and SK append
# steps; each call's return value is rebound to `regs`.
# What these methods do is defined in lib.Registrations and is not visible here.
regs = Registrations("registrations/registrations.csv")
regs = regs.append_registrations_cz()
regs = regs.append_registrations_sk("registrations/registrations_sk.pdf")

In [16]:
# Export via Registrations.to_csv(). No path is passed, so the destination is
# decided inside lib.Registrations — TODO confirm where it writes.
regs.to_csv()