In [2]:
import pandas as pd
import re
from typing import List, Dict, Any, Tuple, Callable
from tqdm.auto import tqdm
from pathlib import Path
import csv
from concurrent.futures import ThreadPoolExecutor
import importlib

from files import *

import json
import numbers
from actions import (
    remove_typos,
    CalculateWeightAndTrimAction,
    CheckinMsgProcessor,
    CreateLoadingInstructionAction,
    CreateLoadsheetAction,
    CreateZFWMessageAction,
    EstimateStorePaxDataAction,
    RampFinalAction,
    SendFuelOrderAction,
    SendLoadingInstructionAction,
    SendLoadsheetAction,
    SetActualBagWeightIndicatorAction,
    SetCKIPaxDistributionAction,
    StoreAircraftDataAction,
    StorePaxDataAction,
    StorePaxDataGuiAction,
    StoreRegistrationAndConfigurationAc,
    TdmCreateLoadingInstructionAction,
    TransferCargoAction,
    TransferCheckinDataAction,
    UpdateEstimatesAction,
    UpdateFuelDataAction,
    UpdateLoadTableAction,
    UpdateTransitLoadTableAction,
)

In [3]:
class Process:
    def __init__(self, func: Callable, **kwargs):
        self.func = func
        self.kwargs = kwargs

    def get_func(self) -> Callable:
        return self.func

    def get_kwargs(self):
        return self.kwargs

In [4]:
def multiprocess(processes: List[Process], workers: int):
    with ThreadPoolExecutor(workers) as executor:
        futures = []
        for process in processes:
            futures.append(executor.submit(process.get_func(), **process.get_kwargs())),

        for future in futures:
            future.result()  # This will re-raise any exceptions that occurred during task execution

# Fix broken CSV files

In [5]:
def fix_broken_csv(csv_input_file, csv_output_file):
    id_timestamp_pattern = re.compile(r"^\d+,\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}")

    with open(csv_input_file, "r", encoding="utf-8") as infile, open(
        csv_output_file, "w", encoding="utf-8", newline=""
    ) as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        buffer = []
        first_line = True

        # Read the header from the input file and write it to the output file
        header = next(reader)
        writer.writerow(header)

        for line in infile:
            line = line.rstrip("\n")  # Retain trailing newlines by using rstrip('\n')

            # Check if the line matches the pattern for a new entry
            if id_timestamp_pattern.match(line):
                # If buffer is not empty, process the previous buffered entry
                if not first_line:
                    combined_line = "\n".join(buffer)
                    # Add closing quote if the previous entry was not closed properly
                    if combined_line.count('"') % 2 != 0:
                        combined_line += '"'
                    writer.writerow(csv.reader([combined_line]).__next__())

                # Start a new buffer with the current line
                buffer = [line]
                first_line = False
            else:
                # Continue the buffer
                buffer.append(line)

        # Handle the last buffer if not empty
        if buffer:
            combined_line = "\n".join(buffer)
            # Add closing quote if the last entry was not closed properly
            if combined_line.count('"') % 2 != 0:
                combined_line += '"'
            writer.writerow(csv.reader([combined_line]).__next__())

In [6]:
multiprocess(
    processes=[
        Process(
            fix_broken_csv,
            csv_input_file=CSV_FILE_AB,
            csv_output_file=CSV_FILE_AB_FIXED,
        ),
        Process(
            fix_broken_csv,
            csv_input_file=CSV_FILE_MN,
            csv_output_file=CSV_FILE_MN_FIXED,
        ),
        Process(
            fix_broken_csv,
            csv_input_file=CSV_FILE_ZY,
            csv_output_file=CSV_FILE_ZY_FIXED,
        ),
    ],
    workers=3,
)

FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\abcd\\ABCD_tripfiles.csv'

# Read CSV files and convert them to Parquet files


In [None]:
def csv_to_parquet_cleaning(csv_file: Path, parquet_file: Path) -> pd.DataFrame:

    # read the CSV file
    df = pd.read_csv(csv_file)

    # remove leading and trailing linebreaks and whitespaces
    def custom_strip(text):
        if isinstance(text, str):
            return text.strip("\n\r").strip()
        return text

    df = df.map(custom_strip, na_action="ignore")

    # Drop duplicates ignoring the index
    df.drop_duplicates(subset=list(df.columns).remove("id"), inplace=True)

    # Set the unique identifier for every flight
    df["flightid"] = df.apply(
        lambda x: f"{x['airline_code']}_{x['flight_number']}_{x['flight_date']}_{x['departure_airport']}",
        axis=1,
    )

    # Convert creation_time to a datetime object
    df["creation_time"] = pd.to_datetime(df["creation_time"])

    # Write the dataframe to parquet
    df.to_parquet(parquet_file, engine="pyarrow", compression="brotli")
    return df

In [None]:
multiprocess(
    processes=[
        Process(
            csv_to_parquet_cleaning, csv_file=CSV_FILE_AB_FIXED, parquet_file=PARQUET_FILE_AB
        ),
        Process(
            csv_to_parquet_cleaning, csv_file=CSV_FILE_MN_FIXED, parquet_file=PARQUET_FILE_MN
        ),
        Process(
            csv_to_parquet_cleaning, csv_file=CSV_FILE_ZY_FIXED, parquet_file=PARQUET_FILE_ZY
        ),
    ],
    workers=3,
)

# Extract Action Data


In [7]:
import importlib

importlib.reload(remove_typos)
importlib.reload(CalculateWeightAndTrimAction)
importlib.reload(CheckinMsgProcessor)
importlib.reload(CreateLoadingInstructionAction)
importlib.reload(CreateLoadsheetAction)
importlib.reload(CreateZFWMessageAction)
importlib.reload(EstimateStorePaxDataAction)
importlib.reload(RampFinalAction)
importlib.reload(SendFuelOrderAction)
importlib.reload(SendLoadingInstructionAction)
importlib.reload(SendLoadsheetAction)
importlib.reload(SetActualBagWeightIndicatorAction)
importlib.reload(SetCKIPaxDistributionAction)
importlib.reload(StoreAircraftDataAction)
importlib.reload(StorePaxDataAction)
importlib.reload(StorePaxDataGuiAction)
importlib.reload(StoreRegistrationAndConfigurationAc)
importlib.reload(TdmCreateLoadingInstructionAction)
importlib.reload(TransferCargoAction)
importlib.reload(TransferCheckinDataAction)
importlib.reload(UpdateEstimatesAction)
importlib.reload(UpdateFuelDataAction)
importlib.reload(UpdateLoadTableAction)
importlib.reload(UpdateTransitLoadTableAction)

action_extractors = {
    "CalculateWeightAndTrimAction": CalculateWeightAndTrimAction.extract,
    "CheckinMsgProcessor": CheckinMsgProcessor.extract,
    "CreateLoadingInstructionAction": CreateLoadingInstructionAction.extract,
    "CreateLoadsheetAction": CreateLoadsheetAction.extract,
    "CreateZFWMessageAction": CreateZFWMessageAction.extract,
    "EstimateStorePaxDataAction": EstimateStorePaxDataAction.extract,
    "RampFinalAction": RampFinalAction.extract,
    "SendFuelOrderAction": SendFuelOrderAction.extract,
    "SendLoadingInstructionAction": SendLoadingInstructionAction.extract,
    "SendLoadsheetAction": SendLoadsheetAction.extract,
    "SetActualBagWeightIndicatorAction": SetActualBagWeightIndicatorAction.extract,
    "SetCKIPaxDistributionAction": SetCKIPaxDistributionAction.extract,
    "StoreAircraftDataAction": StoreAircraftDataAction.extract,
    "StorePaxDataAction": StorePaxDataAction.extract,
    "StorePaxDataGuiAction": StorePaxDataGuiAction.extract,
    "StoreRegistrationAndConfigurationAc": StoreRegistrationAndConfigurationAc.extract,
    "TdmCreateLoadingInstructionAction": TdmCreateLoadingInstructionAction.extract,
    "TransferCargoAction": TransferCargoAction.extract,
    "TransferCheckinDataAction": TransferCheckinDataAction.extract,
    "UpdateEstimatesAction": UpdateEstimatesAction.extract,
    "UpdateFuelDataAction": UpdateFuelDataAction.extract,
    "UpdateLoadTableAction": UpdateLoadTableAction.extract,
    "UpdateTransitLoadTableAction": UpdateTransitLoadTableAction.extract,
}

In [8]:
def extract_action_data(
    source_file: Path,
    target_file: Path,
    label: str | None = None,
    df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """Extract specific data based on predefined action extractors, optionally displaying a progress bar and labels.

    This function iterates over a dictionary of action names and their associated extractor functions, applying each
    extractor to the relevant entries in the DataFrame. The results are stored in new columns in the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame from which data will be extracted.
            It must contain columns that match the keys in the action_extractors dictionary.
        progress_bar (bool, optional): If True, displays a progress bar during the data extraction process.
            Useful for visual feedback during long operations. Defaults to False.
        label (str | None, optional): An optional label that prefixes the print statements for better traceability during debugging.
            If None, only the action name is printed. Defaults to None.

    Returns:
        pd.DataFrame: The original DataFrame with additional columns containing the extracted data.
    """
    if df is None:
        df = pd.read_parquet(source_file)

    for action_name, extractor in action_extractors.items():
        if extractor is not None:
            if len(df[df.action_name == action_name]) == 0:
                print(label, action_name, "not found in DataFrame")
                continue

            if label:
                print(label, action_name)
            else:
                print(action_name)

            df[f"data_{action_name}"] = df[df.action_name == action_name][
                "entry_details"
            ].apply(extractor)

    df.to_parquet(target_file, engine="pyarrow", compression="brotli")
    return df

In [9]:
multiprocess(
    processes=[
        Process(
            extract_action_data,
            source_file=PARQUET_FILE_AB,
            target_file=PARQUET_FILE_AB_CONV,
            label="ABCD",
        ),
        Process(
            extract_action_data,
            source_file=PARQUET_FILE_MN,
            target_file=PARQUET_FILE_MN_CONV,
            label="MNOP",
        ),
        Process(
            extract_action_data,
            source_file=PARQUET_FILE_ZY,
            target_file=PARQUET_FILE_ZY_CONV,
            label="ZYXW",
        ),
    ],
    workers=3,
)

ZYXW CalculateWeightAndTrimAction
ABCD CalculateWeightAndTrimAction
MNOP CalculateWeightAndTrimAction
ZYXW CheckinMsgProcessor
ZYXW CreateLoadingInstructionAction
ZYXW CreateLoadsheetAction
ZYXW CreateZFWMessageAction
ZYXW EstimateStorePaxDataAction
ZYXW RampFinalAction
ZYXW SendFuelOrderAction
ZYXW SendLoadingInstructionAction
ZYXW SendLoadsheetAction
ZYXW SetActualBagWeightIndicatorAction
ZYXW SetCKIPaxDistributionAction
ZYXW StoreAircraftDataAction
ABCDZYXW StorePaxDataAction
 CheckinMsgProcessor
ZYXW StorePaxDataGuiAction
ZYXW StoreRegistrationAndConfigurationAc
ZYXW TdmCreateLoadingInstructionAction
ZYXW TransferCargoAction
ZYXW TransferCheckinDataAction not found in DataFrame
ZYXW UpdateEstimatesAction not found in DataFrame
ZYXW UpdateFuelDataAction
ABCD CreateLoadingInstructionAction
ZYXW UpdateLoadTableAction
ABCD CreateLoadsheetAction
ZYXW UpdateTransitLoadTableAction
ABCD CreateZFWMessageAction
ABCD EstimateStorePaxDataAction
ABCD RampFinalAction
ABCD SendFuelOrderAction not

# Extract Weight Data

In [10]:
class Weight:
    def __init__(self, key, desc=None):
        self.key = key
        self.description = desc

    def get_key(self):
        return self.key

    def get_description(self):
        return self.description

In [11]:
class WeightCluster:
    def __init__(self, name: str, weights: Dict[Weight, str | Tuple[str, ...]]):
        self._name = name
        self._weights = weights

    def get_weights(self):
        return self._weights

In [12]:
class Action:
    def __init__(self, name: str, weights: Dict[Weight, str | Tuple[str, ...]]):
        self._name = name
        self._weights = weights

    def get_name(self):
        return self._name

    def get_weights(self):
        return self._weights

    def get_weight_items(
        self,
    ):
        return self._weights.items()

In [13]:
EZFW = Weight(key="EZFW", desc="Estimated Zero Fuel Weight")
AZFW = Weight(key="AZFW", desc="Actual Zero Fuel Weight")

ETOW = Weight(key="ETOW", desc="Estimated Takeoff Weight")
ATOW = Weight(key="ATOW", desc="Actual Takeoff Weight")

ETTL = Weight(key="ETTL", desc="Estimated Traffic Load")
ATTL = Weight(key="ATTL", desc="Actual Total Traffic Load")

DOW = Weight(key="DOW", desc="Dry Operating Weight")
MEW = Weight(key="MEW", desc="Manufacturers Empty Weight")

TAOF = Weight(key="TAOF", desc="Take Off Fuel")
TRIF = Weight(key="TRIF", desc="Trip Fuel")
TAXF = Weight(key="TAXF", desc="Taxi Fuel")

ALAW = Weight(key="ALAW", desc="Actual Landing Weight")

PAXW = Weight(key="PAXW", desc="Passenger Weight")
BAGW = Weight(key="BAGW", desc="Baggage Weight")
CARW = Weight(key="CARW", desc="Cargo Weight")
MAIW = Weight(key="MAIW", desc="Mail Weight")
EICW = Weight(key="EICW", desc="Equipment In Compartment Weight")

In [14]:
LOADSHEETACTION = WeightCluster(
    name="LOADSHEETACTION",
    weights={
        ATTL: "TOTAL TRAFFIC LOAD",
        DOW: "DRY OPERATING WEIGHT",
        AZFW: "ZERO FUEL WEIGHT ACTUAL",
        TAOF: "TAKE OFF FUEL",
        ATOW: "TAKE OFF WEIGHT ACTUAL",
        TRIF: "TRIP",
        ALAW: "LANDING WEIGHT ACTUAL",
    },
)

CALCULATEWEIGHTANDTRIMACTION = WeightCluster(
    name="CALCULATEWEIGHTANDTRIMACTION",
    weights={
        MEW: "START_WI weight",
        DOW: "DO_WI weight",
        PAXW: "PAX_WI weight",
        AZFW: "AZFW",
        EZFW: "ESTIMATED_ZFW",
        ATOW: "ATOW",
        ALAW: "ALAW",
        ETTL: "ESTIMATED_TRAFFIC_LOAD",
        ATTL: "TOTAL_TRAFFIC_LOAD",
    },
)

CreateZFWMessageAction = WeightCluster(
    name="CreateZFWMessageAction",
    weights={
        DOW: "dryOperatingWeight",
        AZFW: "actualZFW",
        CARW: "cargoWeight",
        BAGW: "baggageWeight",
        PAXW: "paxWeight",
        DOW: "basicWeight",
    },
)

StoreRegistrationAndConfigurationAc = WeightCluster(
    name="StoreRegistrationAndConfigurationAc",
    weights={MEW: "start_weight", DOW: "basic_empty_weight"},
)

TOTALS = WeightCluster(
    name="TOTALS",
    weights={
        BAGW: "Total baggage",
        CARW: "Total cargo",
        MAIW: "Total mail",
        EICW: "Total EIC",
    },
)

UpdateEstimatesAction = WeightCluster(
    name="UpdateEstimatesAction",
    weights={
        PAXW: "Pax Weight",
        BAGW: "Bag Weight",
        CARW: "Cargo",
        MAIW: "Mail",
        ETTL: "Traffic Load",
        DOW: "DOW",
        EZFW: "EZFW",
    },
)
UpdateFuelDataAction = WeightCluster(
    name="UpdateFuelDataAction",
    weights={
        TAOF: "take_off_fuel",
        TRIF: "trip_fuel",
        TAXF: "taxi_fuel",
    },
)

In [15]:
ACTION_WEIGHTS = [
    # Action(name="RampFinalAction", weights={EZFW: "EZFW"}), # EZFW is not a value just a status
    Action(
        name="CalculateWeightAndTrimAction",
        weights=CALCULATEWEIGHTANDTRIMACTION.get_weights(),
    ),
    Action(name="CreateZFWMessageAction", weights=CreateZFWMessageAction.get_weights()),
    Action(name="CreateLoadsheetAction", weights=LOADSHEETACTION.get_weights()),
    Action(name="SendLoadsheetAction", weights=LOADSHEETACTION.get_weights()),
    Action(
        name="StoreRegistrationAndConfigurationAc",
        weights=StoreRegistrationAndConfigurationAc.get_weights(),
    ),
    Action(name="TransferCargoAction", weights=TOTALS.get_weights()),
    Action(name="UpdateFuelDataAction", weights=UpdateFuelDataAction.get_weights()),
    Action(name="UpdateLoadTableAction", weights=TOTALS.get_weights()),
    Action(name="UpdateTransitLoadTableAction", weights=TOTALS.get_weights()),
]

In [16]:
# Recursive function to find the value for a given key
def find_value(data: dict | list, key: str):
    if isinstance(data, dict):
        for k, v in data.items():
            if k == key:
                if v is None:
                    return None

                if isinstance(v, numbers.Number):
                    return v
                if isinstance(v, str):
                    if v.lower() == "null":
                        return None

                    try:
                        return eval(v)
                    except:
                        raise ValueError(
                            "Value not a number string or a number", key, v
                        )

                return None
            else:
                found = find_value(v, key)
                if found is not None:
                    return found
    elif isinstance(data, list):
        for item in data:
            found = find_value(item, key)
            if found is not None:
                return found
    return None


# Function to apply the recursive search to JSON data
def extract_key(json_str: str, key: str):
    data = json.loads(json_str)
    return find_value(data, key)

In [17]:
def extract_weights(
    source_file: Path,
    target_file: Path,
    label: str | None = None,
    df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """Extract specific data based on predefined action extractors, optionally displaying a progress bar and labels.

    This function iterates over a dictionary of action names and their associated extractor functions, applying each
    extractor to the relevant entries in the DataFrame. The results are stored in new columns in the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame from which data will be extracted.
            It must contain columns that match the keys in the action_extractors dictionary.
        progress_bar (bool, optional): If True, displays a progress bar during the data extraction process.
            Useful for visual feedback during long operations. Defaults to False.
        label (str | None, optional): An optional label that prefixes the print statements for better traceability during debugging.
            If None, only the action name is printed. Defaults to None.

    Returns:
        pd.DataFrame: The original DataFrame with additional columns containing the extracted data.
    """
    if df is None:
        df = pd.read_parquet(source_file)
    df = df.replace({None: pd.NA})
    for action in ACTION_WEIGHTS:
        print(label, action.get_name())
        if not f"data_{action.get_name()}" in list(df.columns):
            continue
        for weight, key in action.get_weight_items():

            mask = (df.action_name == action.get_name()) & (
                ~df[f"data_{action.get_name()}"].isna()
            )

            # Use loc to update the DataFrame directly
            df.loc[mask, weight.get_key()] = df.loc[
                mask, f"data_{action.get_name()}"
            ].apply(lambda x: extract_key(x, key))

    df.to_parquet(target_file, engine="pyarrow", compression="brotli")
    return df

In [18]:
multiprocess(
    processes=[
        Process(
            extract_weights,
            source_file=PARQUET_FILE_AB_CONV,
            target_file=PARQUET_FILE_AB_WEIGHTS,
            label="ABCD",
        ),
        Process(
            extract_weights,
            source_file=PARQUET_FILE_MN_CONV,
            target_file=PARQUET_FILE_MN_WEIGHTS,
            label="MNOP",
        ),
        Process(
            extract_weights,
            source_file=PARQUET_FILE_ZY_CONV,
            target_file=PARQUET_FILE_ZY_WEIGHTS,
            label="ZYXW",
        ),
    ],
    workers=3,
)

ZYXW CalculateWeightAndTrimAction
ABCD CalculateWeightAndTrimAction
MNOP CalculateWeightAndTrimAction
ZYXW CreateZFWMessageAction
ZYXW CreateLoadsheetAction
ZYXW SendLoadsheetAction
ZYXW StoreRegistrationAndConfigurationAc
ZYXW TransferCargoAction
ZYXW UpdateFuelDataAction
ZYXW UpdateLoadTableAction
ZYXW UpdateTransitLoadTableAction
ABCD CreateZFWMessageAction
ABCD CreateLoadsheetAction
ABCD SendLoadsheetAction
ABCD StoreRegistrationAndConfigurationAc
ABCD TransferCargoAction
ABCD UpdateFuelDataAction
ABCD UpdateLoadTableAction
ABCD UpdateTransitLoadTableAction
MNOP CreateZFWMessageAction
MNOP CreateLoadsheetAction
MNOP SendLoadsheetAction
MNOP StoreRegistrationAndConfigurationAc
MNOP TransferCargoAction
MNOP UpdateFuelDataAction
MNOP UpdateLoadTableAction
MNOP UpdateTransitLoadTableAction


# Extract flighttable

In [19]:
# Hilfsfunktion zum Extrahieren von Daten aus JSON-Spalten
def extract_json_data(row, column, keys):
    try:
        if pd.notna(row[column]):
            data = json.loads(row[column])
            for key in keys:
                data = data.get(key, None)
            return data
        return None
    except json.JSONDecodeError:
        return None

In [20]:
new_columns = {
    "estimated_Y": ("data_EstimateStorePaxDataAction", ["estimated_Y"]),
    "estimated_Jump": ("data_EstimateStorePaxDataAction", ["estimated_Jump"]),
    "estimated_Standby": ("data_EstimateStorePaxDataAction", ["estimated_Standby"]),
    "estimated_Male": ("data_EstimateStorePaxDataAction", ["estimated_Male"]),
    "estimated_Female": ("data_EstimateStorePaxDataAction", ["estimated_Female"]),
    "estimated_Child": ("data_EstimateStorePaxDataAction", ["estimated_Child"]),
    "estimated_Infant": ("data_EstimateStorePaxDataAction", ["estimated_Infant"]),
    "estimated_Bags": ("data_EstimateStorePaxDataAction", ["estimated_Bags"]),
    "aircraft_regTailNbr": ("data_CheckinMsgProcessor", ["aircraft_regTailNbr"]),
    "aircraft_Type": ("data_CheckinMsgProcessor", ["aircraft_Type"]),
    "aircraft_configuration": ("data_CheckinMsgProcessor", ["aircraft_configuration"]),
    "airline": ("data_CreateZFWMessageAction", ["airline"]),
    "arrivalStation": ("data_CreateZFWMessageAction", ["arrivalStation"]),
    # "departureStation": ("data_CreateZFWMessageAction", ["departureStation"]),
    "flightDateLocal": ("data_CreateZFWMessageAction", ["flightDateLocal"]),
    "revisionNumber": ("data_CreateZFWMessageAction", ["revisionNumber"]),
    "PAX": ("data_StorePaxDataAction", ["PAX"]),
    "Y": ("data_StorePaxDataAction", ["Y"]),
    "Jump": ("data_StorePaxDataAction", ["Jump"]),
    "Standby": ("data_StorePaxDataAction", ["Standby"]),
    "Male": ("data_StorePaxDataAction", ["Male"]),
    "Female": ("data_StorePaxDataAction", ["Female"]),
    "Infant": ("data_StorePaxDataAction", ["Infant"]),
    "Bags": ("data_StorePaxDataAction", ["Bags"]),
    # 'Flight_Number': ('data_CreateLoadingInstructionAction', ['Flight_Number']),
    # 'Flight_Date': ('data_CreateLoadingInstructionAction', ['Flight_Date']),
    # "Flight_Route_From": (
    #     "data_CreateLoadingInstructionAction",
    #     ["Flight_Route", "From"],
    # ),
    # "Flight_Route_To": ("data_CreateLoadingInstructionAction", ["Flight_Route", "To"]),
}

In [21]:
def extract_flight(
    source_file: Path,
    target_file: Path,
    label: str | None = None,
    df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    print(label, "Extracting flight data")
    if df is None:
        df = pd.read_parquet(source_file)
        # Neue Spalten erstellen und Daten extrahieren
    for new_col, (json_col, json_keys) in new_columns.items():
        df[new_col] = df.apply(
            lambda row: extract_json_data(row, json_col, json_keys), axis=1
        )

    existing_columns = set(df.columns)

    # Erstellen der Aggregations-Dictionary unter Berücksichtigung der vorhandenen Spalten
    agg_dict = {col: "last" for col in new_columns.keys() if col in existing_columns}
    additional_columns = [
        "airline_code",
        "flight_number",
        "flight_suffix",
        "flight_date",
        "departure_airport",
    ]
    agg_dict.update(
        {col: "last" for col in additional_columns if col in existing_columns}
    )

    # flight_suffix-Spalte direkt aus der Parquet-Datei übernehmen
    df_agg = df.groupby("flightid").agg(agg_dict).reset_index()
    print(label, "Writing to parquet")
    df_agg.to_parquet(target_file, engine="pyarrow", compression="brotli")
    return df_agg

In [22]:
multiprocess(
    processes=[
        Process(
            extract_flight,
            source_file=PARQUET_FILE_AB_CONV,
            target_file=PARQUET_FILE_AB_FLIGHTTABLE,
            label="ABCD",
        ),
        Process(
            extract_flight,
            source_file=PARQUET_FILE_MN_CONV,
            target_file=PARQUET_FILE_MN_FLIGHTTABLE,
            label="MNOP",
        ),
        Process(
            extract_flight,
            source_file=PARQUET_FILE_ZY_CONV,
            target_file=PARQUET_FILE_ZY_FLIGHTTABLE,
            label="ZYXW",
        ),
    ],
    workers=3,
)

ABCD Extracting flight data
MNOP Extracting flight data
ZYXW Extracting flight data
ZYXW Writing to parquet
ABCD Writing to parquet
MNOP Writing to parquet


# Add Location Data

In [24]:
import airportsdata

AIRPORTS = airportsdata.load("IATA")  # key is the IATA location code

In [25]:
AIRPORTS_NOT_IN_AIRPORTSDATA = {
    "SSV": {
        "lat": 5.55215,
        "lon": 120.819,
    },
    "JJG": {
        "lat": -28.6744444444,
        "lon": -49.0588888889,
    },
    "EEA": {
        "lat": -27.634167,
        "lon": -50.358333,
    },
    "SMT": {
        "lat": -12.472778,
        "lon": -55.668889,
    },
    "ARX": {
        "lat": -4.568611,
        "lon": -37.804722,
    },
    "LHN": {
        "lat": -19.355278,
        "lon": -40.071389,
    },
}

In [26]:
def airportcode_to_coordinates(airportcode: str) -> Tuple[str, str]:
    if airportcode in AIRPORTS:
        airport = AIRPORTS[airportcode]
        return airport["lat"], airport["lon"]
    elif airportcode in AIRPORTS_NOT_IN_AIRPORTSDATA:
        airport = AIRPORTS_NOT_IN_AIRPORTSDATA[airportcode]
        return airport["lat"], airport["lon"]
    else:
        print(f"Airport code {airportcode} not found in the database")
        return None, None

In [27]:
def extract_airport_loc(
    source_file: Path,
    target_file: Path,
    label: str | None = None,
    df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    print(label, "Extracting airport location data")
    if df is None:
        df = pd.read_parquet(source_file)

    for airport_col in ["departure_airport", "arrivalStation"]:
        # TODO add arrival_airport
        df[airport_col + "_lat"], df[airport_col + "_lon"] = zip(
            *df[airport_col].apply(airportcode_to_coordinates)
        )

    df.to_parquet(target_file, engine="pyarrow", compression="brotli")
    return df

In [28]:
multiprocess(
    processes=[
        Process(
            extract_airport_loc,
            source_file=PARQUET_FILE_AB_FLIGHTTABLE,
            target_file=PARQUET_FILE_AB_FLIGHTTABLE,
            label="ABCD",
        ),
        Process(
            extract_airport_loc,
            source_file=PARQUET_FILE_MN_FLIGHTTABLE,
            target_file=PARQUET_FILE_MN_FLIGHTTABLE,
            label="MNOP",
        ),
        Process(
            extract_airport_loc,
            source_file=PARQUET_FILE_ZY_FLIGHTTABLE,
            target_file=PARQUET_FILE_ZY_FLIGHTTABLE,
            label="ZYXW",
        ),
    ],
    workers=3,
)

ABCD Extracting airport location data
MNOP Extracting airport location data
ZYXW Extracting airport location data
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airport code None not found in the database
Airpor