In [1]:
import ast
import json
import numbers
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Directory holding every parquet file used by this notebook; change it here
# instead of editing each path individually.
DATA_DIR = Path("../data")

# Input: converted trip files, one per dataset.
parquet_file_abcd_conv = DATA_DIR / "ABCD_tripfiles_conv.parquet"
parquet_file_mnop_conv = DATA_DIR / "MNOP_tripfiles_conv.parquet"
parquet_file_zyxw_conv = DATA_DIR / "ZYXW_tripfiles_conv.parquet"

# Input: "_test" variants of the converted trip files.
parquet_file_abcd_conv_test = DATA_DIR / "ABCD_tripfiles_conv_test.parquet"
parquet_file_mnop_conv_test = DATA_DIR / "MNOP_tripfiles_conv_test.parquet"
parquet_file_zyxw_conv_test = DATA_DIR / "ZYXW_tripfiles_conv_test.parquet"

# Output: trip files with the extracted weight columns added.
parquet_file_abcd_weights = DATA_DIR / "ABCD_tripfiles_weights.parquet"
parquet_file_mnop_weights = DATA_DIR / "MNOP_tripfiles_weights.parquet"
parquet_file_zyxw_weights = DATA_DIR / "ZYXW_tripfiles_weights.parquet"

In [3]:
# ab_conv_test = pd.read_parquet(parquet_file_abcd_conv_test)
# mn_conv_test = pd.read_parquet(parquet_file_mnop_conv_test)
# zy_conv_test = pd.read_parquet(parquet_file_zyxw_conv_test)

In [4]:
# Read the three converted trip files into DataFrames, in ABCD / MNOP / ZYXW order.
ab_conv, mn_conv, zy_conv = (
    pd.read_parquet(conv_path)
    for conv_path in (
        parquet_file_abcd_conv,
        parquet_file_mnop_conv,
        parquet_file_zyxw_conv,
    )
)

In [5]:
class Weight:
    def __init__(self, key, desc=None):
        self.key = key
        self.description = desc

    def get_key(self):
        return self.key

    def get_description(self):
        return self.description

In [6]:
class WeightCluster:
    """A named mapping from ``Weight`` objects to the field name(s) holding them."""

    def __init__(self, name: str, weights: Dict[Weight, str | Tuple[str, ...]]):
        # The mapping is stored by reference, not copied.
        self._name, self._weights = name, weights

    def get_weights(self):
        """Return the mapping passed at construction (the same object)."""
        return self._weights

In [7]:
class Action:
    """An action name paired with the ``Weight`` -> field-name mapping to read for it."""

    def __init__(self, name: str, weights: Dict[Weight, str | Tuple[str, ...]]):
        # The mapping is stored by reference, not copied.
        self._name, self._weights = name, weights

    def get_name(self):
        """Return the action name given at construction."""
        return self._name

    def get_weights(self):
        """Return the mapping passed at construction (the same object)."""
        return self._weights

    def get_weight_items(self):
        """Return the ``(Weight, field name)`` items view of the mapping."""
        mapping = self._weights
        return mapping.items()

In [8]:
# Catalogue of Weight objects. Each key is the column name that extract_df
# writes the extracted value to.

# Zero fuel weight (estimated / actual).
EZFW = Weight(key="EZFW", desc="Estimated Zero Fuel Weight")
AZFW = Weight(key="AZFW", desc="Actual Zero Fuel Weight")

# Takeoff weight (estimated / actual).
ETOW = Weight(key="ETOW", desc="Estimated Takeoff Weight")
ATOW = Weight(key="ATOW", desc="Actual Takeoff Weight")

# Traffic load (estimated / actual total).
ETTL = Weight(key="ETTL", desc="Estimated Traffic Load")
ATTL = Weight(key="ATTL", desc="Actual Total Traffic Load")

# Aircraft base weights.
DOW = Weight(key="DOW", desc="Dry Operating Weight")
MEW = Weight(key="MEW", desc="Manufacturers Empty Weight")

# Fuel figures.
TAOF = Weight(key="TAOF", desc="Take Off Fuel")
TRIF = Weight(key="TRIF", desc="Trip Fuel")
TAXF = Weight(key="TAXF", desc="Taxi Fuel")

# Landing weight.
ALAW = Weight(key="ALAW", desc="Actual Landing Weight")

# Load components.
PAXW = Weight(key="PAXW", desc="Passenger Weight")
BAGW = Weight(key="BAGW", desc="Baggage Weight")
CARW = Weight(key="CARW", desc="Cargo Weight")
MAIW = Weight(key="MAIW", desc="Mail Weight")
EICW = Weight(key="EICW", desc="Equipment In Compartment Weight")

In [9]:
# Each cluster maps a Weight to the field name that is looked up (via
# extract_key) in the JSON payload of the action(s) the cluster is attached to.

# Field names used for the loadsheet actions.
LOADSHEETACTION = WeightCluster(
    name="LOADSHEETACTION",
    weights={
        ATTL: "TOTAL TRAFFIC LOAD",
        DOW: "DRY OPERATING WEIGHT",
        AZFW: "ZERO FUEL WEIGHT ACTUAL",
        TAOF: "TAKE OFF FUEL",
        ATOW: "TAKE OFF WEIGHT ACTUAL",
        TRIF: "TRIP",
        ALAW: "LANDING WEIGHT ACTUAL",
    },
)

# Field names used for CalculateWeightAndTrimAction.
CALCULATEWEIGHTANDTRIMACTION = WeightCluster(
    name="CALCULATEWEIGHTANDTRIMACTION",
    weights={
        MEW: "START_WI weight",
        DOW: "DO_WI weight",
        PAXW: "PAX_WI weight",
        AZFW: "AZFW",
        EZFW: "ESTIMATED_ZFW",
        ATOW: "ATOW",
        ALAW: "ALAW",
        ETTL: "ESTIMATED_TRAFFIC_LOAD",
        ATTL: "TOTAL_TRAFFIC_LOAD",
    },
)

# Field names used for CreateZFWMessageAction.
CreateZFWMessageAction = WeightCluster(
    name="CreateZFWMessageAction",
    weights={
        # NOTE(review): this literal previously listed DOW twice
        # ("dryOperatingWeight" first, then "basicWeight"). A dict keeps only
        # the last value for a repeated key, so "dryOperatingWeight" was never
        # extracted. The shadowed entry is removed to make the effective
        # mapping explicit; confirm which field is the intended DOW source.
        AZFW: "actualZFW",
        CARW: "cargoWeight",
        BAGW: "baggageWeight",
        PAXW: "paxWeight",
        DOW: "basicWeight",
    },
)

# Field names used for StoreRegistrationAndConfigurationAc.
StoreRegistrationAndConfigurationAc = WeightCluster(
    name="StoreRegistrationAndConfigurationAc",
    weights={MEW: "start_weight", DOW: "basic_empty_weight"},
)

# Field names for the per-category totals, shared by several load-table actions.
TOTALS = WeightCluster(
    name="TOTALS",
    weights={
        BAGW: "Total baggage",
        CARW: "Total cargo",
        MAIW: "Total mail",
        EICW: "Total EIC",
    },
)

# Field names for UpdateEstimatesAction.
UpdateEstimatesAction = WeightCluster(
    name="UpdateEstimatesAction",
    weights={
        PAXW: "Pax Weight",
        BAGW: "Bag Weight",
        CARW: "Cargo",
        MAIW: "Mail",
        ETTL: "Traffic Load",
        DOW: "DOW",
        EZFW: "EZFW",
    },
)

# Field names used for UpdateFuelDataAction.
UpdateFuelDataAction = WeightCluster(
    name="UpdateFuelDataAction",
    weights={
        TAOF: "take_off_fuel",
        TRIF: "trip_fuel",
        TAXF: "taxi_fuel",
    },
)

In [10]:
# Actions processed by extract_df, in order. Each entry pairs an action name
# with the cluster whose mapping tells which payload fields to read.
# Several actions deliberately share the same cluster (and thus the same dict).
actions = [
    Action(name=action_name, weights=cluster.get_weights())
    for action_name, cluster in (
        # ("RampFinalAction", {EZFW: "EZFW"}),  # EZFW is not a value just a status
        ("CalculateWeightAndTrimAction", CALCULATEWEIGHTANDTRIMACTION),
        ("CreateZFWMessageAction", CreateZFWMessageAction),
        ("CreateLoadsheetAction", LOADSHEETACTION),
        ("SendLoadsheetAction", LOADSHEETACTION),
        ("StoreRegistrationAndConfigurationAc", StoreRegistrationAndConfigurationAc),
        ("TransferCargoAction", TOTALS),
        ("UpdateFuelDataAction", UpdateFuelDataAction),
        ("UpdateLoadTableAction", TOTALS),
        ("UpdateTransitLoadTableAction", TOTALS),
    )
]

In [11]:
# Recursive function to find the value for a given key
def find_value(data: dict | list, key: str):
    if isinstance(data, dict):
        for k, v in data.items():
            if k == key:
                if v is None:
                    return None

                if isinstance(v, numbers.Number):
                    return v
                if isinstance(v, str):
                    if v.lower() == "null":
                        return None

                    try:
                        return eval(v)
                    except:
                        raise ValueError(
                            "Value not a number string or a number", key, v
                        )

                return None
            else:
                found = find_value(v, key)
                if found is not None:
                    return found
    elif isinstance(data, list):
        for item in data:
            found = find_value(item, key)
            if found is not None:
                return found
    return None


# Function to apply the recursive search to JSON data
def extract_key(json_str: str, key: str):
    """Decode ``json_str`` and return what ``find_value`` locates for ``key``.

    Errors from ``json.loads`` and from ``find_value`` propagate unchanged.
    """
    return find_value(json.loads(json_str), key)

In [21]:
def extract_df(
    df: pd.DataFrame, progress_bar: bool = False, label: str | None = None
) -> pd.DataFrame:
    """Extract weight values from per-action JSON payload columns into weight columns.

    For every entry of the module-level ``actions`` list, the rows whose
    ``action_name`` equals the action's name and whose ``data_<action name>``
    column is not null are selected. For each ``(weight, field)`` pair of that
    action, the payload is searched with ``extract_key`` and the result is
    written to the column named ``weight.get_key()`` on those rows only.
    Actions without a ``data_<action name>`` column in ``df`` are skipped.

    Args:
        df (pd.DataFrame): Frame with an ``action_name`` column and, per action,
            an optional ``data_<action name>`` column of JSON strings.
            It is modified in place.
        progress_bar (bool, optional): If True, each extraction runs through
            tqdm's ``progress_apply`` so a progress bar is shown. Defaults to False.
        label (str | None, optional): Prefix printed before each action name for
            traceability. If None, only the action name is printed. Defaults to None.

    Returns:
        pd.DataFrame: The same ``df`` object, with the weight columns added or updated.
    """

    if progress_bar:
        # Registers Series.progress_apply, used below.
        tqdm.pandas()

    for action in actions:
        action_name = action.get_name()
        if label is None:
            print(action_name)
        else:
            print(label, action_name)

        data_column = f"data_{action_name}"
        if data_column not in df.columns:
            continue

        # The row selection depends only on the action, so it is computed once
        # per action instead of once per weight.
        mask = (df["action_name"] == action_name) & df[data_column].notna()
        payloads = df.loc[mask, data_column]
        run_apply = payloads.progress_apply if progress_bar else payloads.apply

        for weight, key in action.get_weight_items():
            # Use loc to update the DataFrame directly
            df.loc[mask, weight.get_key()] = run_apply(
                lambda payload, key=key: extract_key(payload, key)
            )

    return df

In [13]:
def process_df(df, file_path, label, progress_bar=False):
    """Build a ``flightid`` column, extract weights, and write the result to parquet.

    Args:
        df: Input frame. ``replace`` returns a new frame, so the caller's
            object is not modified.
        file_path: Destination of the parquet file.
        label: Prefix forwarded to ``extract_df`` for its progress prints.
        progress_bar: Forwarded to ``extract_df``.

    Returns:
        None. The result is only written to ``file_path``.
    """
    frame = df.replace({None: pd.NA})

    # flightid = airline_code _ flight_number _ flight_date _ departure_airport
    id_columns = ("airline_code", "flight_number", "flight_date", "departure_airport")
    flight_id = frame[id_columns[0]].astype(str)
    for column in id_columns[1:]:
        flight_id = flight_id + "_" + frame[column].astype(str)
    frame["flightid"] = flight_id

    extracted = extract_df(frame, progress_bar=progress_bar, label=label)
    extracted.to_parquet(file_path, engine="pyarrow", compression="brotli")

In [22]:
# Process the three datasets concurrently, one worker thread per dataset.
jobs = (
    (ab_conv, parquet_file_abcd_weights, "ABCD"),
    (mn_conv, parquet_file_mnop_weights, "MNOP"),
    (zy_conv, parquet_file_zyxw_weights, "ZYXW"),
)

with ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(process_df, *job) for job in jobs]

    # result() blocks until the task finishes and re-raises any exception
    # raised inside the worker, so failures are not silently dropped.
    for future in futures:
        future.result()

ZYXW CalculateWeightAndTrimAction
ABCD CalculateWeightAndTrimAction
MNOP CalculateWeightAndTrimAction
ZYXW CreateZFWMessageAction
ZYXW CreateLoadsheetAction
ZYXW SendLoadsheetAction
ZYXW StoreRegistrationAndConfigurationAc
ZYXW TransferCargoAction
ZYXW UpdateFuelDataAction
ZYXW UpdateLoadTableAction
ZYXW UpdateTransitLoadTableAction
ABCD CreateZFWMessageAction
ABCD CreateLoadsheetAction
ABCD SendLoadsheetAction
ABCD StoreRegistrationAndConfigurationAc
ABCD TransferCargoAction
ABCD UpdateFuelDataAction
ABCD UpdateLoadTableAction
ABCD UpdateTransitLoadTableAction
MNOP CreateZFWMessageAction
MNOP CreateLoadsheetAction
MNOP SendLoadsheetAction
MNOP StoreRegistrationAndConfigurationAc
MNOP TransferCargoAction
MNOP UpdateFuelDataAction
MNOP UpdateLoadTableAction
MNOP UpdateTransitLoadTableAction


In [23]:
# Read back the parquet files written by process_df, in ABCD / MNOP / ZYXW order.
ab_w, mn_w, zy_w = (
    pd.read_parquet(weights_path)
    for weights_path in (
        parquet_file_abcd_weights,
        parquet_file_mnop_weights,
        parquet_file_zyxw_weights,
    )
)

In [None]:
# Display the MNOP weights frame as this cell's output.
mn_w