In [1]:
import pandas as pd

from tqdm.auto import tqdm
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

In [28]:
# Input parquet files live in ../data. Each converted output sits next to its
# input and differs only by the "_conv" suffix in the file name.
parquet_file_abcd = Path("../data", "ABCD_tripfiles.parquet")
parquet_file_abcd_conv = parquet_file_abcd.with_name("ABCD_tripfiles_conv.parquet")
parquet_file_mnop = Path("../data", "MNOP_tripfiles.parquet")
parquet_file_mnop_conv = parquet_file_mnop.with_name("MNOP_tripfiles_conv.parquet")
parquet_file_zyxw = Path("../data", "ZYXW_tripfiles.parquet")
parquet_file_zyxw_conv = parquet_file_zyxw.with_name("ZYXW_tripfiles_conv.parquet")

# Echo all six paths (space separated, on one line) so the cell output records
# exactly which files this run reads and writes.
print(
    *(
        parquet_file_abcd,
        parquet_file_abcd_conv,
        parquet_file_mnop,
        parquet_file_mnop_conv,
        parquet_file_zyxw,
        parquet_file_zyxw_conv,
    )
)

..\data\ABCD_tripfiles.parquet ..\data\ABCD_tripfiles_conv.parquet ..\data\MNOP_tripfiles.parquet ..\data\MNOP_tripfiles_conv.parquet ..\data\ZYXW_tripfiles.parquet ..\data\ZYXW_tripfiles_conv.parquet


In [29]:
# Load the three raw trip-file tables, one per dataset, in ABCD / MNOP / ZYXW order.
df_abcd, df_mnop, df_zyxw = (
    pd.read_parquet(source)
    for source in (parquet_file_abcd, parquet_file_mnop, parquet_file_zyxw)
)

In [30]:
# print(round(df_abcd.memory_usage(deep=True).sum() / 1024**2, 2), "MB")
# print(round(df_mnop.memory_usage(deep=True).sum() / 1024**2, 2), "MB")
# print(round(df_zyxw.memory_usage(deep=True).sum() / 1024**2, 2), "MB")

In [31]:
from actions import (
    CalculateWeightAndTrimAction,
    CheckinMsgProcessor,
    CreateLoadingInstructionAction,
    CreateLoadSheetAction,
    CreateZFWMessageAction,
    EstimateStorePaxDataAction,
    RampFinalAction,
    SendFuelOrderAction,
    SendLoadingInstructionAction,
    SendLoadsheetAction,
    SetActualBagWeightIndicatorAction,
    SetCKIPaxDistributionAction,
    StoreAircraftDataAction,
    StorePaxDataAction,
    StoreRegistrationAndConfigurationAc,
    TdmCreateLoadingInstructionAction,
    TransferCargoAction,
    TransferCheckinDataAction,
    UpdateEstimatesAction,
    UpdateFuelDataAction,
    UpdateLoadTableAction,
    UpdateTransitLoadTableAction,
)

In [32]:
# Maps an action name (as it appears in the `action_name` column) to the
# `extract` callable that is applied to that action's `entry_details`.
# Every entry uses the class of the same name, except "StorePaxDataGuiAction",
# which reuses StorePaxDataAction's extractor.
action_extractors = {
    name: handler.extract
    for name, handler in (
        ("CalculateWeightAndTrimAction", CalculateWeightAndTrimAction),
        ("CheckinMsgProcessor", CheckinMsgProcessor),
        ("CreateLoadingInstructionAction", CreateLoadingInstructionAction),
        ("CreateLoadSheetAction", CreateLoadSheetAction),
        ("CreateZFWMessageAction", CreateZFWMessageAction),
        ("EstimateStorePaxDataAction", EstimateStorePaxDataAction),
        ("RampFinalAction", RampFinalAction),
        ("SendFuelOrderAction", SendFuelOrderAction),
        ("SendLoadingInstructionAction", SendLoadingInstructionAction),
        ("SendLoadsheetAction", SendLoadsheetAction),
        ("SetActualBagWeightIndicatorAction", SetActualBagWeightIndicatorAction),
        ("SetCKIPaxDistributionAction", SetCKIPaxDistributionAction),
        ("StoreAircraftDataAction", StoreAircraftDataAction),
        ("StorePaxDataAction", StorePaxDataAction),
        ("StorePaxDataGuiAction", StorePaxDataAction),
        ("StoreRegistrationAndConfigurationAc", StoreRegistrationAndConfigurationAc),
        ("TdmCreateLoadingInstructionAction", TdmCreateLoadingInstructionAction),
        ("TransferCargoAction", TransferCargoAction),
        ("TransferCheckinDataAction", TransferCheckinDataAction),
        ("UpdateEstimatesAction", UpdateEstimatesAction),
        ("UpdateFuelDataAction", UpdateFuelDataAction),
        ("UpdateLoadTableAction", UpdateLoadTableAction),
        ("UpdateTransitLoadTableAction", UpdateTransitLoadTableAction),
    )
}

In [33]:
def extract_df(
    df: pd.DataFrame, progress_bar: bool = False, label: str | None = None
) -> pd.DataFrame:
    """Extract specific data based on predefined action extractors, optionally displaying a progress bar and labels.

    This function iterates over a dictionary of action names and their associated extractor functions, applying each
    extractor to the relevant entries in the DataFrame. The results are stored in new columns in the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame from which data will be extracted.
            It must contain columns that match the keys in the action_extractors dictionary.
        progress_bar (bool, optional): If True, displays a progress bar during the data extraction process.
            Useful for visual feedback during long operations. Defaults to False.
        label (str | None, optional): An optional label that prefixes the print statements for better traceability during debugging.
            If None, only the action name is printed. Defaults to None.

    Returns:
        pd.DataFrame: The original DataFrame with additional columns containing the extracted data.
    """

    if progress_bar:
        tqdm.pandas()
    for action_name, extractor in action_extractors.items():
        if extractor is not None:
            if label:
                print(label, action_name)
            else:
                print(action_name)
            if progress_bar:
                df[f"data_{action_name}"] = df[df.action_name == action_name][
                    "entry_details"
                ].progress_apply(extractor)
            else:
                df[f"data_{action_name}"] = df[df.action_name == action_name][
                    "entry_details"
                ].apply(extractor)

    return df

In [35]:
def process_data(df, file_path, label):
    """Run the action extractors over ``df`` and save the result as parquet.

    Args:
        df: DataFrame handed to ``extract_df`` (which adds its columns to this
            same object).
        file_path: Destination of the parquet file.
        label: Prefix passed to ``extract_df`` for its progress print-outs.

    Returns:
        None. The result is written to ``file_path`` with the pyarrow engine and
        brotli compression.
    """
    # No progress bar here; only the printed "<label> <action>" lines are emitted.
    extract_df(df, progress_bar=False, label=label).to_parquet(
        file_path, engine="pyarrow", compression="brotli"
    )


# Convert the three datasets concurrently: one worker thread per dataset.
with ThreadPoolExecutor(max_workers=3) as executor:
    futures = [
        executor.submit(process_data, frame, target, tag)
        for frame, target, tag in (
            (df_abcd, parquet_file_abcd_conv, "ABCD"),
            (df_mnop, parquet_file_mnop_conv, "MNOP"),
            (df_zyxw, parquet_file_zyxw_conv, "ZYXW"),
        )
    ]

    # Block until every task has finished, in submission order. result()
    # re-raises any exception that occurred inside the worker.
    for future in futures:
        future.result()

ABCD CalculateWeightAndTrimAction
MNOP CalculateWeightAndTrimAction
ZYXW CalculateWeightAndTrimAction
ZYXW CheckinMsgProcessor
ZYXW CreateLoadingInstructionAction
ZYXW CreateLoadSheetAction
ZYXW CreateZFWMessageAction
ZYXW EstimateStorePaxDataAction
ZYXW RampFinalAction
ZYXW SendFuelOrderAction
ZYXW SendLoadingInstructionAction
ZYXW SendLoadsheetAction
ZYXW SetActualBagWeightIndicatorAction
ABCD CheckinMsgProcessor
ZYXW SetCKIPaxDistributionAction
ZYXW StoreAircraftDataAction
ZYXW StorePaxDataAction
ABCD CreateLoadingInstructionAction
ABCD CreateLoadSheetAction
ABCD CreateZFWMessageAction
ABCD EstimateStorePaxDataAction
ZYXW StorePaxDataGuiAction
ABCD RampFinalAction
ZYXW StoreRegistrationAndConfigurationAc
ABCD SendFuelOrderAction
ABCD SendLoadingInstructionAction
ABCD SendLoadsheetAction
ABCD SetActualBagWeightIndicatorAction
ZYXW TdmCreateLoadingInstructionAction
ZYXW TransferCargoAction
ABCD SetCKIPaxDistributionAction
ABCD StoreAircraftDataAction
ABCD StorePaxDataAction
ZYXW Trans

In [None]:
# print("ABCD")
# df_abcd_conv = extract_df(df_abcd, progress_bar=True)
# df_abcd_conv.to_parquet(parquet_file_abcd_conv, engine="pyarrow", compression="brotli")
# print("MNOP")
# df_mnop_conv = extract_df(df_mnop, progress_bar=True)
# df_mnop_conv.to_parquet(parquet_file_mnop_conv, engine="pyarrow", compression="brotli")
# print("ZYXW")
# df_zyxw_conv = extract_df(df_zyxw, progress_bar=True)
# df_zyxw_conv.to_parquet(parquet_file_zyxw_conv, engine="pyarrow", compression="brotli")

In [36]:
# Show the column labels of df_zyxw. extract_df assigns its data_<action_name>
# columns directly onto the frame it is given, so they show up here on the
# frame that was loaded from the raw parquet file.
df_zyxw.columns

Index(['id', 'creation_time', 'airline_code', 'flight_number', 'flight_date',
       'departure_airport', 'user_name', 'action_name', 'header_line',
       'entry_details', 'data_CalculateWeightAndTrimAction',
       'data_CheckinMsgProcessor', 'data_CreateLoadingInstructionAction',
       'data_CreateLoadSheetAction', 'data_CreateZFWMessageAction',
       'data_EstimateStorePaxDataAction', 'data_RampFinalAction',
       'data_SendFuelOrderAction', 'data_SendLoadingInstructionAction',
       'data_SendLoadsheetAction', 'data_SetActualBagWeightIndicatorAction',
       'data_SetCKIPaxDistributionAction', 'data_StoreAircraftDataAction',
       'data_StorePaxDataAction', 'data_StorePaxDataGuiAction',
       'data_StoreRegistrationAndConfigurationAc',
       'data_TdmCreateLoadingInstructionAction', 'data_TransferCargoAction',
       'data_TransferCheckinDataAction', 'data_UpdateEstimatesAction',
       'data_UpdateFuelDataAction', 'data_UpdateLoadTableAction',
       'data_UpdateTransitLoa

In [31]:
# Distinct creation days (first 10 characters of the stringified creation_time)
# among the ZYXW rows whose flight_date equals 13.
df_zyxw.loc[df_zyxw["flight_date"] == 13, "creation_time"].apply(
    lambda stamp: str(stamp)[:10]
).unique()

array(['2024-05-06'], dtype=object)

In [35]:
# For each dataset (ABCD, MNOP, ZYXW) print the distinct creation days, taken as
# the first 10 characters of the stringified creation_time.
for frame in (df_abcd, df_mnop, df_zyxw):
    print(frame["creation_time"].apply(lambda stamp: str(stamp)[:10]).unique())

['2024-05-01' '2024-05-02' '2024-05-03' '2024-05-04' '2024-05-06'
 '2024-05-05' '2024-04-30' '2024-05-07']
['2024-04-30' '2024-05-01' '2024-05-02' '2024-05-03' '2024-05-04'
 '2024-05-05' '2024-05-06' '2024-05-07']
['2024-04-30' '2024-05-01' '2024-05-02' '2024-05-03' '2024-05-04'
 '2024-05-05' '2024-05-06' '2024-05-07']


In [None]:
# Example on how to effectively filter out different variations of the same message
df = df_abcd

# Keep the UpdateEstimatesAction rows whose entry_details do NOT mention the
# EstimateWeightsDTO class, then list the distinct remaining message bodies.
#   * regex=False: the needles are literal, fully qualified class names. With the
#     default regex matching, every "." would match any character.
#   * na=False: rows without entry_details count as "does not contain", so they
#     are kept by the negated conditions.
x = df.loc[
    (df.action_name == "UpdateEstimatesAction")
    # & (~df["entry_details"].isnull())
    & ~df["entry_details"].str.contains(
        "com.onesystem.lc2.estimateshandling.dto.EstimateWeightsDTO",
        na=False,
        regex=False,
    )  # abcd
    & ~df["entry_details"].str.contains(
        "com.systemone.lc2.estimateshandling.dto.EstimateWeightsDTO",
        na=False,
        regex=False,
    )  # zyxw
    # Further optional exclusions, kept for reference:
    # & (
    #     ~df["entry_details"].str.contains("STATUS LOADING_INSTRUCTION", na=False)
    # )  # STATUS LOZYING_INSTRUCTION also included
    # & (~df["entry_details"].str.contains("STATUS LOZYING_INSTRUCTION", na=False))
    # & (
    #     ~df["entry_details"].str.contains("Email receivers", na=False)
    # )  # don't include this
    # & (
    #     ~df["entry_details"].str.contains("Telex receivers", na=False)
    # )  # don't include this
    ,
    "entry_details",
].unique()