In [4]:
import json
import re
import io

import pandas as pd
from pathlib import Path

from utils import *

In [120]:
# Define the file paths

parquet_file_abcd_conv = Path("../data/ABCD_tripfiles_conv.parquet")
parquet_file_mnop_conv = Path("../data/MNOP_tripfiles_conv.parquet")
parquet_file_zyxw_conv = Path("../data/ZYXW_tripfiles_conv.parquet")
print(
    parquet_file_abcd_conv,
    parquet_file_mnop_conv,
    parquet_file_zyxw_conv,
)

..\data\ABCD_tripfiles_conv.parquet ..\data\MNOP_tripfiles_conv.parquet ..\data\ZYXW_tripfiles_conv.parquet


In [121]:
ab = pd.read_parquet(parquet_file_abcd_conv)
mn = pd.read_parquet(parquet_file_mnop_conv)
zy = pd.read_parquet(parquet_file_zyxw_conv)

In [5]:
def print_structure(data, indent=1, output=None, toplvl=True):

    if output is None:
        output = io.StringIO()

    """
    Recursively prints the structure of a dictionary or list, showing types and keys.
    """

    indent_str = "  " * indent
    print(indent_str[2:], "{", file=output)

    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, (dict, list)):
                print(f"{indent_str}'{key}':", file=output)
                print_structure(value, indent + 1, output=output, toplvl=False)
            else:
                print(f"{indent_str}'{key}'", file=output)
    elif isinstance(data, list):
        if len(data) > 0:
            print(f"{indent_str}[", file=output)
            for item in data[:3]:  # Print first 3 items only for brevity
                # print(f"{indent_str}[{index}]")
                print_structure(item, indent + 1, output=output, toplvl=False)
            print(f"{indent_str}]", file=output)
        else:
            print(f"{indent_str}[]]", file=output)
    else:
        print(f"{indent_str}{data} ({type(data).__name__})", file=output)
    print(indent_str[2:], "},", file=output)
    if toplvl:
        contents = output.getvalue()
        output.close()
        return contents


# Print the structure of the example dictionary

In [7]:
foo = {
    "TOTAL TRAFFIC LOAD": 12997,
    "DRY OPERATING WEIGHT": 44966,
    "ZERO FUEL WEIGHT ACTUAL": 57963,
    "TAKE OFF FUEL": 5514,
    "TAKE OFF WEIGHT ACTUAL": 63477,
    "TRIP": 2212,
    "LANDING WEIGHT ACTUAL": 61265,
    "TAXI FUEL": 286,
    "PANTRY": 414,
}
bar = print_structure(foo)
print(bar)

 {
  'TOTAL TRAFFIC LOAD'
  'DRY OPERATING WEIGHT'
  'ZERO FUEL WEIGHT ACTUAL'
  'TAKE OFF FUEL'
  'TAKE OFF WEIGHT ACTUAL'
  'TRIP'
  'LANDING WEIGHT ACTUAL'
  'TAXI FUEL'
  'PANTRY'
 },



# Parse Data

In [123]:
# Define the action here

action = "TransferCargoAction"

In [124]:
# All the data will be parsed and loaded from the data column 
ab_action = ab[(ab.action_name == action) & ~(ab[f"data_{action}"]).isna()][
    f"data_{action}"
].apply(json.loads)


mn_action = mn[(mn.action_name == action) & ~(mn[f"data_{action}"]).isna()][
    f"data_{action}"
].apply(json.loads)


zy_action = zy[(zy.action_name == action) & ~(zy[f"data_{action}"]).isna()][
    f"data_{action}"
].apply(json.loads)

In [125]:
# Here a new dataframe is generated containing the structure of the data. If this doesn't work you have to adjust the function print_structure.
ab_action_data = pd.DataFrame()
ab_action_data["data"] = ab_action.apply(print_structure)

mn_action_data = pd.DataFrame()
mn_action_data["data"] = mn_action.apply(print_structure)

zy_action_data = pd.DataFrame()
zy_action_data["data"] = zy_action.apply(print_structure)

In [132]:
# Set df to one of the dataframes
# Go through every dataframe and check the unique structures with the following code
df = mn_action_data

In [133]:
# Now every unique data structure is printed. 
if len(df) > 0:
    for item in df.data.unique():
        print(repr(item))
        print()

" {\n  'LoadDTO':\n   {\n    'id'\n    'flightId'\n    'legId'\n    'deleted'\n    'fragmentId'\n   },\n  'BULK':\n   {\n    []]\n   },\n  'ULD':\n   {\n    [\n     {\n      'Item Id'\n      'ULD'\n      'Number'\n      'Airline'\n      'Special'\n      'Status'\n      'Weight'\n      'Weight unit'\n      'Pieces'\n      'Volume'\n      'Origin'\n      'Onl.Sta.'\n      'Dest.'\n      'Category'\n      'Position'\n      'Confirmed'\n      'Remarks'\n     },\n     {\n      'Item Id'\n      'ULD'\n      'Number'\n      'Airline'\n      'Special'\n      'Status'\n      'Weight'\n      'Weight unit'\n      'Pieces'\n      'Volume'\n      'Origin'\n      'Onl.Sta.'\n      'Dest.'\n      'Category'\n      'Position'\n      'Confirmed'\n      'Remarks'\n     },\n    ]\n   },\n  'Totals':\n   {\n    'Total baggage'\n    'Total cargo'\n    'Total EIC'\n    'Total mail'\n   },\n },\n"

" {\n  'LoadDTO':\n   {\n    'id'\n    'flightId'\n    'legId'\n    'deleted'\n    'fragmentId'\n   },\n  'BULK

In [285]:
# Here as an example these were the unique structures for one of the actions from the three files. 
# As you can see the first three and the last two are the same. 

" {\n  'AIRBORNE'\n  'AIRCRAFT_CONFIG'\n  'ALLOWANCE_CHECK_PERFORMED'\n  'AUTOMATION_STARTED'\n  'AUTO_MODE_ACTIVE'\n  'BAG_LOAD_ITEMS_GEN'\n  'BAG_ULD_ORD'\n  'CABIN_CONFIG'\n  'CALC_HIST_DATA'\n  'CARGO_FINAL'\n  'CARGO_TRANSFER'\n  'CHECK_IN_FINAL'\n  'CHECK_IN_OPEN'\n  'DGR_ITEMS'\n  'EZFW'\n  'EZFW_COUNTER'\n  'FINAL_RELEASE'\n  'FUEL'\n  'FUEL_ORDER'\n  'LDM'\n  'LOADING_INSTRUCTION'\n  'LOADSHEET'\n  'OFFBLOCK'\n  'OFP'\n  'PDM'\n  'RAMP_FINAL'\n  'REGISTRATION'\n  'REGISTRATION_CHANGE'\n  'TRANSIT_ACCEPTANCE'\n  'TRANSIT_PAX'\n },\n"
" {\n  'AIRBORNE'\n  'AIRCRAFT_CONFIG'\n  'ALLOWANCE_CHECK_PERFORMED'\n  'AUTOMATION_STARTED'\n  'AUTO_MODE_ACTIVE'\n  'BAG_LOAD_ITEMS_GEN'\n  'BAG_ULD_ORD'\n  'CABIN_CONFIG'\n  'CALC_HIST_DATA'\n  'CARGO_FINAL'\n  'CARGO_TRANSFER'\n  'CHECK_IN_FINAL'\n  'CHECK_IN_OPEN'\n  'DGR_ITEMS'\n  'EZFW'\n  'EZFW_COUNTER'\n  'FINAL_RELEASE'\n  'FUEL'\n  'FUEL_ORDER'\n  'LDM'\n  'LOADING_INSTRUCTION'\n  'LOADSHEET'\n  'OFFBLOCK'\n  'OFP'\n  'PDM'\n  'RAMP_FINAL'\n  'REGISTRATION'\n  'REGISTRATION_CHANGE'\n  'TRANSIT_ACCEPTANCE'\n  'TRANSIT_PAX'\n },\n"
" {\n  'AIRBORNE'\n  'AIRCRAFT_CONFIG'\n  'ALLOWANCE_CHECK_PERFORMED'\n  'AUTOMATION_STARTED'\n  'AUTO_MODE_ACTIVE'\n  'BAG_LOAD_ITEMS_GEN'\n  'BAG_ULD_ORD'\n  'CABIN_CONFIG'\n  'CALC_HIST_DATA'\n  'CARGO_FINAL'\n  'CARGO_TRANSFER'\n  'CHECK_IN_FINAL'\n  'CHECK_IN_OPEN'\n  'DGR_ITEMS'\n  'EZFW'\n  'EZFW_COUNTER'\n  'FINAL_RELEASE'\n  'FUEL'\n  'FUEL_ORDER'\n  'LDM'\n  'LOADING_INSTRUCTION'\n  'LOADSHEET'\n  'OFFBLOCK'\n  'OFP'\n  'PDM'\n  'RAMP_FINAL'\n  'REGISTRATION'\n  'REGISTRATION_CHANGE'\n  'TRANSIT_ACCEPTANCE'\n  'TRANSIT_PAX'\n },\n"

" {\n  'Discrepancy check result':\n   {\n    'Discrepancy happened'\n   },\n  'Discrepancies':\n   {\n    [\n     {\n      'Type'\n      'Destination'\n      'Bag pieces'\n      'Bag weight'\n     },\n     {\n      'Type'\n      'Destination'\n      'Bag pieces'\n      'Bag weight'\n     },\n     {\n      'Type'\n      'Destination'\n      'Bag pieces'\n      'Bag weight'\n     },\n    ]\n   },\n },\n"
" {\n  'Discrepancy check result':\n   {\n    'Discrepancy happened'\n   },\n  'Discrepancies':\n   {\n    [\n     {\n      'Type'\n      'Destination'\n      'Bag pieces'\n      'Bag weight'\n     },\n     {\n      'Type'\n      'Destination'\n      'Bag pieces'\n      'Bag weight'\n     },\n     {\n      'Type'\n      'Destination'\n      'Bag pieces'\n      'Bag weight'\n     },\n    ]\n   },\n },\n"




" {\n  'AIRBORNE'\n  'AIRCRAFT_CONFIG'\n  'ALLOWANCE_CHECK_PERFORMED'\n  'AUTOMATION_STARTED'\n  'AUTO_MODE_ACTIVE'\n  'BAG_LOAD_ITEMS_GEN'\n  'BAG_ULD_ORD'\n  'CABIN_CONFIG'\n  'CALC_HIST_DATA'\n  'CARGO_FINAL'\n  'CARGO_TRANSFER'\n  'CHECK_IN_FINAL'\n  'CHECK_IN_OPEN'\n  'DGR_ITEMS'\n  'EZFW'\n  'EZFW_COUNTER'\n  'FINAL_RELEASE'\n  'FUEL'\n  'FUEL_ORDER'\n  'LDM'\n  'LOADING_INSTRUCTION'\n  'LOADSHEET'\n  'OFFBLOCK'\n  'OFP'\n  'PDM'\n  'RAMP_FINAL'\n  'REGISTRATION'\n  'REGISTRATION_CHANGE'\n  'TRANSIT_ACCEPTANCE'\n  'TRANSIT_PAX'\n },\n"

In [103]:
i = 0

In [114]:
# You can now print the desired structure without \n ignored so that you can paste it into excel.


print(len(df.data.unique()))

print(df.data.unique()[i])
i += 1

10


IndexError: index 10 is out of bounds for axis 0 with size 10

## Extract all keywords

This can be used to extract all the keywords from e.g. this:

"""Pax Weight = 11850.0 KG Bag Weight = 1920.0 KG Cargo = 56.1 KG Mail = 0.0 KG DOW = 44584.0 KG ZFW = 56434.0 KG\r\n
            STATUS FUEL 2 AIRCRAFT_CONFIG 1 EZFW 2 CARGO_TRANSFER 1 CABIN_CONFIG 1 CALC_HIST_DATA 1 AUTO_MODE_ACTIVE 1 AUTOMATION_STARTED 0 BAG_LOAD_ITEMS_GEN 1 EZFW_COUNTER 2 REGISTRATION 1 REGISTRATION_CHANGE 2',"""

This is helpfull if you realize that there are a lot of unique structures because some keywords are missing sometimes. 
I used this to adjust the extract function of the action so that it always includes every keyword even if it's not in the message (it fills it with null)

In [46]:
s = set()

In [50]:
# Set df to one of the dataframes
# Go through every dataframe and check the unique structures with the following code
df = mn_action_data

In [53]:
df.data.unique()

array([" {\n  'start_weight'\n  'start_index'\n  'crew'\n  'water(%)'\n  'Galley_Codes':\n   {\n    [\n     {\n      'TTL'\n      'Weight'\n      'unit'\n      'Index'\n     },\n     {\n      'TTL'\n      'Weight'\n      'unit'\n      'Index'\n     },\n    ]\n   },\n  'Corrections':\n   {\n    [\n     {\n      'DOW'\n      'Unit'\n      'DOI'\n      'Pos'\n      'Remark'\n     },\n     {\n      'DOW'\n      'Unit'\n      'DOI'\n      'Pos'\n      'Remark'\n     },\n    ]\n   },\n  'Weight_Tables':\n   {\n    [\n     {\n      'Table'\n      'Airline'\n      'Weight'\n      'Unit'\n     },\n     {\n      'Table'\n      'Airline'\n      'Weight'\n      'Unit'\n     },\n     {\n      'Table'\n      'Airline'\n      'Weight'\n      'Unit'\n     },\n    ]\n   },\n  'Max_Weights':\n   {\n    [\n     {\n      'Type'\n      'Left_Value'\n      'Left_Unit'\n      'Right_Value'\n      'Right_Unit'\n     },\n     {\n      'Type'\n      'Left_Value'\n      'Left_Unit'\n      'Right_Value'\n      'R

In [66]:
bar = set()

In [73]:
df = zy_action

In [74]:
for i in df:
    # print(i["Status"])
    # foo =
    bar.update(re.findall(r"'(.*?)'", str(i["Status"])))
    # print(foo)

In [75]:
bar

{'AIRCRAFT_CONFIG',
 'ALLOWANCE_CHECK_PERFORMED',
 'AUTOMATION_STARTED',
 'AUTO_MODE_ACTIVE',
 'BAG_LOAD_ITEMS_GEN',
 'BAG_ULD_ORD',
 'CABIN_CONFIG',
 'CALC_HIST_DATA',
 'CARGO_FINAL',
 'CARGO_TRANSFER',
 'CHECK_IN_FINAL',
 'CHECK_IN_OPEN',
 'DGR_ITEMS',
 'EZFW',
 'EZFW_COUNTER',
 'FINAL_RELEASE',
 'FUEL',
 'FUEL_ORDER',
 'LOADING_INSTRUCTION',
 'LOADSHEET',
 'OFFBLOCK',
 'OFP',
 'RAMP_FINAL',
 'REGISTRATION',
 'REGISTRATION_CHANGE',
 'TRANSIT_ACCEPTANCE',
 'TRANSIT_PAX'}

In [55]:
for item in df.data.unique()[1:]:
    s.update(re.findall(r"'(.*?)'", item))

TypeError: string indices must be integers, not 'str'

In [52]:
print(sorted(list(s)))

['AIRCRAFT_CONFIG', 'ALLOWANCE_CHECK_PERFORMED', 'AUTOMATION_STARTED', 'AUTO_MODE_ACTIVE', 'Airline', 'BAG_LOAD_ITEMS_GEN', 'BAG_ULD_ORD', 'CABIN_CONFIG', 'CALC_HIST_DATA', 'CARGO_FINAL', 'CARGO_TRANSFER', 'CHECK_IN_FINAL', 'CHECK_IN_OPEN', 'Corrections', 'DGR_ITEMS', 'DOI', 'DOW', 'EZFW', 'EZFW_COUNTER', 'FINAL_RELEASE', 'FUEL', 'FUEL_ORDER', 'Galley_Codes', 'Index', 'LOADING_INSTRUCTION', 'LOADSHEET', 'Left_Unit', 'Left_Value', 'Max_Weights', 'OFFBLOCK', 'OFP', 'Pos', 'RAMP_FINAL', 'REGISTRATION', 'REGISTRATION_CHANGE', 'Remark', 'Right_Unit', 'Right_Value', 'Status', 'TRANSIT_ACCEPTANCE', 'TRANSIT_PAX', 'TTL', 'Table', 'Type', 'Unit', 'Weight', 'Weight_Tables', 'crew', 'start_index', 'start_weight', 'unit', 'water(%)']


In [35]:
print(sorted(list(s)))

['AIRCRAFT_CONFIG', 'ALLOWANCE_CHECK_PERFORMED', 'AUTOMATION_STARTED', 'AUTO_MODE_ACTIVE', 'Airline', 'BAG_LOAD_ITEMS_GEN', 'BAG_LOZY_ITEMS_GEN', 'CABIN_CONFIG', 'CALC_HIST_DATA', 'CARGO_FINAL', 'CARGO_TRANSFER', 'CHECK_IN_OPEN', 'Corrections', 'DGR_ITEMS', 'EZFW', 'EZFW_COUNTER', 'FINAL_RELEASE', 'FUEL', 'Galley_Codes', 'Index', 'LOADING_INSTRUCTION', 'LOADSHEET', 'LOZYING_INSTRUCTION', 'LOZYSHEET', 'Left_Unit', 'Left_Value', 'Max_Weights', 'OFFBLOCK', 'RAMP_FINAL', 'REGISTRATION', 'REGISTRATION_CHANGE', 'Right_Unit', 'Right_Value', 'Status', 'TRANSIT_ACCEPTANCE', 'TRANSIT_PAX', 'TTL', 'Table', 'Type', 'Unit', 'Weight', 'Weight_Tables', 'crew', 'start_index', 'start_weight', 'unit', 'water(%)']


In [None]:
['AIRCRAFT_CONFIG', 'AUTOMATION_STARTED', 'AUTO_MODE_ACTIVE', 'Airline', 'BAG_LOAD_ITEMS_GEN', 'CABIN_CONFIG', 'CALC_HIST_DATA', 'CARGO_FINAL', 'CARGO_TRANSFER', 'CHECK_IN_OPEN', 'Corrections', 'EZFW', 'EZFW_COUNTER', 'FINAL_RELEASE', 'FUEL', 'Galley_Codes', 'Index', 'LOADING_INSTRUCTION', 'LOADSHEET', 'Left_Unit', 'Left_Value', 'Max_Weights', 'OFFBLOCK', 'RAMP_FINAL', 'REGISTRATION', 'REGISTRATION_CHANGE', 'Right_Unit', 'Right_Value', 'Status', 'TRANSIT_ACCEPTANCE', 'TRANSIT_PAX', 'TTL', 'Table', 'Type', 'Unit', 'Weight', 'Weight_Tables', 'crew', 'start_index', 'start_weight', 'unit', 'water(%)']
['AIRCRAFT_CONFIG', 'ALLOWANCE_CHECK_PERFORMED', 'AUTOMATION_STARTED', 'AUTO_MODE_ACTIVE', 'Airline', 'BAG_LOAD_ITEMS_GEN', 'BAG_ULD_ORD', 'CABIN_CONFIG', 'CALC_HIST_DATA', 'CARGO_FINAL', 'CARGO_TRANSFER', 'CHECK_IN_FINAL', 'CHECK_IN_OPEN', 'Corrections', 'DGR_ITEMS', 'DOI', 'DOW', 'EZFW', 'EZFW_COUNTER', 'FINAL_RELEASE', 'FUEL', 'FUEL_ORDER', 'Galley_Codes', 'Index', 'LOADING_INSTRUCTION', 'LOADSHEET', 'Left_Unit', 'Left_Value', 'Max_Weights', 'OFFBLOCK', 'OFP', 'Pos', 'RAMP_FINAL', 'REGISTRATION', 'REGISTRATION_CHANGE', 'Remark', 'Right_Unit', 'Right_Value', 'Status', 'TRANSIT_ACCEPTANCE', 'TRANSIT_PAX', 'TTL', 'Table', 'Type', 'Unit', 'Weight', 'Weight_Tables', 'crew', 'start_index', 'start_weight', 'unit', 'water(%)']
