In [7]:
import pandas as pd
import json

In [11]:
# Load the converted trip files. Exactly one dataset is active at a time;
# to switch, comment out the active line and uncomment one of the alternatives.
# df = pd.read_parquet("../data/ABCD_tripfiles_conv.parquet")
# df = pd.read_parquet("../data/MNOP_tripfiles_conv.parquet")
df = pd.read_parquet("../data/ZYXW_tripfiles_conv.parquet")

In [12]:
# Helper for extracting data from JSON columns
def extract_json_data(row, column, keys):
    """Return the value found under a key path in a JSON-encoded cell.

    Args:
        row: Object indexable by column name (e.g. a DataFrame row or a dict).
        column: Name of the field in ``row`` that holds the JSON text.
        keys: Sequence of keys followed one after another into the decoded
            JSON, e.g. ``['Flight_Route', 'From']``.

    Returns:
        The value at the end of the key path, or ``None`` when the cell is
        missing, is not valid JSON, or the path cannot be followed (a key is
        absent, or an intermediate value is not a JSON object).
    """
    raw = row[column]
    if pd.isna(raw):
        return None
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return None
    for key in keys:
        # Only JSON objects can be descended into. A missing, null or scalar
        # intermediate value ends the lookup instead of raising AttributeError.
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    return data

# New output columns mapped to (source JSON column, key path inside the decoded JSON).
# Fields read with a single key of the same name are grouped by their source column;
# insertion order is preserved, so columns are created in the order listed here.
# NOTE(review): 'estimated_Child' is extracted from the estimate data, but there is no
# 'Child' entry for data_StorePaxDataAction — confirm this is intentional.
new_columns = {
    field: (source, [field])
    for source, fields in {
        'data_EstimateStorePaxDataAction': [
            'estimated_Y', 'estimated_Jump', 'estimated_Standby', 'estimated_Male',
            'estimated_Female', 'estimated_Child', 'estimated_Infant', 'estimated_Bags',
        ],
        'data_CheckinMsgProcessor': [
            'aircraft_regTailNbr', 'aircraft_Type', 'aircraft_configuration',
        ],
        'data_CreateZFWMessageAction': [
            'airline', 'arrivalStation', 'departureStation', 'flightDateLocal',
            'revisionNumber',
        ],
        'data_StorePaxDataAction': [
            'PAX', 'Y', 'Jump', 'Standby', 'Male', 'Female', 'Infant', 'Bags',
        ],
    }.items()
    for field in fields
}
# Fields that need a nested lookup (key path longer than one element).
# Disabled: 'Flight_Number' and 'Flight_Date', both single-key lookups in
# data_CreateLoadingInstructionAction.
new_columns.update({
    'Flight_Route_From': ('data_CreateLoadingInstructionAction', ['Flight_Route', 'From']),
    'Flight_Route_To': ('data_CreateLoadingInstructionAction', ['Flight_Route', 'To']),
})

# Create the new columns: for every target column, call extract_json_data on each
# row (row-wise apply, axis=1) with the configured source column and key path.
for target_col, (source_col, key_path) in new_columns.items():
    df[target_col] = df.apply(extract_json_data, axis=1, args=(source_col, key_path))

# Build the flightid column by joining the string forms of airline code,
# flight number, flight date and departure airport with underscores.
df['flightid'] = df['airline_code'].astype(str).str.cat(
    [df[part].astype(str) for part in ('flight_number', 'flight_date', 'departure_airport')],
    sep='_',
)

# Columns currently present in the DataFrame
existing_columns = set(df.columns)

# Identifying columns carried over next to the extracted JSON fields.
# flight_suffix is taken as-is from the parquet input (when it is present).
additional_columns = [
    'airline_code', 'flight_number', 'flight_suffix', 'flight_date', 'departure_airport'
]

# Aggregation spec: every extracted column, then every identifying column, each
# aggregated with 'last' — restricted to the columns that actually exist.
agg_dict = dict.fromkeys(
    (name for name in [*new_columns, *additional_columns] if name in existing_columns),
    'last',
)

# Collapse to one row per flightid and turn the group key back into a regular column.
df_agg = df.groupby('flightid').agg(agg_dict).reset_index()


In [13]:
# Replace NULL values with empty strings (currently disabled)
# df_agg.fillna("", inplace=True)

# Save the resulting table as Parquet, without writing the index.
# The commented-out lines are the output paths for the other datasets.
# df_agg.to_parquet("../data/ABCD_flighttable.parquet", index=False)
# df_agg.to_parquet("../data/MNOP_flighttable.parquet", index=False)
df_agg.to_parquet("../data/ZYXW_flighttable.parquet", index=False)

# German success message: "New columns successfully created and saved to file."
print("Neue Spalten erfolgreich erstellt und in Datei gespeichert.")


Neue Spalten erfolgreich erstellt und in Datei gespeichert.


In [4]:
# Read a stored flight table back in and display it.
# NOTE(review): this loads the ABCD flight table, whereas the active save statement in
# this notebook writes the ZYXW one — confirm this is the file you mean to inspect.
df_flight = pd.read_parquet("../data/ABCD_flighttable.parquet")
df_flight

Unnamed: 0,flightid,estimated_Y,estimated_Jump,estimated_Standby,estimated_Male,estimated_Female,estimated_Child,estimated_Infant,estimated_Bags,aircraft_regTailNbr,...,Infant,Bags,Flight_Number,Flight_Date,Flight_Route_From,Flight_Route_To,airline_code,flight_number,flight_date,departure_airport
0,AB_1070_15_BOM,,,,,,,,,,...,,,,,,,AB,1070,15,BOM
1,AB_1070_16_BOM,,,,,,,,,,...,,,,,,,AB,1070,16,BOM
2,AB_1070_17_BOM,,,,,,,,,,...,,,,,,,AB,1070,17,BOM
3,AB_1070_18_BOM,,,,,,,,,,...,,,,,,,AB,1070,18,BOM
4,AB_1070_19_BOM,,,,,,,,,,...,,,,,,,AB,1070,19,BOM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2746,AB_8119_6_BBI,,,,,,,,,,...,,160,,,,,AB,8119,6,BBI
2747,AB_8119_6_GAU,,,,,,,,,,...,,160,,,GAU,CCU,AB,8119,6,GAU
2748,AB_8119_6_LKO,100,,0,87,10,3,0,160,,...,0,160,,,LKO,CCU,AB,8119,6,LKO
2749,AB_8119_7_VNS,100,,,87,10,3,0,0,,...,0,202,,,VNS,CCU,AB,8119,7,VNS
