In [1]:
import pandas as pd
from pycelonis import get_celonis
from pycelonis_core.utils.errors import PyCelonisNotFoundError
import json

In [2]:
# Airline whose data this notebook processes; used for the input parquet file name
# and as the name of the uploaded Celonis table and data model.
AIRLINE_CODE = "ZYXW"

### Preprocessing

In [3]:
# Load the converted trip-file records of the airline selected by AIRLINE_CODE
df_airline = pd.read_parquet(
    f"../data/{AIRLINE_CODE}_tripfiles_conv.parquet",
    engine="pyarrow",
)

In [4]:
# Create a flightid column that combines airline_code, flight_number, flight_date
# and departure_airport, so the flight stays identifiable after those source
# columns are dropped below.
df_airline["flightid"] = df_airline.apply(
    lambda x: f"{x['airline_code']}_{x['flight_number']}_{x['flight_date']}_{x['departure_airport']}",
    axis=1,
)

# Drop all duplicate rows without considering the id column.
# NOTE: list.remove() mutates in place and returns None, so passing its result as
# `subset` makes drop_duplicates compare every column (id included). The subset
# is therefore built explicitly.
df_airline = df_airline.drop_duplicates(
    subset=[column for column in df_airline.columns if column != "id"]
)

# Drop all columns that are not needed for the analysis. Reassigning (instead of
# inplace=True) keeps the cell free of in-place mutation on a derived frame.
df_airline = df_airline.drop(
    columns=[
        "id",
        "airline_code",
        "flight_number",
        "flight_date",
        "departure_airport",
        "header_line",
        "entry_details",
    ],
)

In [5]:
# Parse the creation_time column into pandas datetime values
df_airline["creation_time"] = df_airline["creation_time"].pipe(pd.to_datetime)

### Celonis client

In [12]:
# Load the connection settings (URL and API token) from the local credentials file
with open("celonis_credentials.json") as credentials_file:
    credentials = json.load(credentials_file)

# Create the Celonis client, authenticating with the stored application key
celonis = get_celonis(
    base_url=credentials["url"],
    api_token=credentials["api_token"],
    key_type="APP_KEY",
)

In [7]:
# Fetch the available data pools and pick the one named "AeroMetrics"
available_pools = celonis.data_integration.get_data_pools()
data_pool = available_pools.find("AeroMetrics")
data_pool

DataPool(id='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4', name='AeroMetrics')

In [8]:
# Push the preprocessed dataframe into the data pool as a table named after the
# airline code, replacing any existing table of that name; creation_time is
# declared explicitly as a DATETIME column.
column_config = [
    {"columnName": "creation_time", "columnType": "DATETIME"},
]
data_pool.create_table(
    df=df_airline,
    table_name=AIRLINE_CODE,
    drop_if_exists=True,
    column_config=column_config,
)

DataPoolTable(name='ZYXW', data_source_id=None, columns=[], schema_name='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4', data_pool_id='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4')

In [9]:
# List the tables currently in the data pool (shown as the cell output)
data_pool.get_tables()

[
	DataPoolTable(name='ABCD', data_source_id=None, columns=[], schema_name='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4', data_pool_id='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4'),
	DataPoolTable(name='MNOP', data_source_id=None, columns=[], schema_name='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4', data_pool_id='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4'),
	DataPoolTable(name='ZYXW', data_source_id=None, columns=[], schema_name='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4', data_pool_id='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4')
]

In [10]:
# Reuse the data model named after the airline code if it already exists;
# otherwise create it and attach the uploaded table to it.
try:
    existing_models = data_pool.get_data_models()
    data_model = existing_models.find(AIRLINE_CODE)
except PyCelonisNotFoundError:
    # No data model with this name yet: create one and register the table in it
    data_model = data_pool.create_data_model(AIRLINE_CODE)
    data_model.add_table(
        name=AIRLINE_CODE,
        alias=AIRLINE_CODE,
    )
data_model

DataModel(id='6fe0b9ad-8b16-4b21-a432-6008cee58809', name='ZYXW', pool_id='f90faeed-3a18-41f0-bf5e-a30f7c40fdc4')