***
### Import of required libraries
***

In [None]:
import random
from pickle import dump

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from traffic.core import Traffic

***
### Import of data
***

In [None]:
# Path of the parquet file to load
data_path = (
    "/mnt/beegfs/store/krum/MT/traffic_complete/complete.parquet"
)

# Read the file into a Traffic object
t = Traffic.from_file(data_path)

***
### One-hot encoding of typecode and SID
***

In [None]:
# Categorical columns that get one-hot encoded
categorical_variables = ["typecode", "SID"]

# Create the encoder and fit it on the categorical columns (fit returns the
# fitted encoder itself)
enc = OneHotEncoder().fit(t.data[categorical_variables])

# Transform the columns and convert the result to a dense array; the names
# of the generated columns are taken from the fitted encoder
encoded_variables = enc.transform(t.data[categorical_variables]).toarray()
new_column_names = enc.get_feature_names_out(categorical_variables)

# Wrap the encoded values in a DataFrame that shares the index of the
# original data and append the new columns to it
encoded_df = pd.DataFrame(
    data=encoded_variables,
    index=t.data.index,
    columns=new_column_names,
)
t.data = pd.concat([t.data, encoded_df], axis="columns")

# Show the resulting column names
t.data.columns

***
### Cyclic encoding of temporal features (hour, weekday, month)
***

In [None]:
# Cyclic encoding of a single column
def cyclic_enc(df, feature, max_val):
    """Add sine and cosine encodings of ``df[feature]`` to ``df``.

    The column is mapped onto an angle with period ``max_val``. Sine and
    cosine of that angle are rescaled from [-1, 1] to [0, 1] and stored in
    the new columns ``<feature>_sin`` and ``<feature>_cos``.

    Args:
        df: DataFrame containing the column ``feature``; modified in place.
        feature: Name of the column to encode.
        max_val: Period of the feature (e.g. 24 for the hour of the day).

    Returns:
        The same ``df`` object with the two encoded columns added.
    """
    angle = 2 * np.pi * df[feature] / max_val
    for suffix, trig in (("_sin", np.sin), ("_cos", np.cos)):
        df[feature + suffix] = 0.5 * trig(angle) + 0.5
    return df


# Apply the cyclic encoding to hour, weekday and month, each with its period
for feature_name, period in (("hour", 24), ("weekday", 7), ("month", 12)):
    t.data = cyclic_enc(t.data, feature_name, period)

# Show the resulting column names
t.data.columns

***
### Min-max scaling of numerical features
***

In [None]:
# Numerical columns that get min-max scaled
num_columns = [
    "latitude",
    "longitude",
    "altitude",
    "toff_weight_kg",
    "wind_x_2min_avg",
    "wind_y_2min_avg",
    "temperature_gnd",
    "humidity_gnd",
    "pressure_gnd",
]

# NOTE(review): the scalers are fitted on the full dataset, i.e. before the
# split into training, validation and test sets performed further below.
# Confirm this is intended, since statistics of the validation and test
# data then influence the scaling.

# Create the scaler and fit it on the numerical columns (fit returns the
# fitted scaler itself)
scaler = MinMaxScaler().fit(t.data[num_columns])

# Scale the columns and append them to the DataFrame; each scaled column
# carries the original name followed by "_scaled"
scaled_values = scaler.transform(t.data[num_columns])
scaled_df = pd.DataFrame(
    data=scaled_values,
    index=t.data.index,
    columns=[f"{col}_scaled" for col in num_columns],
)
t.data = pd.concat([t.data, scaled_df], axis="columns")

# Definition and fitting of a second scaler for the output variables only.
# This simplifies reverse scaling of the output variables later on.
output_var = ["latitude", "longitude", "altitude"]
scaler_out = MinMaxScaler().fit(t.data[output_var])

# Persist both fitted scalers for later use
for scaler_path, fitted_scaler in (
    ("/mnt/beegfs/store/krum/MT/encoded_scaled_split/scaler_in.pkl", scaler),
    (
        "/mnt/beegfs/store/krum/MT/encoded_scaled_split/scaler_out.pkl",
        scaler_out,
    ),
):
    with open(scaler_path, "wb") as f:
        dump(fitted_scaler, f)

***
### Splitting the data into training, validation and test sets
***

In [None]:
# Generate a list of all flight IDs in the dataset
ids = t.data.flight_id.unique().tolist()

# Shuffle the list with a dedicated, seeded generator so that the split is
# reproducible across runs and the global random state is left untouched
split_seed = 42
random.Random(split_seed).shuffle(ids)

# Define the indices that split the list into 60 % training, 20 % validation
# and (the remaining) test IDs
size = len(ids)
first_split = int(0.6 * size)
second_split = first_split + int(0.2 * size)

# Define the IDs for the training, validation and test sets
ids_train = ids[:first_split]
ids_val = ids[first_split:second_split]
ids_test = ids[second_split:]

# Split the data accordingly; whole flights are assigned to a set, so no
# flight is shared between two sets
t_train = t[ids_train]
t_val = t[ids_val]
t_test = t[ids_test]

# Save the data
t_train.to_parquet(
    "/mnt/beegfs/store/krum/MT/encoded_scaled_split/t_train.parquet"
)
t_val.to_parquet(
    "/mnt/beegfs/store/krum/MT/encoded_scaled_split/t_val.parquet"
)
t_test.to_parquet(
    "/mnt/beegfs/store/krum/MT/encoded_scaled_split/t_test.parquet"
)