***
### Import of required libraries
***

In [None]:
from functools import partial
from multiprocessing import Pool

import numpy as np
from tqdm.auto import tqdm
from traffic.core import Flight, Traffic

***
### Import of data
***

In [None]:
# Load the training, validation and test splits (in that order) from their
# parquet files.
t_train, t_val, t_test = (
    Traffic.from_file(parquet_path)
    for parquet_path in (
        "/mnt/beegfs/store/krum/MT/encoded_scaled_split/t_train.parquet",
        "/mnt/beegfs/store/krum/MT/encoded_scaled_split/t_val.parquet",
        "/mnt/beegfs/store/krum/MT/encoded_scaled_split/t_test.parquet",
    )
)

***
### Sample generation
***

In [None]:
# Function to generate samples for one flight
def process_flight(
    flight: "Flight",
    n_rows_input: int,
    n_rows_output: int,
    input_columns: list,
    out_columns: list,
    overlap: int,
    interval: int,
):
    """
    Generate sliding-window input/output samples for one flight

    A window starts at every row ``i`` for which ``n_rows_input`` input rows
    followed by ``n_rows_output`` output rows still fit inside the flight.
    The input is rows ``[i, i + n_rows_input)``. The output is taken from rows
    ``[i + n_rows_input - overlap, i + n_rows_input + n_rows_output)``, keeping
    every ``interval``-th row.

    Parameters
    ----------
    flight : traffic.core.Flight
        Flight to generate samples from (only its ``data`` DataFrame is read)
    n_rows_input : int
        Number of rows to include in the input (length of the input sequence)
    n_rows_output : int
        Number of rows after the input that the output window spans
    input_columns : list
        List of columns to include in the input
    out_columns : list
        List of columns to include in the output. They are read directly from
        the flight data and do not have to be part of ``input_columns``
    overlap : int
        Number of timesteps to overlap between input and output (the output
        window starts ``overlap`` rows before the end of the input)
    interval : int
        Interval between output samples (step, in rows, inside the output
        window)

    Returns
    -------
    inputs : np.ndarray
        Array of shape (n_windows, n_rows_input, len(input_columns))
    outputs : np.ndarray
        Array of shape (n_windows, n_output_steps, len(out_columns))

    Both arrays are empty (``size == 0``) when the flight is too short to
    hold a single window.

    Raises
    ------
    ValueError
        If ``interval`` is smaller than 1, or if ``overlap`` is larger than
        ``n_rows_input`` (the output window would start before the flight).
    """
    if interval < 1:
        raise ValueError(f"interval must be >= 1, got {interval}")
    if overlap > n_rows_input:
        # A larger overlap would make the slice start negative for the first
        # window, which silently wraps around to the end of the flight.
        raise ValueError(
            f"overlap ({overlap}) must not exceed n_rows_input ({n_rows_input})"
        )

    data = flight.data
    # Convert to NumPy once; slicing arrays per window is far cheaper than
    # slicing the DataFrame on every iteration.
    input_values = data[input_columns].to_numpy()
    output_values = data[out_columns].to_numpy()

    # Number of start positions for which input + output fit in the flight
    n_windows = len(data) - n_rows_input - n_rows_output + 1

    # Lists to store the input and output samples
    inputs = []
    outputs = []
    for start in range(max(n_windows, 0)):
        input_end = start + n_rows_input
        inputs.append(input_values[start:input_end])
        outputs.append(
            output_values[input_end - overlap : input_end + n_rows_output : interval]
        )
    # Return the input and output samples
    return np.array(inputs), np.array(outputs)


# Columns to include in the input, grouped by feature family.
# NOTE: the column order is significant. The splitting step further down
# slices the saved inputs at column index 8: the first eight columns are kept
# per time step, the remaining ones are read from the first time step only.
input_columns = (
    # Position, wind and ground-weather columns (indices 0-7)
    [
        "latitude_scaled",
        "longitude_scaled",
        "altitude_scaled",
        "wind_x_2min_avg_scaled",
        "wind_y_2min_avg_scaled",
        "temperature_gnd_scaled",
        "humidity_gnd_scaled",
        "pressure_gnd_scaled",
    ]
    # Take-off weight
    + ["toff_weight_kg_scaled"]
    # Aircraft type columns (presumably one-hot encoded)
    + [
        "typecode_A20N",
        "typecode_A21N",
        "typecode_A319",
        "typecode_A320",
        "typecode_A321",
        "typecode_A333",
        "typecode_A343",
        "typecode_B77W",
        "typecode_BCS1",
        "typecode_BCS3",
        "typecode_CRJ9",
        "typecode_DH8D",
        "typecode_E190",
        "typecode_E195",
        "typecode_E290",
        "typecode_E295",
        "typecode_F100",
        "typecode_SB20",
    ]
    # SID columns (presumably one-hot encoded)
    + ["SID_DEGES", "SID_GERSA", "SID_VEBIT", "SID_ZUE"]
    # Sine/cosine columns for hour, weekday and month
    + [
        "hour_sin",
        "hour_cos",
        "weekday_sin",
        "weekday_cos",
        "month_sin",
        "month_cos",
    ]
)

# Columns to include in the output
output_columns = ["latitude_scaled", "longitude_scaled", "altitude_scaled"]

# Parameters for the generation of samples
t_in = 10  # rows per input sequence (passed as n_rows_input)
t_out = 180  # rows spanned by the output window (passed as n_rows_output)
interval_out = 5  # step between output rows (passed as interval)
overlap = 1  # rows shared between input and output (passed as overlap)

# Generate samples for the training, validation and test sets
for traffic_data, set_name in zip([t_train, t_val, t_test], ["train", "val", "test"]):
    # Bind the window parameters once so that each task only carries a flight
    worker = partial(
        process_flight,
        n_rows_input=t_in,
        n_rows_output=t_out,
        input_columns=input_columns,
        out_columns=output_columns,
        overlap=overlap,
        interval=interval_out,
    )
    flights = list(traffic_data)

    # Parallelisation of the generation of samples.
    # imap yields results lazily and in order, so wrapping it in tqdm shows
    # real progress. Wrapping the *input* of starmap does not: starmap
    # consumes its iterable up front and the bar completes immediately.
    with Pool(20) as p:
        results = list(tqdm(p.imap(worker, flights), total=len(flights)))

    # Drop flights that were too short to yield a window. Inputs and outputs
    # are filtered together so the two arrays stay aligned sample by sample.
    kept = [
        (flight_in, flight_out)
        for flight_in, flight_out in results
        if flight_in.size > 0 and flight_out.size > 0
    ]
    if not kept:
        raise ValueError(f"No samples could be generated for the {set_name} set")

    # Concatenate the results. Casting each per-flight array to float32
    # before concatenating avoids building a full-precision copy of the
    # whole set first.
    inputs = np.concatenate(
        [flight_in.astype(np.float32) for flight_in, _ in kept], axis=0
    )
    outputs = np.concatenate(
        [flight_out.astype(np.float32) for _, flight_out in kept], axis=0
    )

    # Save the samples
    np.save(
        f"/mnt/beegfs/store/krum/MT/samples/{set_name}_in32.npy",
        inputs,
    )
    np.save(
        f"/mnt/beegfs/store/krum/MT/samples/{set_name}_out32.npy",
        outputs,
    )

***
### Splitting into variable and constant features
***

In [None]:
# Number of leading feature columns kept for every time step. The remaining
# columns are treated as constant over a sample, so only their value at the
# first time step is stored.
n_variable_features = 8

# For each set (train, val, test)
# (the loop variable is deliberately not called `set`, which would shadow the
# builtin for the rest of the session)
for set_name in ["train", "val", "test"]:
    # Load the input samples
    samples_in = np.load(f"/mnt/beegfs/store/krum/MT/samples/{set_name}_in32.npy")
    # Split the input samples into variable and constant inputs
    in_var = samples_in[:, :, :n_variable_features]
    in_con = samples_in[:, 0:1, n_variable_features:]
    # Save the separated input samples
    np.save(
        f"/mnt/beegfs/store/krum/MT/samples/{set_name}_in32_var.npy",
        in_var.astype(np.float32),
    )
    np.save(
        f"/mnt/beegfs/store/krum/MT/samples/{set_name}_in32_con.npy",
        in_con.astype(np.float32),
    )
    # Print the shape of the input samples
    print(f"Variable input shape {set_name}: {in_var.shape}")
    print(f"Constant input shape {set_name}: {in_con.shape}")