In [None]:
import pandas as pd
from pathlib import Path
import joblib, json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from tqdm import tqdm

import sys
sys.path.append('./rtaUtils')

from rtaUtils import data_loading, paths

In [None]:
# Fallback operator label, appended to the operator codes read below.
# NOTE(review): presumably stands in for flights without a known operator — confirm.
default_operator = 'none'

# Operator vocabulary: the `ICAO` field of every entry under "rows" in
# airlines.json, followed by the fallback label. The encoding is explicit
# because JSON text is UTF-8, while the platform default used by open()
# (e.g. cp1252 on Windows) can fail on or mis-decode non-ASCII bytes
# anywhere in the file.
with open(Path('./data/airlines.json'), encoding='utf-8') as file:
    operators = pd.DataFrame.from_dict(json.load(file)['rows']).ICAO.values.tolist() + [default_operator]

# Fixed vocabulary the `sky_status` label encoder is fitted on.
sky_statuses = ['BKN', 'CAVOK', 'FEW', 'NSC', 'SCT', 'OVC']

# Fixed vocabulary of four-letter airport codes; the `aerodromeOfDeparture`
# label encoder is fitted on it when that feature is selected.
airport_list = ('LEBL','LPPT','LFPO','LFPG','EGLL','LPPR','EDDF','EBBR','EHAM','LEBB','LIRF',
                'EDDM','EGKK','LEVC','LSGG','LECO','LIMC','EIDW','LFML','LEZL','LOWW','LIPZ',
                'LIPE','LFLL','EDDL','EDDB','LTFM','LFMN','LFRS','LROP','EDDP','LGAV','EDDH',
                'LHBP','EKCH','EGCC','ELLX','LKPR','LIRN','LBSF','EPWA')

# Feature selection

In [None]:
# --- Features -------------------------------------------------------------------
# Columns treated as numeric inputs.
numeric_feat = [
    'latitude', 'longitude', 'altitude',
    'vspeed', 'speed', 'day_of_week', 'track',
    'wind_dir_degrees', 'wind_speed_kt', 'visibility_statute_mi',
    'max_temp', 'min_temp', 'clouds', 'hav_distance',
]

# Columns that are label-encoded before being scaled.
categoric_feat = ['operator', 'sky_status']

# Target column.
objective = ['RTA']

# Number of input features (the target is not counted).
num_features = len(numeric_feat) + len(categoric_feat)

# Encoder and scaler generation

In [None]:
# Selectors passed to data_loading.load_clean_data when fitting the encoders
# and the scaler.
# NOTE(review): `month` looks like a glob-style pattern covering 202201-202209,
# and the empty `airports` string presumably means "no airport restriction" —
# confirm both against load_clean_data.
month    = '20220[123456789]'
airports = ''

In [None]:
# Columns used to fit the encoders and the scaler: inputs first, target last.
fit_columns = numeric_feat + categoric_feat + objective
data_copy = data_loading.load_clean_data(month, airports, fit_columns)

# Features whose encoder is fitted on a predefined vocabulary instead of on
# the values found in the data. Every other categorical feature ('operator'
# included) learns its classes from the loaded data.
fixed_vocabularies = {
    'sky_status': sky_statuses,
    'aerodromeOfDeparture': airport_list,
}

# Categoric to numeric conversion
encoders = {}
for feat in categoric_feat:
    le = LabelEncoder()
    if feat in fixed_vocabularies:
        le.fit(fixed_vocabularies[feat])
        codes = le.transform(data_copy[feat])
    else:
        codes = le.fit_transform(data_copy[feat])
    data_copy[feat] = codes.reshape(-1, 1)
    encoders[feat] = le

# Normalization to [0,1] range, fitted on the encoded inputs and the target
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data_copy[fit_columns])

# The frame is only needed for fitting; release it before persisting.
del data_copy

# Persist the fitted objects; the scaler file name carries the input-feature count.
joblib.dump(encoders, paths.utils_path / 'encoder.joblib')
joblib.dump(scaler, paths.utils_path / 'scaler_{}.joblib'.format(num_features));

# Stratified data separation

In [None]:
# Split proportions read by stratify_data.
#   train_ratio: share of flights kept in the first split (the rest become the test set).
#   test_ratio:  complement of train_ratio; derived for reference and not used
#                by the cells visible in this part of the notebook.
#   val_ratio:   share of the *training* flights (not of all flights) that is
#                moved to the validation set in the second split.
train_ratio  = 0.85
test_ratio   = 1 - train_ratio
val_ratio    = 0.15

In [None]:
def generate_flight_stats(data: pd.DataFrame) -> pd.DataFrame:
    """Collapse a table of trajectory rows into one summary row per flight.

    Args:
        data: Frame containing at least the columns ``fpId``,
            ``actualTakeOffTime`` and ``aerodromeOfDeparture``.

    Returns:
        Frame indexed by ``fpId`` with the columns ``to_min`` and ``to_max``
        (smallest and largest ``actualTakeOffTime`` of the flight) and
        ``airportOfOrigin`` (maximum ``aerodromeOfDeparture`` value of the
        flight).
    """
    # TODO(review): the original author flagged this function as carrying
    # leftovers; flight-date/week and time-of-day aggregates were tried here
    # and are currently disabled.
    per_flight_aggregates = {
        'to_min': ('actualTakeOffTime', 'min'),
        'to_max': ('actualTakeOffTime', 'max'),
        'airportOfOrigin': ('aerodromeOfDeparture', 'max'),
    }
    grouped_by_flight = data.groupby(['fpId'])
    return grouped_by_flight.agg(**per_flight_aggregates)

def _rows_for_flights(data: pd.DataFrame, flights: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `data` whose fpId is in `flights.index`, ordered by flight and timestamp."""
    selected = data[data.fpId.isin(flights.index)]
    return selected.sort_values(['fpId', 'timestamp']).reset_index(drop=True)


def stratify_data(data: pd.DataFrame, flights: pd.DataFrame) -> 'tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]':
    """Split trajectory rows into train, test and validation sets by flight.

    Flights (not individual rows) are split, so all rows of one flight end up
    in the same set. Both splits are stratified by ``airportOfOrigin`` and use
    a fixed random seed. The proportions come from the module-level constants
    ``train_ratio`` (first split: train+validation vs. test) and ``val_ratio``
    (second split: share of the remaining flights moved to validation).

    Args:
        data: Trajectory rows; must contain the columns ``fpId`` and
            ``timestamp``.
        flights: One row per flight, indexed by ``fpId``, with an
            ``airportOfOrigin`` column.

    Returns:
        ``(train_data, test_data, val_data)``: three frames with the rows of
        `data` belonging to each set, sorted by ``fpId`` and ``timestamp`` and
        re-indexed from 0.

    Raises:
        ValueError: Propagated from ``train_test_split`` when a split cannot be
            stratified (e.g. an airport of origin with too few flights).
    """
    # First cut: hold out the test flights.
    train_val, test = train_test_split(flights, train_size=train_ratio, random_state=42,
                                       stratify=flights[['airportOfOrigin']], shuffle=True)
    print(f'Trayectorias totales    {flights.shape[0]:>4}')

    # Second cut: take the validation flights out of the remaining ones.
    train, validation = train_test_split(train_val, train_size=1-val_ratio, random_state=42,
                                         stratify=train_val[['airportOfOrigin']], shuffle=True)

    # Materialise each subset once, straight from `data`. The validation
    # flights are a subset of the first-cut training flights, so this selects
    # the same rows as filtering an intermediate training frame would.
    train_data = _rows_for_flights(data, train)
    test_data = _rows_for_flights(data, test)
    val_data = _rows_for_flights(data, validation)

    print(f'   Trayectorias train   {train.shape[0]:>4}')
    print(f'   Trayectorias test    {test.shape[0]:>4}')
    print(f'   Trayectorias val     {validation.shape[0]:>4}')

    return train_data, test_data, val_data

In [None]:
# Ignore loops: keep the last row per fpId from the sort statistics, then only
# the flights whose `rotacion_maniobra` value is below 400.
# NOTE(review): the unit of `rotacion_maniobra` is not visible here — confirm.
sort_stats = (
    pd.read_csv('rtaUtils/sort_stats.csv')
      .drop_duplicates(subset='fpId', keep='last')
)
allowed_flights = sort_stats.loc[sort_stats['rotacion_maniobra'] < 400]

# Operator classes learnt by the encoder that was saved to disk earlier on.
encoders = joblib.load(paths.utils_path / 'encoder.joblib')
operators = encoders['operator'].classes_

In [None]:
# Months to split and export (currently only 2022-10).
months = [f'2022{month_number:02d}' for month_number in range(10, 11)]
# NOTE(review): '*' presumably selects every airport — confirm against load_clean_data.
airports = '*'

for month in months:
    print(month)
    data = data_loading.load_clean_data(month, airports)

    # Keep only rows of allowed (loop-free) flights flown by an operator the
    # saved encoder knows.
    is_allowed_flight = data.fpId.isin(allowed_flights.fpId)
    has_known_operator = data.operator.isin(operators)
    data = data[is_allowed_flight & has_known_operator]

    flights = generate_flight_stats(data)
    train_data, test_data, val_data = stratify_data(data, flights)

    # One parquet file per month and split, written in train/test/val order.
    for split_name, split_frame in (('train', train_data), ('test', test_data), ('val', val_data)):
        split_frame.to_parquet(paths.final_data_path / f'{month}.{split_name}.parquet')