In [None]:
import pandas as pd
import numpy as np
from neuralprophet import NeuralProphet

import plotly.graph_objects as go
import plotly.express as px

from prophet import Prophet

#import matplotlib.pyplot as plt

## Lectura de los datos

In [None]:
# Lift pandas' column-display cap so wide frames show every column, then load
# the cleaned flights dataset.
pd.options.display.max_columns = None
df = pd.read_csv("flightsCleaned.csv")

## Seleccionamos los datos

Queremos predecir los retrasos correspondientes a los 5 aeropuertos con mayor porcentaje de vuelos retrasados. En primer lugar debemos seleccionar dichos aeropuertos. Para ello calcularemos el número total de vuelos, el número total de vuelos retrasados y el % de vuelos retrasados para cada uno de los aeropuertos. Una vez hecho esto, nos quedaremos con los 5 que muestren un % de retrasos más elevado.

In [None]:
# Per-airport delay statistics: total flights, delayed flights
# (ARRIVAL_DELAY > 0) and the percentage of delayed flights.
flights_by_airport = df.groupby("ORIGIN_AIRPORT")["FLIGHT_NUMBER"].count()
delayed_by_airport = (
    df[df["ARRIVAL_DELAY"] > 0].groupby("ORIGIN_AIRPORT")["FLIGHT_NUMBER"].count()
)

airports = flights_by_airport.to_frame("TOTAL_FLIGHTS")
# An airport without any delayed flight is absent from the delayed group; that
# means zero delays, so fill with 0 instead of letting alignment produce NaN.
airports["DELAYED_FLIGHTS"] = delayed_by_airport.reindex(airports.index, fill_value=0)
airports["DELAY_PERCENTAGE"] = np.round(
    airports["DELAYED_FLIGHTS"] / airports["TOTAL_FLIGHTS"] * 100, 2
)

# Sort so that the airports with the highest delay percentage come first, and
# expose ORIGIN_AIRPORT as a regular column with a 0..n-1 index.
airports = airports.sort_values("DELAY_PERCENTAGE", ascending=False).reset_index()
airports

In [None]:
# Origin codes of the five airports ranked first in `airports` (labels 0..4).
top_5_delay = airports.loc[0:4, 'ORIGIN_AIRPORT']

# Restrict the flights to those departing from one of these airports.
is_top_5_origin = df['ORIGIN_AIRPORT'].isin(top_5_delay)
df = df[is_top_5_origin]
list(top_5_delay)

In [None]:
# Preview the first rows of `df` after the top-5 airport filter.
df.head()

In [None]:
# Adjuntar mapa de los 5 aeropuertos


### Por qué no va?

In [None]:
# Arrival-delay distribution (one horizontal box per airport) for the two
# airports ranked first in `top_5_delay`.
a1 = df[df['ORIGIN_AIRPORT'] == top_5_delay[0]]
a2 = df[df['ORIGIN_AIRPORT'] == top_5_delay[1]]

fig = go.Figure()

# One trace per airport; only the data, the colour and the label differ.
for airport_code, airport_flights, box_colour in (
    (top_5_delay[0], a1, "gold"),
    (top_5_delay[1], a2, "mediumseagreen"),
):
    fig.add_trace(
        go.Box(
            x=airport_flights["ARRIVAL_DELAY"],
            marker_color=box_colour,
            name=str(airport_code),
            # NOTE(review): 'all' draws every single flight as a point next to
            # the box; on a large frame this may be what makes the figure slow
            # or fail to render -- confirm, and consider 'outliers'.
            boxpoints='all',
            boxmean=True,
        )
    )

# The delays are plotted along x, so the axis label belongs on the x axis
# (it was previously set on y, which holds the airport names).
fig.update_layout(title="Arrival Delay Distribution", xaxis_title="Arrival delay")

fig.show()


## Forecast

NeuralProphet: https://medium.com/analytics-vidhya/neuralprophet-a-neural-network-based-time-series-model-3c74af3b0ec6

Variable a predecir: **number of delays per airport and day**

### Aeropuerto 1

In [None]:
# Flights of the airport ranked first. `.copy()` makes this an independent
# frame, so later column assignments on `airport_1` do not write into a slice
# of `df`.
airport_1 = df[df['ORIGIN_AIRPORT'] == top_5_delay[0]].copy()

# Aggregate per day: total flights, delayed flights and delay percentage.
flights_per_day = airport_1.groupby('DATE')["FLIGHT_NUMBER"].count()
delayed_per_day = (
    airport_1[airport_1["ARRIVAL_DELAY"] > 0].groupby('DATE')["FLIGHT_NUMBER"].count()
)

airport_1_df = flights_per_day.to_frame("TOTAL_FLIGHTS")
# A day without delayed flights is absent from the delayed group; record it as
# 0 delays rather than NaN.
airport_1_df["DELAYED_FLIGHTS"] = delayed_per_day.reindex(airport_1_df.index, fill_value=0)
airport_1_df["DELAY_PERCENTAGE"] = np.round(
    airport_1_df['DELAYED_FLIGHTS'] / airport_1_df['TOTAL_FLIGHTS'] * 100, 2
)

# Sort the days so that those with the highest delay percentage come first.
airport_1_df = airport_1_df.sort_values('DELAY_PERCENTAGE', ascending=False)
airport_1_df.describe()

In [None]:
# Preview the first rows of the flights selected for the first airport.
airport_1.head()

#### Data preparation

In [None]:
# Build the two-column frame the forecasting model expects: `ds` (timestamp)
# and `y` (number of delayed flights that day), sorted chronologically.
data = (
    airport_1_df
    .reset_index()
    .loc[:, ['DATE', 'DELAYED_FLIGHTS']]
    # Rename the columns to the names required by the model.
    .rename(columns={'DATE': 'ds', 'DELAYED_FLIGHTS': 'y'})
    .assign(
        # Parse the dates once. A second to_datetime call with an explicit
        # format on an already-converted column changes nothing and only
        # suggests a format that may not match the file.
        ds=lambda frame: pd.to_datetime(frame['ds']),
        # A missing delayed-flight count means no delayed flight that day.
        y=lambda frame: frame['y'].fillna(0),
    )
    .sort_values('ds')
    .reset_index(drop=True)
)
data.head()

In [None]:
# Plot the daily series of delayed flights as a dashed steel-blue line.
fig = px.line(
    data,
    x="ds",
    y="y",
    title="Delayed flights per day",
    labels={'ds': 'Date', 'y': 'Number of delays'},
)
fig.update_traces(line=dict(color="steelblue", dash="dash"))
fig.show()

Seasonality?

#### Modelling

Model variables explained in: https://neuralprophet.com/code/forecaster.html

In [None]:
# Explicitly configured NeuralProphet model with auto-regression.
forecast_horizon = 30  # days predicted ahead of each forecast origin

model = NeuralProphet(
    growth="linear",  # Determine trend types: 'linear', 'discontinuous', 'off'
    changepoints=None, # list of dates that may include change points (None -> automatic )
    n_changepoints=5,
    changepoints_range=0.8,
    trend_reg=0,
    trend_reg_threshold=False,
    yearly_seasonality="auto",
    weekly_seasonality="auto",
    daily_seasonality="auto",
    seasonality_mode="additive",
    seasonality_reg=0,
    n_forecasts=forecast_horizon,  # n_forecasts (int) – Number of steps ahead of prediction time step to forecast.
    # n_lags (int) – Previous time series steps to include in auto-regression.
    # Each training sample spans n_lags + n_forecasts consecutive rows, so the
    # previous value len(data)-30 left no complete window once 30 rows are
    # held out. One horizon of history is used instead; tune as needed.
    n_lags=forecast_horizon,
    num_hidden_layers=0,
    d_hidden=None,     # Dimension of hidden layers of AR-Net
    #ar_sparsity=None,  # Sparcity in the AR coefficients
    learning_rate=None,
    epochs=40,
    loss_func="Huber",
    normalize="auto",  # Type of normalization ('minmax', 'standardize', 'soft', 'off')
    impute_missing=True,
    #log_level=None, # Determines the logging level of the logger object
)

In [None]:
# Highlight the last forecast step of the model configured in the previous
# cell. `m` is only created further down, so on a fresh kernel the name in
# scope here is `model`.
model.highlight_nth_step_ahead_of_each_forecast(step_number=model.n_forecasts)

In [None]:
# Hold out the final 30 rows as the test set; everything before is training data.
train = data.iloc[:-30]
test = data.iloc[-30:]

In [None]:
# TODO: open question -- does a forecasting model need to be fitted?
# (The next cell calls model.fit on the training data before predicting.)

In [None]:
# Fit a default model on the training split only; freq="D" declares that the
# observations are daily.
model = NeuralProphet()
metrics = model.fit(train, freq="D")

# Forecast the held-out window (the last month of the series). The future
# frame has to start where `train` ends; building it from the full `data`
# would place the forecast after the test rows, leaving nothing to compare.
future = model.make_future_dataframe(train, periods=len(test))
forecast = model.predict(future)
forecast.tail()

In [None]:
# Slice `forecast` from 60 rows before the end up to 30 rows before the end.
forecast[-30*2:-30]

In [None]:
# Predict over the full observed series; this overwrites the previous `forecast`.
forecast = model.predict(data)

In [None]:
# Plot the forecast with the model's own plot method; the result is kept in `plot1`.
plot1 = model.plot(forecast)

In [None]:
# Plot the components of the forecast; the result is kept in `plt2`.
plt2 = model.plot_components(forecast)

In [None]:
# Plot the parameters of the fitted model.
model.plot_parameters()

#### Training the model

In [None]:
# One twelfth of a 365-day year, truncated to whole days (evaluates to 30).
int(1/12*365)

In [None]:
# Second default model, this time trained with a validation split.
m = NeuralProphet()
# NOTE(review): valid_p=30 is presumably read as a number of validation samples
# rather than a fraction -- confirm against the installed NeuralProphet version.
df_train, df_val = m.split_df(data, freq='D', valid_p = 30)
# Fit on the training part and pass the validation part to the fit call.
metrics = m.fit(df_train, freq='D', validation_df=df_val)

In [None]:
# Make predictions using the fitted model once it has been fitted.
# The future frame is built from the full `data`, requesting 30 extra periods
# and len(data)-30 historic predictions.
future = m.make_future_dataframe(data, periods=30, n_historic_predictions=len(data)-30)
forecast = m.predict(future)

In [None]:
# Data preparation
# ==============================================================================
# Build the date-indexed frame as a new object instead of assigning a column
# into `airport_1` in place: `airport_1` was created by filtering `df`, and an
# in-place column assignment on such a slice triggers SettingWithCopyWarning.
# The dates are parsed with format inference, as in the forecasting data
# preparation; the explicit '%Y/%m/%d' used here before contradicted the
# '%m/%d/%Y' given there.
airport_1 = (
    airport_1
    .assign(DATE=pd.to_datetime(airport_1['DATE']))
    .set_index('DATE')
    .rename(columns={'ARRIVAL_DELAY': 'y'})
    .sort_index()
)

# NOTE(review): asfreq reindexes onto month-start dates. If several flights
# share the same DATE the index has duplicate labels and this probably raises;
# aggregate to one row per date first if that is the case.
airport_1 = airport_1.asfreq('MS')
airport_1.head()

In [None]:
# Keep the last 20% of the rows for testing.
steps = int(round(0.2*len(airport_1),0))

data_train = airport_1[:-steps]
data_test  = airport_1[-steps:]

# DATE is the index of `airport_1` after the preparation cell, so the date
# range is read from the index (there is no `.DATE` column any more).
print(f"Train dates : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
print(f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")

# Draw through pandas' plotting API: `plt` is not imported in this notebook
# (the matplotlib import at the top is commented out).
ax = data_train['y'].plot(figsize=(9, 4), label='train')
fig = ax.get_figure()
data_test['y'].plot(ax=ax, label='test')
ax.legend();

#### Exploring the results

In [None]:
# Preview the first rows of `airport_1` in its current state.
airport_1.head()

In [None]:
# Assigns a one-column DataFrame of FLIGHT_NUMBER counts per ORIGIN_AIRPORT to
# a1["DEPARTURE_DELAY"].
# NOTE(review): the right-hand side is indexed by airport while `a1` is a row
# subset of `df`, so the values probably do not line up with a1's rows --
# confirm the intent. `a1` is rebuilt from `airport_1` in the next cell.
a1["DEPARTURE_DELAY"] =  pd.DataFrame(df.groupby(df['ORIGIN_AIRPORT'])["FLIGHT_NUMBER"].count())

In [None]:
# Keep only the seven delay columns listed below from `airport_1`.
a1 = airport_1.loc[:,['DEPARTURE_DELAY','AIR_SYSTEM_DELAY','SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY','OTHER_DELAY']]

In [None]:
# Mean of the DEPARTURE_DELAY column of `a1`.
a1['DEPARTURE_DELAY'].mean()

In [None]:
# Mean of every delay column per origin airport, with ORIGIN_AIRPORT exposed as
# the first regular column.
delay_columns = [
    "DEPARTURE_DELAY",
    "AIR_SYSTEM_DELAY",
    "SECURITY_DELAY",
    "AIRLINE_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "WEATHER_DELAY",
    "OTHER_DELAY",
]
causas = df.groupby("ORIGIN_AIRPORT")[delay_columns].mean().reset_index()
causas.head()

In [None]:
# TODO: count the number of distinct records per airport

In [None]:
# Radar chart of the mean delay per cause, one closed polygon per airport.
# The labels follow the column order of `causas` after ORIGIN_AIRPORT.
# (The LATE_AIRCRAFT_DELAY label previously carried a stray leading comma.)
categories = ['DEPARTURE_DELAY','AIR_SYSTEM_DELAY','SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY','OTHER_DELAY']

fig = go.Figure()

# One trace per airport row: column 0 is the airport code, the remaining
# columns are the mean delays. Only the first five rows are drawn.
for _, airport_row in causas.head(5).iterrows():
    fig.add_trace(go.Scatterpolar(
        r=list(airport_row.iloc[1:]),
        theta=categories,
        fill='toself',
        name=str(airport_row.iloc[0])
    ))

# NOTE(review): the radial axis is fixed to [0, 5]; any mean above 5 falls
# outside the visible range -- confirm this zoom is intended.
fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 5]
    )),
  showlegend=False
)

fig.show()

In [None]:
# Reload the CSV into `df`, replacing the frame that was filtered to the top-5
# airports earlier in the notebook.
df = pd.read_csv("flightsCleaned.csv")
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Conditions and labels for the flight-type column (by distance).
distance = df['DISTANCE']
condition_distance = [
    distance < 750,
    (distance >= 750) & (distance < 1500),
    distance >= 1500]

choice_distance = ['Short', 'Mid', 'Long']

# Conditions and labels for the delay-type column (by arrival delay).
# The conditions are evaluated in order and the first match wins, so the
# second entry effectively covers 0 <= delay <= 30.
arrival_delay = df['ARRIVAL_DELAY']
condition_delay = [
    arrival_delay < 0,
    arrival_delay <= 30,
    # The upper bound applies to the delay; it was previously tested against
    # DISTANCE, which left almost every 30-60 minute delay unclassified.
    (arrival_delay > 30) & (arrival_delay <= 60),
    arrival_delay > 60]

choice_delay = ['Early arrival', '(0-30)mins', '(30-60)mins','>1h']

In [None]:
# Label every row with the choice of the first condition that holds; rows
# matching no condition get 'Not Specified'.
df['TYPE'] = np.select(condition_distance, choice_distance, default='Not Specified')
df['DELAY_TYPE'] = np.select(condition_delay, choice_delay, default='Not Specified')

In [None]:
# Conditions and amounts for the fine column. The fine depends on the delay
# category (DELAY_TYPE) and the flight category (TYPE); the category labels
# live in those two columns, not in the numeric DISTANCE / ARRIVAL_DELAY
# columns that were compared against the labels before.
delay_type = df['DELAY_TYPE']
flight_type = df['TYPE']

condition_multa = [
    # No fine for early arrivals or delays of up to 30 minutes.
    delay_type.isin(['Early arrival', '(0-30)mins']),
    (flight_type == 'Short') & (delay_type == '(30-60)mins'),
    (flight_type == 'Mid') & (delay_type == '(30-60)mins'),
    (flight_type == 'Long') & (delay_type == '(30-60)mins'),

    (flight_type == 'Short') & (delay_type == '>1h'),
    (flight_type == 'Mid') & (delay_type == '>1h'),
    (flight_type == 'Long') & (delay_type == '>1h')]

# Fine amounts, in the same order as the conditions above.
choice_multa = [0,5000,10000,20000,7500,20000,40000]

In [None]:
# Fine per flight. Rows matching no condition get NaN so the column stays
# numeric; a string default cannot be combined with the integer amounts (the
# result is either all strings or a dtype error, depending on the NumPy
# version).
df['FINE'] = np.select(condition_multa, choice_multa, default=np.nan)

In [None]:
#df = df.loc[:, ['ORIGIN_AIRPORT','DISTANCE', 'TYPE']]

In [None]:
# Flight type by distance, using the condition/choice lists defined in the
# earlier cell (the names `conditionlist` / `choicelist` do not exist).
df['TYPE'] = np.select(condition_distance, choice_distance, default='Not Specified')
# Keep FLIGHT_NUMBER and ARRIVAL_DELAY as well: the aggregation below needs them.
df = df.loc[:, ['ORIGIN_AIRPORT', 'DISTANCE', 'TYPE', 'FLIGHT_NUMBER', 'ARRIVAL_DELAY']]

# Per-airport totals, delayed flights (ARRIVAL_DELAY > 0) and delay percentage.
distnaces = df.groupby('ORIGIN_AIRPORT')["FLIGHT_NUMBER"].count().to_frame("TOTAL_FLIGHTS")
distnaces["DELAYED_FLIGHTS"] = (
    df[df["ARRIVAL_DELAY"] > 0]
    .groupby('ORIGIN_AIRPORT')["FLIGHT_NUMBER"]
    .count()
    # Airports without any delayed flight count as 0, not NaN.
    .reindex(distnaces.index, fill_value=0)
)
# The percentage is computed from this frame's own columns; taking them from
# `airports` (a differently indexed frame) does not align with these rows.
distnaces["DELAY_PERCENTAGE"] = np.round(distnaces['DELAYED_FLIGHTS']/distnaces['TOTAL_FLIGHTS']*100,2)
