In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

In [2]:
df_train = pd.read_csv("~/Desktop/DataDump/Flights_Weather_TEST.csv", index_col = 'Unnamed: 0')

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
       'arr_delay', 'origin_city', 'dest_city', 'dep_datetime', 'arr_datetime',
       'dep_Rain', 'dep_Fog', 'dep_Snow', 'dep_Cold', 'dep_Storm', 'dep_Hail',
       'dep_Precipitation', 'arr_Rain', 'arr_Fog', 'arr_Snow', 'arr_Cold',
       'arr_Storm', 'arr_Hail', 'arr_Precipitation'],
      dtype='object')

In [3]:
# Getting day of the week
df = df_train.copy()
df["arr_datetime"] = pd.to_datetime(df['arr_datetime'])
df["dep_datetime"] = pd.to_datetime(df['dep_datetime'])
df['arr_dayofweek'] = df['arr_datetime'].dt.dayofweek
df['dep_dayofweek'] = df['dep_datetime'].dt.dayofweek

In [4]:
#Picking the features
columns_to_keep = ["dep_datetime", "arr_datetime","dep_dayofweek", "arr_dayofweek","op_unique_carrier", "tail_num", "origin", "dest", "crs_elapsed_time", "distance",
                  "dep_Rain","dep_Fog","dep_Snow","dep_Cold", "dep_Storm", "dep_Hail", "dep_Precipitation",
                   "arr_Rain", "arr_Fog", "arr_Snow", "arr_Cold", "arr_Storm", "arr_Hail", "arr_Precipitation", 
                  "crs_dep_time", "crs_arr_time"]
df = df[columns_to_keep]

In [6]:
from sklearn.preprocessing import OrdinalEncoder

def ordinal_encoder_function(df, lst):
    """
    Create ordinal encoders in df given a list of column names
    """
    encoder=OrdinalEncoder()
    for column in lst:
        a=df[column].values.reshape(-1, 1)
        result = encoder.fit_transform(a)
        df[column]=pd.DataFrame(data=result)
    return df

In [7]:
# Encoding weather data as ordinal numerical feature (based on severity)
ord_lst=['dep_Rain','dep_Fog','dep_Snow','dep_Cold','dep_Storm','dep_Hail','dep_Precipitation','arr_Rain','arr_Fog','arr_Snow','arr_Cold','arr_Storm','arr_Hail','arr_Precipitation']
ordinal_encoder_function(df, ord_lst)

Unnamed: 0,arr_delay,dep_datetime,arr_datetime,dep_dayofweek,arr_dayofweek,op_unique_carrier,tail_num,origin,dest,crs_elapsed_time,...,dep_Precipitation,arr_Rain,arr_Fog,arr_Snow,arr_Cold,arr_Storm,arr_Hail,arr_Precipitation,crs_dep_time,crs_arr_time
0,,2019-01-07 05:02:00,2019-01-07 06:10:00,0,0,OO,N744EV,CLL,DFW,68,...,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,502,610
1,2.0,2019-01-07 10:45:00,2019-01-07 11:50:00,0,0,ZW,N424AW,MLI,ORD,65,...,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,1045,1150
2,10.0,2019-01-07 19:30:00,2019-01-07 22:43:00,0,0,OO,N291SY,ORD,LGA,133,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1930,2243
3,55.0,2019-01-07 11:30:00,2019-01-07 14:38:00,0,0,OO,N272SY,ORD,LGA,128,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1130,1438
4,-14.0,2019-01-07 06:00:00,2019-01-07 07:59:00,0,0,OO,N290SY,GSP,LGA,119,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,600,759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144453,-14.0,2019-01-01 17:54:00,2019-01-01 22:10:00,1,1,AA,N158AN,DFW,LGA,196,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1754,2210
144454,42.0,2019-01-01 20:10:00,2019-01-01 23:08:00,1,1,AA,N355PU,ORD,PHX,238,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,2010,2308
144455,-13.0,2019-01-01 19:34:00,2019-01-01 20:48:00,1,1,AA,N931NN,MCO,MIA,74,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1934,2048
144456,-15.0,2019-01-01 17:35:00,2019-01-01 18:49:00,1,1,AA,N931NN,MIA,MCO,74,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1735,1849


In [8]:
def count_flights(df, hours=1):
    """
    Add 2 columns to the dataframe:
    
    flights_origin =  count the number of flight departures + arrivals planned to happen within
    + or - hours (parameter) of departure at the departure location
    
    flights_dest =  count the number of flight departures + arrivals planned to happen within
    + or - hours (parameter) of arrival at the arrival location
    
    This function is very slow, it requires some optimization
    """
    start = time.time()

    hours = hours*100
    df["flights_origin"] = 0
    df["flights_dest"] = 0
    for i in df.index:
        if i % 5000 == 0 and i != 0:
            print(f"Remaining = {round(100 - (i/max(df.index))*100,2)}%")
            end = time.time()
            tmp = end - start
            print(f"Time elapsed {round(tmp,2)} seconds")
            print(f"Estimated time to finish: {round((tmp*max(df.index))/i*60,2)} minutes")
        origin = df["origin"][i]
        hour_dep = df["crs_dep_time"][i]
        week_day_dep = df["dep_dayofweek"][i]
        week_day_arr = df["arr_dayofweek"][i]
        
        mask = ((df["origin"] == origin) & (df["dep_dayofweek"] == week_day_dep) &
                (df["crs_dep_time"] >= (hour_dep - hours)) & (df["crs_dep_time"] <= (hour_dep + hours)))|\
                ((df["dest"] == origin) & (df["arr_dayofweek"] == week_day_arr) &
                (df["crs_arr_time"] >= (hour_dep - hours)) & (df["crs_arr_time"] <= (hour_dep + hours)))
            
        df.at[i,"flights_origin"] = df[mask].shape[0]
        dest = df["dest"][i]
        hour_arr = df["crs_arr_time"][i]
        
        
        mask = ((df["origin"] == dest) & (df["dep_dayofweek"] == week_day_dep) &
                (df["crs_dep_time"] >= (hour_arr - hours)) & (df["crs_dep_time"] <= (hour_arr + hours)))|\
                ((df["dest"] == dest) & (df["arr_dayofweek"] == week_day_arr) &
                (df["crs_arr_time"] >= (hour_arr - hours)) & (df["crs_arr_time"] <= (hour_arr + hours)))
            
        df.at[i,"flights_dest"] = df[mask].shape[0]
 
    end = time.time()
    print(f"Time elapsed {round(end - start,2)} seconds")
        
    return(df)

In [None]:
df = count_flights(df)


Remaining = 100.0%
Time elapsed 0.02 seconds
Remaining = 96.54%
Time elapsed 148.06 seconds
Remaining = 93.08%
Time elapsed 297.23 seconds
Remaining = 89.62%
Time elapsed 444.47 seconds
Remaining = 86.16%
Time elapsed 593.65 seconds
Remaining = 82.69%
Time elapsed 741.96 seconds
Remaining = 79.23%
Time elapsed 889.68 seconds
Remaining = 75.77%
Time elapsed 1037.72 seconds
Remaining = 72.31%
Time elapsed 1184.38 seconds
Remaining = 68.85%
Time elapsed 1332.6 seconds
Remaining = 65.39%
Time elapsed 1480.67 seconds
Remaining = 61.93%
Time elapsed 1628.89 seconds
Remaining = 58.47%
Time elapsed 1776.44 seconds
Remaining = 55.0%
Time elapsed 1923.7 seconds
Remaining = 51.54%
Time elapsed 2070.67 seconds
Remaining = 48.08%
Time elapsed 2217.45 seconds
Remaining = 44.62%
Time elapsed 2365.54 seconds
Remaining = 41.16%
Time elapsed 2514.13 seconds
Remaining = 37.7%
Time elapsed 2662.19 seconds
Remaining = 34.24%
Time elapsed 2809.26 seconds
Remaining = 30.78%
Time elapsed 2957.56 seconds
Remai

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.to_csv("~/Desktop/DataDump/Flights_TRAIN.csv", index=False)