In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

In [2]:
# Load the merged flights + weather test set. The column pandas labelled
# "Unnamed: 0" in the CSV is used as the row index.
df_test = pd.read_csv(
    "~/Desktop/DataDump/Flights_Weather_TEST.csv",
    index_col="Unnamed: 0",
)

In [3]:
# Getting day of the week.
# .assign returns a new frame, so the frame loaded from disk (df_test) is left
# untouched. The datetime columns are parsed first, then the weekday of each
# leg is derived from them (pandas numbers weekdays Monday=0 ... Sunday=6).
df = df_test.assign(
    arr_datetime=lambda frame: pd.to_datetime(frame["arr_datetime"]),
    dep_datetime=lambda frame: pd.to_datetime(frame["dep_datetime"]),
).assign(
    arr_dayofweek=lambda frame: frame["arr_datetime"].dt.dayofweek,
    dep_dayofweek=lambda frame: frame["dep_datetime"].dt.dayofweek,
)

In [4]:
# Picking the features: flight identifiers, schedule fields and the
# departure/arrival weather columns.
columns_to_keep = [
    # flight identity and schedule
    "fl_date", "mkt_carrier", "op_carrier_fl_num", "origin", "dest",
    "dep_datetime", "arr_datetime", "dep_dayofweek", "arr_dayofweek",
    "op_unique_carrier", "tail_num", "crs_elapsed_time", "distance",
    # weather at the departure airport
    "dep_Rain", "dep_Fog", "dep_Snow", "dep_Cold", "dep_Storm", "dep_Hail",
    "dep_Precipitation",
    # weather at the arrival airport
    "arr_Rain", "arr_Fog", "arr_Snow", "arr_Cold", "arr_Storm", "arr_Hail",
    "arr_Precipitation",
    # scheduled departure / arrival clock times
    "crs_dep_time", "crs_arr_time",
]
# .copy() makes df an independent frame. Without it df is a selection of the
# previous frame, and the in-place column assignments done by later cells
# can raise SettingWithCopyWarning.
df = df[columns_to_keep].copy()

In [5]:
from sklearn.preprocessing import OrdinalEncoder

def ordinal_encoder_function(df, lst):
    """
    Replace each column named in ``lst`` with its ordinal (numeric) encoding.

    Every column is fitted independently with ``OrdinalEncoder``, so the
    resulting codes are only meaningful within a single column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode. It is modified in place.
    lst : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, returned for convenience.
    """
    encoder = OrdinalEncoder()
    for column in lst:
        # The encoder expects a 2-D array of shape (n_rows, 1).
        column_2d = df[column].to_numpy().reshape(-1, 1)
        codes = encoder.fit_transform(column_2d)
        # Assign the flat array positionally. Wrapping the result in a new
        # DataFrame (which gets a fresh 0..n-1 index) would make pandas align
        # on index labels and silently produce NaN / misplaced codes whenever
        # df's own index is not exactly 0..n-1.
        df[column] = codes.ravel()
    return df

In [6]:
# Encode the departure- and arrival-side weather columns as ordinal numbers.
# NOTE(review): ordinal_encoder_function builds OrdinalEncoder() with default
# settings, and scikit-learn then orders categories by sorted value rather than
# by severity; confirm the resulting codes reflect the intended ranking.
ord_lst = [
    f"{side}_{condition}"
    for side in ("dep", "arr")
    for condition in ("Rain", "Fog", "Snow", "Cold", "Storm", "Hail", "Precipitation")
]
# The function encodes df in place; as the cell's last expression, the
# returned frame is also displayed.
ordinal_encoder_function(df, ord_lst)

Unnamed: 0,fl_date,mkt_carrier,op_carrier_fl_num,origin,dest,dep_datetime,arr_datetime,dep_dayofweek,arr_dayofweek,op_unique_carrier,...,dep_Precipitation,arr_Rain,arr_Fog,arr_Snow,arr_Cold,arr_Storm,arr_Hail,arr_Precipitation,crs_dep_time,crs_arr_time
0,2020-01-01 00:00:00.000,WN,5888,ONT,SFO,2020-01-01 18:10:00,2020-01-01 19:45:00,2,2,WN,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1810,1945
1,2020-01-01 00:00:00.000,WN,6276,ONT,SFO,2020-01-01 11:50:00,2020-01-01 13:20:00,2,2,WN,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1150,1320
2,2020-01-01 00:00:00.000,WN,4598,ONT,SJC,2020-01-01 20:20:00,2020-01-01 21:30:00,2,2,WN,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,2020,2130
3,2020-01-01 00:00:00.000,WN,4761,ONT,SJC,2020-01-01 13:40:00,2020-01-01 14:55:00,2,2,WN,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1340,1455
4,2020-01-01 00:00:00.000,WN,5162,ONT,SJC,2020-01-01 09:15:00,2020-01-01 10:35:00,2,2,WN,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,915,1035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148461,2020-01-07 00:00:00.000,DL,4813,DTW,JFK,2020-01-07 17:55:00,2020-01-07 19:52:00,1,1,9E,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1755,1952
148462,2020-01-07 00:00:00.000,DL,4814,GSP,LGA,2020-01-07 06:00:00,2020-01-07 07:59:00,1,1,9E,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,600,759
148463,2020-01-07 00:00:00.000,DL,4815,ATL,XNA,2020-01-07 17:15:00,2020-01-07 18:16:00,1,1,9E,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1715,1816
148464,2020-01-07 00:00:00.000,DL,4815,XNA,ATL,2020-01-07 18:51:00,2020-01-07 21:45:00,1,1,9E,...,0.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,1851,2145


In [7]:
def _sorted_times_by_airport_day(df, airport_col, day_col, time_col):
    """Map each (airport, day-of-week) pair to an ascending array of its scheduled times."""
    grouped = df.groupby([airport_col, day_col], sort=False)[time_col]
    # Missing times can never fall inside a window, so drop them up front.
    return {key: np.sort(times.dropna().to_numpy()) for key, times in grouped}


def _count_in_window(sorted_times, center, half_width):
    """Count entries of ``sorted_times`` within ``center`` +/- ``half_width`` (bounds inclusive)."""
    if sorted_times is None:
        # No movement of this kind at that airport on that day.
        return 0
    first = np.searchsorted(sorted_times, center - half_width, side="left")
    past_last = np.searchsorted(sorted_times, center + half_width, side="right")
    return int(past_last - first)


def count_flights(df, hours=1):
    """
    Add two airport-traffic columns to the dataframe:

    flights_origin = number of scheduled departures + arrivals at the origin
    airport, on the departure day of week, within + or - ``hours`` of the
    flight's scheduled departure time.

    flights_dest = number of scheduled departures + arrivals at the destination
    airport, on the arrival day of week, within + or - ``hours`` of the
    flight's scheduled arrival time.

    The flight itself is included in both counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "origin", "dest", "dep_dayofweek", "arr_dayofweek",
        "crs_dep_time" and "crs_arr_time". Modified in place.
    hours : number, default 1
        Half-width of the time window, in hours.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two integer columns added.

    Notes
    -----
    * Times are compared as HHMM numbers (one hour == 100), so the window
      does not wrap around midnight and a fractional ``hours`` is not an
      exact number of minutes.
    * Days are matched on day of week only; if the data spans several weeks,
      same-weekday flights from different weeks are counted together.
    * Both movements at an airport are matched against the day of the window
      being measured (departure day for the origin, arrival day for the
      destination), which matters for overnight flights.
    * A flight whose origin equals its destination and that falls in the
      window both as a departure and as an arrival counts as two movements.
    * Runs in roughly O(n log n): times are sorted once per (airport, day)
      and each window is resolved with a binary search instead of a
      full-frame scan per row.
    """
    start = time.time()

    # Scheduled times are HHMM numbers, so one hour is a difference of 100.
    half_width = hours * 100

    departures = _sorted_times_by_airport_day(df, "origin", "dep_dayofweek", "crs_dep_time")
    arrivals = _sorted_times_by_airport_day(df, "dest", "arr_dayofweek", "crs_arr_time")

    def traffic_around(airport_col, day_col, time_col):
        # For every row: movements (departures + arrivals) at that airport, on
        # that day, inside the window centred on that row's scheduled time.
        counts = [
            _count_in_window(departures.get((airport, day)), when, half_width)
            + _count_in_window(arrivals.get((airport, day)), when, half_width)
            for airport, day, when in zip(df[airport_col], df[day_col], df[time_col])
        ]
        return np.asarray(counts, dtype="int64")

    # Compute both results before touching df, then assign positionally so the
    # frame's index (non-contiguous, non-integer, duplicated...) is irrelevant.
    flights_origin = traffic_around("origin", "dep_dayofweek", "crs_dep_time")
    flights_dest = traffic_around("dest", "arr_dayofweek", "crs_arr_time")
    df["flights_origin"] = flights_origin
    df["flights_dest"] = flights_dest

    end = time.time()
    print(f"Time elapsed {round(end - start,2)} seconds")

    return df

In [None]:
# Add the flights_origin / flights_dest traffic columns using the default
# +/- 1 hour window. count_flights writes the columns into df and returns it.
df = count_flights(df)


In [None]:
# Sanity check: (rows, columns) of the frame after the traffic columns were added.
df.shape

In [None]:
# Preview the first rows of the enriched frame.
df.head()

In [None]:
# Persist the enriched test set. index=False keeps the row index out of the file.
df.to_csv("~/Desktop/DataDump/Flights_TEST.csv", index=False)