In [3]:
#Dependencies
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [4]:
def dummy_getter(df, lst):
    """
    One-hot encode the columns named in lst, one column at a time.

    Each listed column is replaced by its dummy columns, which are named
    with the prefix '<column>:'. The input dataframe is not modified; the
    encoded dataframe is returned (df itself when lst is empty).
    """
    encoded = df
    for name in lst:
        encoded = pd.get_dummies(encoded, columns=[name], prefix=f'{name}:')
    return encoded

In [5]:
def scaler_func(x):
    """
    Standardize x with a freshly created StandardScaler.

    The scaler is fitted on x and then applied to that same x; only the
    transformed data is returned, the fitted scaler is discarded.
    """
    return StandardScaler().fit(x).transform(x)

In [6]:
# lst must be an iterable of column names (e.g. a list)
def deleter_col(df, lst):
    """
    Remove every column named in lst from df and hand df back.

    The columns are removed from the passed-in dataframe itself (no copy is
    made), so the caller's object is modified. A name that is not a column
    raises KeyError.
    """
    for name in lst:
        # pop() deletes the column from df; the popped values are not needed.
        df.pop(name)
    return df

In [7]:
from sklearn.preprocessing import OrdinalEncoder

def ordinal_encoder_function(df, lst):
    """
    Replace each categorical column named in lst with its ordinal codes.

    Columns are processed in the order given in lst and each one is encoded
    independently with a default OrdinalEncoder. The code assigned to a
    category is decided by the encoder, not by lst (scikit-learn documents
    the default ``categories='auto'`` as inferring the categories from the
    data, and the default output dtype as float64).

    df is modified in place and also returned.
    """
    encoder = OrdinalEncoder()
    for column in lst:
        # The encoder expects a 2-D (n_samples, n_features) array.
        values = df[column].to_numpy().reshape(-1, 1)
        codes = encoder.fit_transform(values)
        # Assign the flat array positionally. Wrapping the codes in a new
        # DataFrame gave them a fresh 0..n-1 index, and pandas aligns on index
        # labels when assigning, so any df not indexed 0..n-1 (e.g. after
        # filtering or a train/test split) received NaN or misplaced codes.
        df[column] = codes.ravel()
    return df

In [8]:
def month_int_to_str(df, col):
    """
    Overwrite df[col] with three-letter month abbreviations.

    The column is parsed as month numbers (format '%m'), turned into month
    names and cut down to the first three characters. df is changed in place
    and returned.
    """
    as_dates = pd.to_datetime(df[col], format='%m')
    full_names = as_dates.dt.month_name()
    df[col] = full_names.str[:3]
    return df

In [None]:
def evaluate_regression(model, X_test, y_test):
    """
    Print R2, MAE and MSE of model's predictions on the test data.

    The model is asked to predict X_test once; each metric compares y_test
    with those predictions. Nothing is returned.
    """
    predictions = model.predict(X_test)
    metrics = (
        ("R2: ", r2_score),
        ("MAE: ", mean_absolute_error),
        ("MSE: ", mean_squared_error),
    )
    for label, metric in metrics:
        print(label, metric(y_test, predictions))

In [None]:
def count_flights(df, hours=1):
    """
    Add two traffic-count columns to df and return it.

    flights_origin = number of rows (scheduled departures + arrivals) at this
    row's origin airport whose scheduled time lies within + or - `hours` of
    this row's crs_dep_time.

    flights_dest = number of rows (scheduled departures + arrivals) at this
    row's dest airport whose scheduled time lies within + or - `hours` of
    this row's crs_arr_time.

    A row counts as a departure when its origin is the airport and its
    dep_dayofweek equals this row's dep_dayofweek; it counts as an arrival
    when its dest is the airport and its arr_dayofweek equals this row's
    arr_dayofweek. A row matching both conditions is counted once.

    Parameters
    ----------
    df : DataFrame with the columns origin, dest, crs_dep_time, crs_arr_time,
        dep_dayofweek and arr_dayofweek. It is modified in place; any index
        is accepted because rows are processed by position.
    hours : half-width of the time window. It is multiplied by 100 before
        being compared with the crs_* columns, so those are presumably
        HHMM-style numbers -- TODO confirm.

    Returns
    -------
    The same df with the int64 columns flights_origin and flights_dest.

    Every row is compared against the whole frame, so run time grows
    quadratically with the number of rows. Progress and elapsed time are
    printed every 5000 rows and once at the end.
    """
    # Imported locally: no visible import cell of the notebook provides `time`,
    # so the function raised NameError on a fresh kernel.
    import time

    start = time.time()
    window = hours * 100

    # Look the columns up once instead of on every iteration.
    origin = df["origin"]
    dest = df["dest"]
    dep_time = df["crs_dep_time"]
    arr_time = df["crs_arr_time"]
    dep_day = df["dep_dayofweek"]
    arr_day = df["arr_dayofweek"]

    def traffic_near(airport, ref_time, row_dep_day, row_arr_day):
        """Count rows departing from or arriving at airport within the window."""
        earliest = ref_time - window
        latest = ref_time + window
        departing = ((origin == airport) & (dep_day == row_dep_day)
                     & (dep_time >= earliest) & (dep_time <= latest))
        arriving = ((dest == airport) & (arr_day == row_arr_day)
                    & (arr_time >= earliest) & (arr_time <= latest))
        # Summing the boolean mask counts matches without building a filtered frame.
        return int((departing | arriving).sum())

    n_rows = len(df)
    origin_counts = np.zeros(n_rows, dtype="int64")
    dest_counts = np.zeros(n_rows, dtype="int64")

    # Iterate by position rather than by index label, so progress reporting
    # and row lookup do not depend on the index being unique integers.
    for pos in range(n_rows):
        if pos % 5000 == 0 and pos != 0:
            print(f"Remaining = {round(100 - (pos/(n_rows - 1))*100,2)}%")
            print(f"Time elapsed {round(time.time() - start,2)} seconds")

        row_dep_day = dep_day.iat[pos]
        row_arr_day = arr_day.iat[pos]

        # NOTE(review): arrivals at the origin are matched on this row's
        # arrival day, and departures from the destination on this row's
        # departure day. For flights that cross midnight this may not be the
        # intended day -- behavior kept as in the original, please confirm.
        origin_counts[pos] = traffic_near(
            origin.iat[pos], dep_time.iat[pos], row_dep_day, row_arr_day)
        dest_counts[pos] = traffic_near(
            dest.iat[pos], arr_time.iat[pos], row_dep_day, row_arr_day)

    df["flights_origin"] = origin_counts
    df["flights_dest"] = dest_counts

    end = time.time()
    print(f"Time elapsed {round(end - start,2)} seconds")

    return df