## Data Wrangling and Data Exploration Functions

In [17]:
class Data_wrangling:
    """Column-level clean-up helpers that operate on a wrapped DataFrame.

    The wrapped frame is stored on ``self.df``; every method returns the
    current ``self.df`` so calls can be chained or reassigned.
    """

    def __init__(self, df):
        self.df = df

    def drop_columns(self, column_names_as_list):
        """Remove the given columns from ``self.df``, skipping absent ones.

        Columns that probably should be removed are
        ["dup", "index", "no_name", "cancellation_code"].

        Args:
            column_names_as_list: iterable of column labels to drop.

        Returns:
            The updated ``self.df``.
        """
        # Check membership against the wrapped frame (not a module-level
        # ``df``) and de-duplicate so a repeated label is dropped only once.
        present = [
            name
            for name in dict.fromkeys(column_names_as_list)
            if name in self.df
        ]
        if present:
            self.df = self.df.drop(columns=present)
        return self.df

    def create_haul_type(self):
        """Add an integer ``haul_type`` column from ``crs_elapsed_time``.

        Encoding (scheduled elapsed time): short (< 180) -> 0,
        mid (180 to < 360) -> 1, long (>= 360) -> 2.

        Returns:
            ``self.df`` with the new column added in place.

        Raises:
            ValueError: if ``crs_elapsed_time`` contains missing values,
                which cannot be encoded as an integer class.
        """
        elapsed = self.df['crs_elapsed_time']
        if elapsed.isna().any():
            raise ValueError(
                "crs_elapsed_time contains missing values; "
                "cannot derive haul_type"
            )

        # Each threshold crossed bumps the class by one: 0, 1 or 2.
        reached_mid = (elapsed >= 180).astype('int')
        reached_long = (elapsed >= 360).astype('int')
        self.df["haul_type"] = reached_mid + reached_long
        return self.df
    

    
def split_time_of_day_departure(df):
    """Add ``dep_hour``: the hour part of the scheduled departure time.

    ``crs_dep_time`` is divided by 100 and floored, then stored as an
    integer column on ``df`` (modified in place and returned).
    """
    scheduled = df['crs_dep_time']
    whole_hours = np.floor(scheduled / 100)
    df['dep_hour'] = whole_hours.astype("int")
    return df
  
    
def split_time_of_day_arrival(df):
    """Add ``arr_hour``: the hour part of the scheduled arrival time.

    ``crs_arr_time`` is divided by 100 and floored, then stored as an
    integer column on ``df`` (modified in place and returned).
    """
    scheduled = df['crs_arr_time']
    whole_hours = np.floor(scheduled / 100)
    df['arr_hour'] = whole_hours.astype("int")
    return df
    
    
def split_dest_city_state(df):
    """Split ``dest_city_name`` ("City, ST") into ``dest_state`` and ``dest_city``.

    Both new columns are added to ``df`` in place; ``df`` is returned.
    """
    def piece(position):
        # Build an extractor for one side of the ", " separator.
        return lambda label: label.split(sep=', ')[position]

    combined = df['dest_city_name']
    df['dest_state'] = combined.map(piece(1))
    df['dest_city'] = combined.map(piece(0))
    return df


def split_origin_city_state(df):
    """Split ``origin_city_name`` ("City, ST") into ``origin_state`` and ``origin_city``.

    Both new columns are added to ``df`` in place; ``df`` is returned.
    """
    def piece(position):
        # Build an extractor for one side of the ", " separator.
        return lambda label: label.split(sep=', ')[position]

    combined = df['origin_city_name']
    df['origin_state'] = combined.map(piece(1))
    df['origin_city'] = combined.map(piece(0))
    return df


def add_is_weekend_feature(df):
    """Add a 0/1 ``weekend`` column derived from ``fl_date``.

    Day numbering follows pandas' ``weekday`` convention
    (https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.weekday.html):
    Monday is 0 and Sunday is 6, so Saturday (5) and Sunday (6) are the
    weekend (1) and Monday-Friday (0-4) are weekdays (0).

    Args:
        df: DataFrame with an ``fl_date`` column convertible to datetime.

    Returns:
        ``df`` with the int32 ``weekend`` column added in place.
    """
    flight_dates = df['fl_date'].astype('datetime64[ns]')
    # Casting to int first keeps the failure on missing dates instead of
    # silently labelling them as weekdays.
    day_index = flight_dates.dt.weekday.astype('int32')
    df['weekend'] = (day_index >= 5).astype('int32')
    return df

def add_weekday(df):
    """Add an integer ``weekday`` column (day of week) derived from ``fl_date``.

    Numbering follows pandas' ``weekday`` convention, where the week
    starts with Monday as 0:
    https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.weekday.html

    The column is stored as int32; ``df`` is modified in place and returned.
    """
    flight_dates = df['fl_date'].astype('datetime64[ns]')
    df['weekday'] = flight_dates.dt.weekday.astype('int32')
    return df

## Preprocessing

In [None]:
class Preprocessing:
    """Placeholder for model-preparation steps on a wrapped DataFrame.

    Only the constructor does anything; every other method is an
    unimplemented stub that returns ``None``.
    """

    def __init__(self, df):
        self.df = df

    def split_numeric_categorical(self):
        """Not implemented yet."""
        return None

    def scale(self, scalar):
        """Not implemented yet; ``scalar`` is currently unused."""
        return None

    def remove_highly_correlated_columns(self):
        """Not implemented yet."""
        return None

    def drop_targets(self):
        """Remove target variables before modelling (not implemented yet)."""
        # TODO: drop the delay targets from self.df, i.e.
        # ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay'].
        return None
   

def remove_highly_correlated_features(df, correlation_threshold=0.8):
    """Drop the later column of every pair correlated above the threshold.

    Assumes all columns are numeric and the target variable has already
    been removed. ``df`` is modified in place and returned.
    """
    # Absolute pairwise correlations between columns.
    abs_corr = df.corr().abs()

    # Positions of every cell above the threshold; keep only the upper
    # triangle so each pair is seen once and the diagonal is skipped.
    row_pos, col_pos = np.where(abs_corr > correlation_threshold)
    redundant = []
    for r, c in zip(row_pos, col_pos):
        if r < c:
            later_column = abs_corr.columns[c]
            if later_column not in redundant:
                redundant.append(later_column)

    # A column can appear in several pairs; it is dropped exactly once.
    df.drop(columns=redundant, inplace=True)
    return df

 
def remove_small_variance(x, variance_threshold = 0.1):
    """Keep only the columns selected by a scikit-learn VarianceThreshold.

    Assumes the target variable has been removed and ``x`` is numeric.
    Requires ``from sklearn.feature_selection import VarianceThreshold``
    to be in scope.

    Args:
        x: DataFrame of features.
        variance_threshold: threshold passed to ``VarianceThreshold``.

    Returns:
        A new DataFrame holding the selected columns, carrying the same
        row index as ``x`` so it stays aligned with other data indexed
        like ``x``.
    """
    selector = VarianceThreshold(variance_threshold)
    kept_values = selector.fit_transform(x)
    kept_columns = x.columns[selector.get_support()]
    # fit_transform returns a bare array; reattach the caller's index
    # instead of silently renumbering the rows 0..n-1.
    return pd.DataFrame(kept_values, columns=kept_columns, index=x.index)

def remove_missing_values(x, missing_percent_drop_threshold=0.5):
    """Drop columns whose share of missing values exceeds a threshold.

    Args:
        x: DataFrame to clean; it is modified in place.
        missing_percent_drop_threshold: fraction (out of 1) of missing
            values above which a column is dropped. A column exactly at
            the threshold is kept.

    Returns:
        ``x`` itself, after the offending columns have been removed.
    """
    # Fraction of null entries per column.
    missing_fraction = x.isnull().sum() / x.isnull().count()
    to_drop = missing_fraction[
        missing_fraction > missing_percent_drop_threshold
    ].index.tolist()

    # An in-place drop returns None, so hand back the frame explicitly.
    x.drop(columns=to_drop, inplace=True)
    return x

# Formatting Test Data

In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('../data/flights_samp.csv')

# Safe to re-run on an already converted frame: dates are normalised rather
# than having ' 00:00:00' appended again, and missing columns are skipped.
def convert_testtrain_data_to_test_format(df):
    """Convert our train/test data to the same format as the data to test.

    Reformats ``fl_date`` as 'YYYY-MM-DD HH:MM:SS' strings and drops the
    columns that are not present in the test format.

    Args:
        df: DataFrame with an ``fl_date`` column; modified in place.

    Returns:
        The same ``df`` after conversion.
    """
    # Parse then re-render so date-only input gains the ' 00:00:00' time
    # part while already-formatted input is left unchanged.
    parsed_dates = pd.to_datetime(df['fl_date'])
    df['fl_date'] = parsed_dates.dt.strftime('%Y-%m-%d %H:%M:%S')

    # Drop columns not present in the test format (absent ones are ignored).
    df.drop(columns=['index', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'actual_elapsed_time', 'air_time',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'], errors='ignore', inplace=True)
    return df
df_new = convert_testtrain_data_to_test_format(df)

In [26]:
def convert_test_format_to_fit_predict_format(df):
    """Derive model-ready columns from a test-format frame and drop the rest.

    Adds hour-of-day, weekday and city/state columns via the helper
    functions in this file, then removes the raw and unused columns.
    ``df`` is modified in place and returned.
    """
    # Scheduled departure/arrival times -> hour of day (local).
    df = split_time_of_day_arrival(split_time_of_day_departure(df))
    raw_time_columns = ['crs_dep_time', 'crs_arr_time']
    df.drop(raw_time_columns, axis=1, inplace=True)

    # Flight date -> day of week.
    # TODO: may want to add back month or Jan 1 days; weekday still needs encoding.
    df = add_weekday(df)
    df.drop(['fl_date'], axis=1, inplace=True)

    # "City, ST" -> separate city and state columns.
    df = split_origin_city_state(df)
    df = split_dest_city_state(df)

    # Remaining columns that are not used for fitting.
    unused_columns = [
        'branded_code_share', 'mkt_carrier', 'op_unique_carrier', 'tail_num',
        'op_carrier_fl_num', 'origin_airport_id', 'dest_airport_id', 'dup',
        'flights',
    ]
    df.drop(unused_columns, axis=1, inplace=True)

    return df
convert_test_format_to_fit_predict_format(df_new)            

Unnamed: 0,mkt_unique_carrier,mkt_carrier_fl_num,origin,origin_city_name,dest,dest_city_name,crs_elapsed_time,distance,dep_hour,arr_hour,weekday,origin_state,origin_city,dest_state,dest_city
0,AA,362,LAX,"Los Angeles, CA",ORD,"Chicago, IL",251.0,1744.0,0,7,0,CA,Los Angeles,IL,Chicago
1,UA,3788,ORD,"Chicago, IL",AZO,"Kalamazoo, MI",58.0,122.0,21,23,4,IL,Chicago,MI,Kalamazoo
2,WN,5741,OMA,"Omaha, NE",PHX,"Phoenix, AZ",175.0,1037.0,16,18,5,NE,Omaha,AZ,Phoenix
3,WN,1641,TPA,"Tampa, FL",MSY,"New Orleans, LA",95.0,488.0,10,10,1,FL,Tampa,LA,New Orleans
4,UA,4233,EWR,"Newark, NJ",DCA,"Washington, DC",86.0,199.0,18,19,2,NJ,Newark,DC,Washington
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,DL,2173,LAX,"Los Angeles, CA",MSP,"Minneapolis, MN",220.0,1535.0,11,16,1,CA,Los Angeles,MN,Minneapolis
4996,G4,1608,PGD,"Punta Gorda, FL",MDT,"Harrisburg, PA",148.0,963.0,7,9,3,FL,Punta Gorda,PA,Harrisburg
4997,DL,3289,CID,"Cedar Rapids/Iowa City, IA",ATL,"Atlanta, GA",116.0,694.0,16,19,3,IA,Cedar Rapids/Iowa City,GA,Atlanta
4998,AA,2809,LGA,"New York, NY",MIA,"Miami, FL",200.0,1096.0,19,22,5,NY,New York,FL,Miami


In [10]:
convert_test_format_to_fit_predict_format(df)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2019-01-07 00:00:00,AA,AA,AA,362,AA,N161AA,362,12892,LAX,"Los Angeles, CA",13930,ORD,"Chicago, IL",59,710,N,251.0,1.0,1744.0
1,2019-01-04 00:00:00,UA,UA_CODESHARE,UA,3788,ZW,N437AW,3788,13930,ORD,"Chicago, IL",10469,AZO,"Kalamazoo, MI",2135,2333,N,58.0,1.0,122.0
2,2018-12-29 00:00:00,WN,WN,WN,5741,WN,N7738A,5741,13871,OMA,"Omaha, NE",14107,PHX,"Phoenix, AZ",1640,1835,N,175.0,1.0,1037.0
3,2019-01-01 00:00:00,WN,WN,WN,1641,WN,N423WN,1641,15304,TPA,"Tampa, FL",13495,MSY,"New Orleans, LA",1015,1050,N,95.0,1.0,488.0
4,2019-01-02 00:00:00,UA,UA_CODESHARE,UA,4233,EV,N14558,4233,11618,EWR,"Newark, NJ",11278,DCA,"Washington, DC",1825,1951,N,86.0,1.0,199.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2019-01-01 00:00:00,DL,DL,DL,2173,DL,N535US,2173,12892,LAX,"Los Angeles, CA",13487,MSP,"Minneapolis, MN",1100,1640,N,220.0,1.0,1535.0
4996,2019-01-10 00:00:00,G4,G4,G4,1608,G4,253NV,1608,14082,PGD,"Punta Gorda, FL",13230,MDT,"Harrisburg, PA",720,948,N,148.0,1.0,963.0
4997,2019-01-10 00:00:00,DL,DL_CODESHARE,DL,3289,9E,N294PQ,3289,11003,CID,"Cedar Rapids/Iowa City, IA",10397,ATL,"Atlanta, GA",1646,1942,N,116.0,1.0,694.0
4998,2019-01-12 00:00:00,AA,AA,AA,2809,AA,N338RS,2809,12953,LGA,"New York, NY",13303,MIA,"Miami, FL",1929,2249,N,200.0,1.0,1096.0
