## Data Wrangling and Data Exploration Functions

In [17]:
class Data_wrangling:
    
    def __init__(self, df):
        self.df = df
        
    def drop_columns(self, column_names_as_list):
        """ removes columns if exist in dataframe
        Note ones that probably should be removed are ["dup", "index", "no_name", "cancellation_code" ]"""
        for i in range(len(column_names_as_list)):
            if column_names_as_list[i] in df:
                self.df = self.df.drop(column_names_as_list[i], axis = 1)
        return self.df

    def create_haul_type(self):
        """ adds short:0, mid:1, long:2 range haul types from crs_elapsed_time (scheduled) """

        self.df["haul_type"] = self.df['crs_elapsed_time']
        self.df["haul_type"].mask(self.df["haul_type"].values < 180, 0, inplace=True)
        self.df["haul_type"].mask((self.df["haul_type"] >= 180) & (self.df["haul_type"] < 360), 1, inplace=True)
        self.df["haul_type"].mask((self.df["haul_type"] >= 360), 2, inplace=True) 
        df["haul_type"]= df["haul_type"].astype('int')
        return self.df
    

    
def split_time_of_day_departure(df):
    """ takes estimated time of departure and splits in to hours 24 hour clock (local time) """
    df['dep_hour'] = df['crs_dep_time']
    df['dep_hour'] = np.floor(df['dep_hour']/100).astype("int")
    return df
  
    
def split_time_of_day_arrival(df):
    """ takes estimated time of arrival and splits in to hours 24 hour clock (local time) """
    df['arr_hour'] = df['crs_arr_time']
    df['arr_hour'] = np.floor(df['arr_hour']/100).astype("int")
    return df
    
    
def split_dest_city_state(df):
    """ separates destination city and states into own columns"""
    df['dest_state'] = df['dest_city_name']
    df['dest_city'] = df['dest_city_name']
    
    f_state= lambda x: x.split(sep=', ')[1]
    f_city= lambda x: x.split(sep=', ')[0]

    df['dest_state'] = df['dest_state'].apply(f_state)
    df['dest_city'] = df['dest_city'].apply(f_city)
    return df


def split_origin_city_state(df):
    """ separates origin city and states into own columns"""
    df['origin_state'] = df['origin_city_name']
    df['origin_city'] = df['origin_city_name']
    
    f_state= lambda x: x.split(sep=', ')[1]
    f_city= lambda x: x.split(sep=', ')[0]

    df['origin_state'] = df['origin_state'].apply(f_state)
    df['origin_city'] = df['origin_city'].apply(f_city)
    return df


def add_is_weekend_feature(df):
    """ creates boolean column to indicate if week is weekend 
        https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.weekday.html
        week starts 0 with monday - 0-5 are weekdays (0) and 6,7 weekends (1) """

    df['weekend'] = df['fl_date'].astype('datetime64[ns]')

    f = lambda x: x.weekday()     
    df['weekend'] = df['weekend'].apply(f).astype('int32')
    df['weekend'].replace({
            0:0,
            1:0,
            2:0,
            3:0,
            4:0,
            5:0,
            6:1,
            7:1
        }, inplace=True)
    return df

def add_weekday(df):
    """ creates boolean column to indicate day of week 
        https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.weekday.html
        week starts 0 with monday) """

    df['weekday'] = df['fl_date'].astype('datetime64[ns]')

    f = lambda x: x.weekday()     
    df['weekday'] = df['weekday'].apply(f).astype('int32')
    return df

## Preprocessing

In [None]:
class Preprocessing:
    def __init__(self, df):
        self.df = df
        
    def split_numeric_categorical(self):
        pass
        
    def scale(self, scalar):
        pass
    
    def remove_highly_correlated_columns(self):
        pass
    
    def drop_targets(self):
        """ removes target variables before modelling"""
#         return drop_columns(self.df, ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay'])
        pass
   

def remove_highly_correlated_features(df, correlation_threshold=0.8):
    #     Anything above correlation threshold will be tossed
    # Assumptions - all numeric, target variable removed
    # step 1
    df_corr = df.corr().abs()

    # step 2
    indices = np.where(df_corr > correlation_threshold)
    indices = [(df_corr.index[x], df_corr.columns[y])
    for x, y in zip(*indices)
        if x != y and x < y]

    # step 3
    for idx in indices: #each pair
        try:
            df.drop(idx[1], axis = 1, inplace=True)
        except KeyError:
            pass
    return(df)

 
def remove_small_variance(x, variance_threshold = 0.1):
    # Assumptions - target variable removed, df is numeric
    # import:
    # from sklearn.feature_selection import VarianceThreshold
    vt = VarianceThreshold(variance_threshold)
    x_transformed = vt.fit_transform(x)
    selected_columns = x.columns[vt.get_support()]
    x_transformed = pd.DataFrame(x_transformed, columns = selected_columns)
    return(x_transformed)

def remove_missing_values(x, missing_percent_drop_threshold=0.5):
#     takes in dataframe, removes missing above a percent threshold - percent out of 1
    total = x.isnull().sum().sort_values(ascending=False)
    percent = (x.isnull().sum()/x.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    missing_data.head(20)

    to_drop = missing_data[missing_data['Percent'] > missing_percent_drop_threshold].index.tolist()
    return(x.drop(to_drop, axis=1, inplace=True))

In [35]:
# ONE HOT ENCODING

def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
#     res = res.drop([feature_to_encode], axis=1)
    return(res)

cat_feats = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Broad_Item_Type']
# cat_feats = data[].index.tolist()
df_dummy = pd.get_dummies(data[cat_feats])
df_dummy.head()

# encode_and_bind(imdb_movies, 'Rated')

# Formating Test Data

In [77]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('../data/flights_samp.csv')

# import fresh df from .csv before using this  otherwise will get double 00:00:00
def convert_testtrain_data_to_test_format(df):
    """ Convert our testing data to be in the same format as the data to test (drop columns and reformat date)"""
    
    #convert date to datetime with 0's
    df.fl_date = (df.fl_date + ' 00:00:00')
    pd.to_datetime(df['fl_date'])
    
    #drop columns not present in test format
    df.drop(columns=['index', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'actual_elapsed_time', 'air_time', 
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'], inplace = True)
    return df
df_new = convert_testtrain_data_to_test_format(df)

In [78]:
def convert_from_test_format_to_fit_predict_format(df):
    """Adds in columns for model fitting and converts to numeric/ encoded categorical for ML model"""
    
    # Split crs_arr_time and crs_dep_time into hour of day (local)
    df = split_time_of_day_departure(df)
    df = split_time_of_day_arrival(df)
    df.drop(columns=['crs_dep_time', 'crs_arr_time'], inplace=True)
    
    # Convert fl_date into day of week  # NOTE MAY WANT TO ADD BACK IN MONTH OR JAN 1 days
    df = add_weekday(df)  
    df.drop(columns=['fl_date'], inplace=True)
    df.weekday = df.weekday.astype(str)
    df = encode_and_bind(df, 'weekday')
    
    # Split city and state 
    split_origin_city_state(df)
    split_dest_city_state(df)
    df.drop(columns=['dest_city_name', 'origin_city_name'], inplace=True)

    # Dest City - Encode
    df = df.dest_city.str.replace({'CA': 10,
                             'TX': 9,
                             'FL': 8,
                             'IL': 7,
                             'NY': 6,
                             'GA': 5,
                             'NC': 4,
                             'CO': 3,
                             'PA': 2,
                             'WA': 1}, regex=False)

    # Dest State - Encode
    
    # Origin City - Encode
    
    # Origin State - Encode

    # Convert Carrier - Encode 
    df = encode_and_bind(df, 'mkt_unique_carrier')

    # Origin Airport - Encode top 10 (rest in 'other') or bin according to passenger or flight volume
    
    # Dest Airport - Encode top 10 or bin according to passenger of flight volume 
    
    # Flight number ??? # drop for now?
    df.drop(columns = ['mkt_carrier_fl_num'], inplace=True)
    
    # Drop rest
    df.drop(columns=['branded_code_share', 'mkt_carrier','op_unique_carrier', 'tail_num', 
                     'op_carrier_fl_num', 'origin_airport_id', 'dest_airport_id', 'dup', 'flights'], inplace = True)
    
    
    return df

df_new = convert_from_test_format_to_fit_predict_format(df_new)    


TypeError: replace() missing 1 required positional argument: 'repl'

In [68]:
df_new.columns

Index(['origin', 'dest', 'crs_elapsed_time', 'distance', 'dep_hour',
       'arr_hour', 'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3',
       'weekday_4', 'weekday_5', 'weekday_6', 'origin_state', 'origin_city',
       'dest_state', 'dest_city', 'mkt_unique_carrier_AA',
       'mkt_unique_carrier_AS', 'mkt_unique_carrier_B6',
       'mkt_unique_carrier_DL', 'mkt_unique_carrier_F9',
       'mkt_unique_carrier_G4', 'mkt_unique_carrier_HA',
       'mkt_unique_carrier_NK', 'mkt_unique_carrier_UA',
       'mkt_unique_carrier_WN'],
      dtype='object')

In [69]:
df_new.head()

Unnamed: 0,origin,dest,crs_elapsed_time,distance,dep_hour,arr_hour,weekday_0,weekday_1,weekday_2,weekday_3,...,mkt_unique_carrier_AA,mkt_unique_carrier_AS,mkt_unique_carrier_B6,mkt_unique_carrier_DL,mkt_unique_carrier_F9,mkt_unique_carrier_G4,mkt_unique_carrier_HA,mkt_unique_carrier_NK,mkt_unique_carrier_UA,mkt_unique_carrier_WN
0,LAX,ORD,251.0,1744.0,0,7,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,ORD,AZO,58.0,122.0,21,23,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,OMA,PHX,175.0,1037.0,16,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,TPA,MSY,95.0,488.0,10,10,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,EWR,DCA,86.0,199.0,18,19,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
