## Data Wrangling and Data Exploration Functions

In [1]:
class Data_wrangling:
    """Column clean-up and feature helpers around a single DataFrame.

    The frame is kept on ``self.df``; each method returns the current
    ``self.df`` so the result can be used directly by the caller.
    """

    def __init__(self, df):
        self.df = df

    def drop_columns(self, column_names_as_list):
        """Remove the listed columns from ``self.df``, ignoring absent ones.

        Note ones that probably should be removed are
        ["dup", "index", "no_name", "cancellation_code"].

        Args:
            column_names_as_list: iterable of column labels to remove.

        Returns:
            The frame stored on ``self.df`` after the removal.
        """
        present = []
        for name in column_names_as_list:
            # Check against the wrapped frame (not a module-level ``df``) and
            # skip repeats so each label is dropped at most once.
            if name in self.df.columns and name not in present:
                present.append(name)
        if present:
            self.df = self.df.drop(columns=present)
        return self.df

    def create_haul_type(self):
        """Add a ``haul_type`` column derived from ``crs_elapsed_time``.

        Encoding (scheduled elapsed time, same unit as the source column):
            0 (short): value < 180
            1 (mid):   180 <= value < 360
            2 (long):  value >= 360
        Missing values stay missing.

        Returns:
            ``self.df`` with the ``haul_type`` column added or overwritten.
        """
        elapsed = self.df['crs_elapsed_time']

        # Every condition is evaluated on the original values, so an already
        # assigned code (0/1/2) can never be re-classified by a later step,
        # and the boundaries 180 and 360 each fall into exactly one class.
        haul_type = elapsed.mask(elapsed < 180, 0)
        haul_type = haul_type.mask((elapsed >= 180) & (elapsed < 360), 1)
        haul_type = haul_type.mask(elapsed >= 360, 2)

        # Plain column assignment instead of chained ``inplace=True`` masking,
        # which does not reliably write back into the parent frame.
        self.df["haul_type"] = haul_type
        return self.df
    


    
def split_time_of_day_departure(df):
    """Store the scheduled departure hour (24 hour clock, local time) in ``dep_hour``.

    The hour is ``floor(crs_dep_time / 100)`` cast to int, i.e. the column is
    presumably encoded as hhmm -- confirm against the data dictionary.
    ``df`` is modified in place; the new ``dep_hour`` column is returned.
    """
    # Seed the target column from the scheduled departure time first.
    df['dep_hour'] = df['crs_dep_time']
    hundreds = df['dep_hour'].div(100)
    whole_hours = np.floor(hundreds)
    df['dep_hour'] = whole_hours.astype("int")
    return df['dep_hour']
  
    
def split_time_of_day_arrival(df):
    """Store the scheduled arrival hour (24 hour clock, local time) in ``arr_hour``.

    The hour is ``floor(crs_arr_time / 100)`` cast to int, i.e. the column is
    presumably encoded as hhmm -- confirm against the data dictionary.
    ``df`` is modified in place; the new ``arr_hour`` column is returned.
    """
    # Seed the target column from the scheduled arrival time first.
    df['arr_hour'] = df['crs_arr_time']
    hundreds = df['arr_hour'].div(100)
    whole_hours = np.floor(hundreds)
    df['arr_hour'] = whole_hours.astype("int")
    return df['arr_hour']
    
    
def split_dest_city_state(df):
    """Separate destination city and state into their own columns.

    ``dest_city_name`` is split on ", "; the first piece goes to
    ``dest_city`` and the second to ``dest_state``.  Rows that are missing,
    or that contain no ", " separator, get NaN for the unavailable piece
    instead of raising.

    Args:
        df: DataFrame with a ``dest_city_name`` column of strings.

    Returns:
        The same ``df`` object, modified in place with the two new columns.
    """
    # Vectorised split: ``.str[i]`` yields NaN for missing rows and for
    # rows with fewer than i + 1 pieces, unlike indexing a plain list.
    pieces = df['dest_city_name'].str.split(', ')
    df['dest_state'] = pieces.str[1]
    df['dest_city'] = pieces.str[0]
    return df


def split_origin_city_state(df):
    """Separate origin city and state into their own columns.

    ``origin_city_name`` is split on ", "; the first piece goes to
    ``origin_city`` and the second to ``origin_state``.  Rows that are
    missing, or that contain no ", " separator, get NaN for the unavailable
    piece instead of raising.

    Args:
        df: DataFrame with an ``origin_city_name`` column of strings.

    Returns:
        The same ``df`` object, modified in place with the two new columns.
    """
    # Vectorised split: ``.str[i]`` yields NaN for missing rows and for
    # rows with fewer than i + 1 pieces, unlike indexing a plain list.
    pieces = df['origin_city_name'].str.split(', ')
    df['origin_state'] = pieces.str[1]
    df['origin_city'] = pieces.str[0]
    return df


def add_is_weekend_feature(df):
    """Add a ``weekend`` column: 1 for Saturday/Sunday, 0 for Monday-Friday.

    https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.weekday.html
    The weekday number starts at 0 for Monday and ends at 6 for Sunday, so
    0-4 are weekdays (0) and 5, 6 are the weekend (1).

    Args:
        df: DataFrame with an ``fl_date`` column convertible to
            ``datetime64[ns]``.

    Returns:
        The same ``df`` object, modified in place with an int32 ``weekend``
        column.  Dates that cannot be resolved (NaT) are flagged 0.
    """
    day_of_week = df['fl_date'].astype('datetime64[ns]').dt.weekday
    # Saturday is 5 and Sunday is 6; there is no weekday number 7.
    df['weekend'] = (day_of_week >= 5).astype('int32')
    return df

## Preprocessing

In [None]:
class Preprocessing:
    """Skeleton for preprocessing steps applied to ``self.df``.

    Only the constructor does anything; every step below is an
    unimplemented placeholder that returns None.
    """

    def __init__(self, df):
        self.df = df

    def split_numeric_categorical(self):
        """Placeholder -- not implemented yet."""

    def scale(self, scalar):
        """Placeholder -- not implemented yet; ``scalar`` is currently unused."""

    def remove_highly_correlated_columns(self):
        """Placeholder -- not implemented yet."""

    def drop_targets(self):
        """Placeholder for removing target variables before modelling.

        Not implemented yet.
        """
        # TODO: intended to drop the columns 'carrier_delay', 'weather_delay',
        # 'nas_delay' and 'security_delay' from self.df.
   

def remove_highly_correlated_features(df, correlation_threshold=0.8):
    """Drop, in place, the later column of every highly correlated pair.

    Assumptions - all columns numeric, target variable already removed.
    For each pair of columns whose absolute correlation is above
    ``correlation_threshold``, the column that comes second in the
    correlation matrix is removed from ``df``.  Returns the same ``df``.
    """
    # Absolute correlation matrix of the frame.
    abs_corr = df.corr().abs()

    # Positions of all entries above the threshold; keeping only row < col
    # restricts this to the strict upper triangle (each pair once, no diagonal).
    row_pos, col_pos = np.where(abs_corr > correlation_threshold)
    redundant = []
    for r, c in zip(row_pos, col_pos):
        if r < c:
            label = abs_corr.columns[c]
            if label not in redundant:
                redundant.append(label)

    # Remove each redundant column exactly once.
    for label in redundant:
        df.drop(label, axis=1, inplace=True)
    return df

 
def remove_small_variance(x, variance_threshold = 0.1):
    """Return a copy of ``x`` without its low-variance columns.

    Assumptions - target variable removed, ``x`` is a numeric DataFrame.
    Requires ``from sklearn.feature_selection import VarianceThreshold``
    to be in scope.

    Args:
        x: numeric DataFrame of features.
        variance_threshold: threshold passed to ``VarianceThreshold``.

    Returns:
        A new DataFrame holding the selected columns, with the original
        column labels and the original row index of ``x``.
    """
    vt = VarianceThreshold(variance_threshold)
    kept_values = vt.fit_transform(x)
    kept_columns = x.columns[vt.get_support()]
    # fit_transform returns a bare array; re-attach x's index so the rows
    # still line up with the target and with other frames derived from x.
    return pd.DataFrame(kept_values, columns=kept_columns, index=x.index)

def remove_missing_values(x, missing_percent_drop_threshold=0.5):
    """Drop columns whose share of missing values exceeds a threshold.

    Args:
        x: DataFrame to clean; it is modified in place.
        missing_percent_drop_threshold: fraction out of 1. Columns with a
            strictly larger fraction of missing values are removed.

    Returns:
        The same ``x`` object after the columns were dropped.
    """
    # Mean of the boolean null mask is the fraction of missing rows per column.
    missing_fraction = x.isnull().mean()
    to_drop = missing_fraction[missing_fraction > missing_percent_drop_threshold].index.tolist()

    # ``drop(..., inplace=True)`` returns None, so return the frame explicitly.
    x.drop(columns=to_drop, inplace=True)
    return x