In [71]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
import pandas as pd
import mlp
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the project configuration through the local `mlp` package.
config = mlp.setup.load_config()

# Settings read by the preprocessing helper functions defined below.
preprocess_dict = config["Preprocessing"]

# Display the preprocessing settings for reference.
preprocess_dict

## helper functions

def drop_duplicates(df):
    """Return `df` with only the first row kept for each (date, hr) pair."""
    key_columns = ['date', 'hr']
    # Flag every repeat of a (date, hr) combination after its first occurrence.
    is_repeat = df.duplicated(subset=key_columns, keep='first')
    return df[~is_repeat]

def correct_weather_typos(df, search_strings=None):
    """Lower-case the `weather` column and replace misspellings by regex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string `weather` column. It is not modified.
    search_strings : dict, optional
        Mapping of canonical weather label -> regex pattern. Text matched by
        a pattern is replaced with the corresponding label. Defaults to
        ``preprocess_dict['weather_search_strings']``.

    Returns
    -------
    pandas.DataFrame
        New frame whose `weather` column is lower-cased and corrected.
    """
    if search_strings is None:
        search_strings = preprocess_dict['weather_search_strings']

    lowered = df.weather.str.lower()
    # With no patterns configured there is nothing to replace; only the
    # lower-casing applies (the zip-based unpacking used before raised here).
    if search_strings:
        lowered = lowered.replace(to_replace=list(search_strings.values()),
                                  value=list(search_strings.keys()),
                                  regex=True)
    return df.assign(weather=lowered)

def encode_weather_ordinal(df, weather_ord_encoding=None):
    """Replace the `weather` column with an ordinal `weather_encoded` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `weather`, `date` and `hr` columns. It is not modified.
    weather_ord_encoding : dict, optional
        Mapping of weather label -> ordinal value. Defaults to
        ``preprocess_dict['weather_ord_encoding']``.

    Returns
    -------
    pandas.DataFrame
        Frame sorted by (`date`, `hr`) with `weather` dropped and
        `weather_encoded` added. The lookup is joined with the default
        inner merge, so rows whose `weather` value is not a key of the
        mapping are not present in the result.
    """
    if weather_ord_encoding is None:
        weather_ord_encoding = preprocess_dict['weather_ord_encoding']

    # Label-indexed lookup table; the Series name becomes the new column name.
    lookup_weather = pd.Series(weather_ord_encoding, name="weather_encoded")

    encoded = df.merge(lookup_weather, left_on="weather", right_index=True)
    # The merge does not guarantee the original row order, so restore a
    # chronological ordering before returning.
    return encoded.sort_values(["date", "hr"]).drop(columns="weather")

# remove guest_scooter and registered_scooter values below 0

def remove_negative_scooters(df):
    """Return the rows of `df` in which neither scooter count is below zero."""
    guest_not_negative = ~(df["guest_scooter"] < 0)
    registered_not_negative = ~(df["registered_scooter"] < 0)
    return df.loc[guest_not_negative & registered_not_negative]

# remove entries which have anomalous feels_like_temperature values

def remove_anomalous_temps(df, z_threshold=None):
    """Drop rows where `temperature` and `feels_like_temperature` disagree.

    Both columns are standardised to z-scores (column mean and standard
    deviation); a row is removed when the absolute difference between its
    two z-scores is greater than `z_threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `temperature` and `feels_like_temperature` columns.
        It is not modified.
    z_threshold : float, optional
        Largest tolerated absolute z-score difference. Defaults to
        ``preprocess_dict["z_threshold"]``.

    Returns
    -------
    pandas.DataFrame
        The retained rows with the same columns as `df`.
    """
    if z_threshold is None:
        z_threshold = preprocess_dict["z_threshold"]

    def z_score(values):
        return (values - values.mean()) / values.std()

    # The mask is computed on standalone Series instead of scratch columns,
    # so the frame is not copied and input columns are never overwritten.
    z_score_diff_abs = (z_score(df["temperature"])
                        - z_score(df["feels_like_temperature"])).abs()
    above_threshold = z_score_diff_abs > z_threshold

    # A NaN difference compares as False above, so such rows are kept.
    return df.loc[~above_threshold]

def create_time_features(df, rush_hours=None):
    """Derive calendar and cyclic time features from `date` and `hr`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string `date` column and an `hr` column. It is not
        modified.
    rush_hours : list-like, optional
        Hours flagged as rush hour. Defaults to
        ``preprocess_dict['rush_hours']``.

    Returns
    -------
    pandas.DataFrame
        `df` without `date` and `hr`, with these columns appended:
        `is_weekend` (day of week is 5 or 6), `is_rush_hour`,
        `hr_y`/`hr_x` (sine/cosine of the hour on a 24-step cycle) and
        `month_y`/`month_x` (sine/cosine of the month on a 12-step cycle).
    """
    if rush_hours is None:
        rush_hours = preprocess_dict['rush_hours']

    timestamps = pd.to_datetime(df.date + ' ' + df.hr.astype(str) + ":00")

    # Angles on the unit circle, so the last and first values of each cycle
    # end up next to each other.
    hour_angle = 2 * np.pi * df["hr"] / 24
    month_angle = 2 * np.pi * timestamps.dt.month / 12

    features = df.assign(
        is_weekend=timestamps.dt.dayofweek.isin([5, 6]),
        is_rush_hour=df["hr"].isin(rush_hours),
        hr_y=np.sin(hour_angle),
        hr_x=np.cos(hour_angle),
        month_y=np.sin(month_angle),
        month_x=np.cos(month_angle),
    )

    # `datetime` and `month` are no longer created as scratch columns, but
    # input columns with those names are still removed so the returned
    # schema matches what this function has always produced.
    return features.drop(columns=["date", "hr"])\
                   .drop(columns=["datetime", "month"], errors="ignore")

def combine_scooter_vars(df):
    """Replace the guest and registered counts with their sum, `total_scooter`."""
    component_columns = ["guest_scooter", "registered_scooter"]
    combined = df.guest_scooter + df.registered_scooter
    with_total = df.assign(total_scooter=combined)
    return with_total.drop(columns=component_columns)

def preprocess(raw_df, as_corr_heatmap=True):
    """Run the full preprocessing pipeline on the raw frame.

    The steps run in this order: drop duplicate (date, hr) rows, correct
    weather typos, remove negative scooter counts, remove anomalous
    temperatures, encode weather ordinally, create time features, and
    combine the scooter counts.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Raw input frame. It is passed through the helpers via `.pipe`.
    as_corr_heatmap : bool, default True
        When True (the previous and default behaviour), return the
        correlation matrix of the preprocessed frame as a Styler with a
        "RdYlGn" background gradient. When False, return the preprocessed
        DataFrame itself.

    Returns
    -------
    pandas.io.formats.style.Styler or pandas.DataFrame
        See `as_corr_heatmap`.
    """
    pipeline_steps = (
        drop_duplicates,
        correct_weather_typos,
        remove_negative_scooters,
        remove_anomalous_temps,
        encode_weather_ordinal,
        create_time_features,
        combine_scooter_vars,
    )

    processed = raw_df
    for step in pipeline_steps:
        processed = processed.pipe(step)

    if not as_corr_heatmap:
        return processed
    return processed.corr().style.background_gradient(cmap="RdYlGn")

In [73]:
# Load the project configuration through the local `mlp` package.
config = mlp.setup.load_config()

In [74]:
# Settings read by the preprocessing helper functions in this notebook.
preprocess_dict = config["Preprocessing"]

In [75]:
# Display the preprocessing settings for reference.
preprocess_dict

{'weather_search_strings': {'clear': '\\w*l\\w+r$',
  'cloudy': '\\w*l\\w+dy$',
  'light snow/rain': '\\w*l\\w+t snow.*'},
 'weather_ord_encoding': {'clear': 0,
  'cloudy': 1,
  'light snow/rain': 2,
  'heavy snow/rain': 3},
 'z_threshold': 1.5,
 'rush_hours': [8, 17, 18]}

In [76]:
## helper functions

In [77]:
def drop_duplicates(df):
    """Return `df` with only the first row kept for each (date, hr) pair."""
    key_columns = ['date', 'hr']
    # Flag every repeat of a (date, hr) combination after its first occurrence.
    is_repeat = df.duplicated(subset=key_columns, keep='first')
    return df[~is_repeat]

In [78]:
def correct_weather_typos(df, search_strings=None):
    """Lower-case the `weather` column and replace misspellings by regex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string `weather` column. It is not modified.
    search_strings : dict, optional
        Mapping of canonical weather label -> regex pattern. Text matched by
        a pattern is replaced with the corresponding label. Defaults to
        ``preprocess_dict['weather_search_strings']``.

    Returns
    -------
    pandas.DataFrame
        New frame whose `weather` column is lower-cased and corrected.
    """
    if search_strings is None:
        search_strings = preprocess_dict['weather_search_strings']

    lowered = df.weather.str.lower()
    # With no patterns configured there is nothing to replace; only the
    # lower-casing applies (the zip-based unpacking used before raised here).
    if search_strings:
        lowered = lowered.replace(to_replace=list(search_strings.values()),
                                  value=list(search_strings.keys()),
                                  regex=True)
    return df.assign(weather=lowered)

In [91]:
def encode_weather_ordinal(df, weather_ord_encoding=None):
    """Replace the `weather` column with an ordinal `weather_encoded` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `weather`, `date` and `hr` columns. It is not modified.
    weather_ord_encoding : dict, optional
        Mapping of weather label -> ordinal value. Defaults to
        ``preprocess_dict['weather_ord_encoding']``.

    Returns
    -------
    pandas.DataFrame
        Frame sorted by (`date`, `hr`) with `weather` dropped and
        `weather_encoded` added. The lookup is joined with the default
        inner merge, so rows whose `weather` value is not a key of the
        mapping are not present in the result.
    """
    if weather_ord_encoding is None:
        weather_ord_encoding = preprocess_dict['weather_ord_encoding']

    # Label-indexed lookup table; the Series name becomes the new column name.
    lookup_weather = pd.Series(weather_ord_encoding, name="weather_encoded")

    encoded = df.merge(lookup_weather, left_on="weather", right_index=True)
    # The merge does not guarantee the original row order, so restore a
    # chronological ordering before returning.
    return encoded.sort_values(["date", "hr"]).drop(columns="weather")

In [79]:
# remove guest_scooter and registered_scooter values below 0

def remove_negative_scooters(df):
    """Return the rows of `df` in which neither scooter count is below zero."""
    guest_not_negative = ~(df["guest_scooter"] < 0)
    registered_not_negative = ~(df["registered_scooter"] < 0)
    return df.loc[guest_not_negative & registered_not_negative]

In [80]:
# remove entries which have anomalous feels_like_temperature values

def remove_anomalous_temps(df, z_threshold=None):
    """Drop rows where `temperature` and `feels_like_temperature` disagree.

    Both columns are standardised to z-scores (column mean and standard
    deviation); a row is removed when the absolute difference between its
    two z-scores is greater than `z_threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `temperature` and `feels_like_temperature` columns.
        It is not modified.
    z_threshold : float, optional
        Largest tolerated absolute z-score difference. Defaults to
        ``preprocess_dict["z_threshold"]``.

    Returns
    -------
    pandas.DataFrame
        The retained rows with the same columns as `df`.
    """
    if z_threshold is None:
        z_threshold = preprocess_dict["z_threshold"]

    def z_score(values):
        return (values - values.mean()) / values.std()

    # The mask is computed on standalone Series instead of scratch columns,
    # so the frame is not copied and input columns are never overwritten.
    z_score_diff_abs = (z_score(df["temperature"])
                        - z_score(df["feels_like_temperature"])).abs()
    above_threshold = z_score_diff_abs > z_threshold

    # A NaN difference compares as False above, so such rows are kept.
    return df.loc[~above_threshold]

In [99]:
def create_time_features(df, rush_hours=None):
    """Derive calendar and cyclic time features from `date` and `hr`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string `date` column and an `hr` column. It is not
        modified.
    rush_hours : list-like, optional
        Hours flagged as rush hour. Defaults to
        ``preprocess_dict['rush_hours']``.

    Returns
    -------
    pandas.DataFrame
        `df` without `date` and `hr`, with these columns appended:
        `is_weekend` (day of week is 5 or 6), `is_rush_hour`,
        `hr_y`/`hr_x` (sine/cosine of the hour on a 24-step cycle) and
        `month_y`/`month_x` (sine/cosine of the month on a 12-step cycle).
    """
    if rush_hours is None:
        rush_hours = preprocess_dict['rush_hours']

    timestamps = pd.to_datetime(df.date + ' ' + df.hr.astype(str) + ":00")

    # Angles on the unit circle, so the last and first values of each cycle
    # end up next to each other.
    hour_angle = 2 * np.pi * df["hr"] / 24
    month_angle = 2 * np.pi * timestamps.dt.month / 12

    features = df.assign(
        is_weekend=timestamps.dt.dayofweek.isin([5, 6]),
        is_rush_hour=df["hr"].isin(rush_hours),
        hr_y=np.sin(hour_angle),
        hr_x=np.cos(hour_angle),
        month_y=np.sin(month_angle),
        month_x=np.cos(month_angle),
    )

    # `datetime` and `month` are no longer created as scratch columns, but
    # input columns with those names are still removed so the returned
    # schema matches what this function has always produced.
    return features.drop(columns=["date", "hr"])\
                   .drop(columns=["datetime", "month"], errors="ignore")

In [107]:
def combine_scooter_vars(df):
    """Replace the guest and registered counts with their sum, `total_scooter`."""
    component_columns = ["guest_scooter", "registered_scooter"]
    combined = df.guest_scooter + df.registered_scooter
    with_total = df.assign(total_scooter=combined)
    return with_total.drop(columns=component_columns)

In [113]:
def preprocess(raw_df, as_corr_heatmap=True):
    """Run the full preprocessing pipeline on the raw frame.

    The steps run in this order: drop duplicate (date, hr) rows, correct
    weather typos, remove negative scooter counts, remove anomalous
    temperatures, encode weather ordinally, create time features, and
    combine the scooter counts.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Raw input frame. It is passed through the helpers via `.pipe`.
    as_corr_heatmap : bool, default True
        When True (the previous and default behaviour), return the
        correlation matrix of the preprocessed frame as a Styler with a
        "RdYlGn" background gradient. When False, return the preprocessed
        DataFrame itself.

    Returns
    -------
    pandas.io.formats.style.Styler or pandas.DataFrame
        See `as_corr_heatmap`.
    """
    pipeline_steps = (
        drop_duplicates,
        correct_weather_typos,
        remove_negative_scooters,
        remove_anomalous_temps,
        encode_weather_ordinal,
        create_time_features,
        combine_scooter_vars,
    )

    processed = raw_df
    for step in pipeline_steps:
        processed = processed.pipe(step)

    if not as_corr_heatmap:
        return processed
    return processed.corr().style.background_gradient(cmap="RdYlGn")