# Feature Engineering

In [39]:
import pandas as pd
import numpy as np

import calendar

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

import functions as f

### Settings

In [40]:
# Turn off pandas' chained-assignment check for the whole notebook, so
# SettingWithCopyWarning is neither shown nor raised by the cells below.
pd.set_option("mode.chained_assignment", None)

### Analysis

In [41]:
# Load the three data splits for part '02' through the project helper module `f`.
# NOTE(review): presumably these are the frames saved by the previous notebook —
# confirm against functions.py.
train, test, val = f.load_split_datasets(part='02')

In [42]:
# (rows, columns) of each split, in train / test / val order.
tuple(split.shape for split in (train, test, val))

((5256, 14), (1752, 14), (1752, 14))

### Filter functioning day

In [43]:
def filter_functioning_day(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows recorded on a functioning day.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``functioning_day`` column; rows whose value is exactly
        ``'Yes'`` are kept.

    Returns
    -------
    pd.DataFrame
        A new, independent frame holding the matching rows. The original index
        labels are preserved and ``df`` itself is left untouched.

    Raises
    ------
    KeyError
        If ``df`` has no ``functioning_day`` column.
    """
    is_functioning = df['functioning_day'] == 'Yes'

    # Return an explicit copy: later steps add and drop columns on this result,
    # and writing into a bare boolean-mask selection is exactly what triggers
    # pandas' SettingWithCopyWarning.
    return df.loc[is_functioning].copy()

#### Filter for date

**TODO:** revisit this step — no date-based filter is implemented in this notebook yet.

### Dealing with other categorical features

`holiday`: map to a binary feature.

`seasons`: Create dummy columns for each season. Remove one season for linear independence.

### Turn `holiday` into a binary feature

In [44]:
def holiday_to_binary(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 0/1 ``bin_holiday`` column derived from ``holiday``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``holiday`` column.

    Returns
    -------
    pd.DataFrame
        A new frame with every original column plus ``bin_holiday``, which is
        1 where ``holiday`` equals ``'Holiday'`` and 0 for any other value.
        ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``holiday`` column.
    """
    is_holiday = df['holiday'] == 'Holiday'

    # `assign` builds a new frame instead of writing into the caller's one, so
    # the function has no side effects on its input.
    return df.assign(bin_holiday=np.where(is_holiday, 1, 0))

### Create Features from Date

In [45]:
def extract_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``weekday`` and ``month`` name columns derived from ``date``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose ``date`` column has a datetime dtype (the ``.dt`` accessor
        is used on it).

    Returns
    -------
    pd.DataFrame
        A new frame with every original column plus ``weekday`` (e.g.
        ``'Monday'``) and ``month`` (e.g. ``'January'``). ``df`` itself is not
        modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column.
    AttributeError
        If ``date`` is not datetime-like.
    """
    dates = df['date']

    # The vectorised accessors return English names regardless of the system
    # locale and propagate missing dates as missing values instead of raising,
    # unlike indexing `calendar.day_name` / `calendar.month_name` row by row.
    return df.assign(
        weekday=dates.dt.day_name(),
        month=dates.dt.month_name(),
    )

### Drop extra features

In [46]:
def drop_extra_features(df: pd.DataFrame) -> pd.DataFrame:
    """Remove the columns that are not used as model features.

    Drops ``date``, ``dew_point_temperature``, ``functioning_day`` and
    ``holiday``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing all four columns listed above.

    Returns
    -------
    pd.DataFrame
        A new frame without those columns. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If any of the four columns is missing from ``df``.
    """
    unused_columns = ['date', 'dew_point_temperature', 'functioning_day', 'holiday']

    # Non-inplace drop: the caller's frame keeps its columns, so the function
    # can be re-applied to the same input without failing.
    return df.drop(columns=unused_columns)

### Create dummy features for `seasons`, `weekday` and `month`

In [47]:
def create_dummy_features(df: pd.DataFrame):
    """One-hot encode the categorical columns of ``df``.

    Delegates to ``pd.get_dummies`` without an explicit column list, so pandas
    decides which columns are categorical. ``drop_first=True`` omits the first
    level of every encoded column, which keeps the resulting dummy columns
    linearly independent.

    Note that the levels are taken from the frame that is passed in: two frames
    that do not contain the same categories produce different output columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to encode.

    Returns
    -------
    pd.DataFrame
        New frame in which each encoded column is replaced by its dummy columns.
    """
    encoded = pd.get_dummies(data=df, drop_first=True)
    return encoded

### Make Pipeline

In [48]:
# Feature-engineering pipeline: each step wraps one of the plain functions
# defined above in a FunctionTransformer. Order matters — the steps that read
# `functioning_day`, `holiday` and `date` must run before the step that drops
# those columns, and dummy encoding runs last so it sees the new name columns.
pipeline = Pipeline(
    steps=[
        (step_name, FunctionTransformer(step_func))
        for step_name, step_func in (
            ("filter_functioning_day", filter_functioning_day),
            ("make_holiday_binary", holiday_to_binary),
            ("extract_date_features", extract_date_features),
            ("drop_extra_features", drop_extra_features),
            ("create_dummy_features", create_dummy_features),
        )
    ]
)

In [49]:
# Transform all three splits first and rebind the names together. If any split
# fails, none of the names has been overwritten, so the cell can simply be
# re-run after the fix instead of tripping over an already-transformed `train`.
train, test, val = [pipeline.transform(split) for split in (train, test, val)]

### Check data

In [52]:
# Each split was one-hot encoded on its own, so a category missing from one
# split would silently give that split a different set of columns. Fail fast
# here rather than at modelling time.
assert list(train.columns) == list(test.columns) == list(val.columns), (
    "train/test/val have different columns after feature engineering"
)

train.columns

Index(['rented_bike_count', 'hour', 'temperature', 'humidity', 'wind_speed',
       'visibility', 'solar_radiation', 'rainfall', 'snowfall', 'bin_holiday',
       'seasons_Spring', 'seasons_Summer', 'seasons_Winter', 'weekday_Monday',
       'weekday_Saturday', 'weekday_Sunday', 'weekday_Thursday',
       'weekday_Tuesday', 'weekday_Wednesday', 'month_August',
       'month_December', 'month_February', 'month_January', 'month_July',
       'month_June', 'month_March', 'month_May', 'month_November',
       'month_October', 'month_September'],
      dtype='object')

### Save CSV

In [53]:
# Bundle the engineered splits under their split names and hand them to the
# project helper, tagged as part '03'.
datasets = dict(train=train, test=test, val=val)

f.save_split_datasets(datasets=datasets, part='03')