# Feature Engineering
---
Here we make the adjustments required so the data can be used in a model. We:
- Extract features from date.
- Turn Yes/No features into binary.
- Create dummy variables for categorical features.

### Import Packages

In [1]:
import pandas as pd
import numpy as np

import calendar

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

import functions as f

### Settings

In [2]:
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for this notebook.
pd.set_option('mode.chained_assignment', None)

### Load Data

In [3]:
# Load the three data splits for part '02' through the project helper module `f`.
# NOTE(review): the unpack order (train, test, val) must match the order returned by
# `load_split_datasets` — confirm against the helper.
train, test, val = f.load_split_datasets(part='02')

In [4]:
# (rows, columns) of each split, in train / test / val order.
tuple(split.shape for split in (train, test, val))

((5256, 14), (1752, 14), (1752, 14))

### Dealing with other categorical features

- `holiday`: map to a binary feature
- `functioning_day`: map to a binary feature
- `seasons`: Create dummy columns for each season. Remove one season for linear independence.

### Turn `holiday` and `functioning_day` into a binary feature

In [5]:
def cols_to_binary(df: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 indicator columns derived from `holiday` and `functioning_day`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns `holiday` and `functioning_day`.

    Returns
    -------
    pd.DataFrame
        A new frame with two additional integer columns:
        `bin_holiday` is 1 where `holiday == 'Holiday'` and 0 otherwise;
        `bin_functioning_day` is 1 where `functioning_day == 'Yes'` and 0 otherwise.
        The frame passed in is not modified.
    """
    # `assign` returns a new frame, so the caller's data is never altered as a side
    # effect and re-running a cell always starts from the same input.
    return df.assign(
        bin_holiday=df['holiday'].eq('Holiday').astype(int),
        bin_functioning_day=df['functioning_day'].eq('Yes').astype(int),
    )

### Create Features from Date

In [6]:
def extract_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive `weekday` and `month` name columns from `date` and make `hour` categorical.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a datetime-typed `date` column (it is read through the
        `.dt` accessor) and an `hour` column.

    Returns
    -------
    pd.DataFrame
        A new frame with:
        - `weekday`: English day name of `date` (e.g. 'Monday'),
        - `month`: English month name of `date` (e.g. 'January'),
        - `hour`: the same values cast to `object` dtype.
        The frame passed in is not modified.
    """
    dates = df['date'].dt
    return df.assign(
        # Vectorised pandas accessors replace the per-row `calendar` lookups and
        # yield a missing value instead of raising when a date is NaT.
        weekday=dates.day_name(),
        month=dates.month_name(),
        # Cast to object so `pd.get_dummies` later treats hour as a category.
        # `assign` replaces the column outright, so the dtype change does not depend
        # on how in-place `.loc[:, col] = ...` setting handles the existing dtype.
        hour=df['hour'].astype(object),
    )

### Drop extra features

In [7]:
def drop_extra_features(df: pd.DataFrame) -> pd.DataFrame:
    """Remove the columns that are not passed on to the model.

    Drops `date`, `dew_point_temperature`, `functioning_day` and `holiday`.
    NOTE(review): `date`, `functioning_day` and `holiday` appear to be dropped because
    derived columns replace them earlier in the pipeline; the reason for dropping
    `dew_point_temperature` is not visible here — confirm.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing all four columns listed above (a missing one raises
        `KeyError`, as `DataFrame.drop` does by default).

    Returns
    -------
    pd.DataFrame
        A new frame without those columns. The frame passed in is not modified.
    """
    # No `inplace=True`: the caller's frame keeps its columns, so the step can be
    # re-run on the same input without failing on already-removed columns.
    return df.drop(columns=['date', 'dew_point_temperature', 'functioning_day', 'holiday'])

### Create dummy features for categorical columns (`seasons`, `hour`, `weekday`, `month`)

In [8]:
def create_dummy_features(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the categorical columns of `df`.

    Delegates to `pd.get_dummies` with its default column selection. The first
    level of every encoded column is dropped (`drop_first=True`) so the resulting
    indicator columns stay linearly independent.

    Returns a new frame; the frame passed in is not modified.
    """
    return pd.get_dummies(df, drop_first=True)

### Make Pipeline

In [9]:
# Step order matters: the binary and date steps read `holiday`, `functioning_day`
# and `date`, which the drop step then removes; dummy encoding runs last so it also
# covers the `weekday`, `month` and `hour` columns produced by the date step.
pipeline = Pipeline(
    steps=[
        ("make_cols_binary", FunctionTransformer(func=cols_to_binary)),
        ("extract_date_features", FunctionTransformer(func=extract_date_features)),
        ("drop_extra_features", FunctionTransformer(func=drop_extra_features)),
        ("create_dummy_features", FunctionTransformer(func=create_dummy_features)),
    ]
)

In [10]:
# Fit on the training split before transforming. Every step is a stateless
# FunctionTransformer, so fitting learns nothing from the data, but scikit-learn
# expects a Pipeline to be fitted before `transform` is called (recent releases warn
# about, and are slated to reject, an unfitted Pipeline).
# NOTE(review): `pd.get_dummies` derives the dummy columns separately for each split;
# confirm that train, val and test end up with identical columns.
train = pipeline.fit_transform(train)
val = pipeline.transform(val)
test = pipeline.transform(test)

### Check data

In [11]:
# Inspect the column names of the transformed training split.
train.columns

Index(['rented_bike_count', 'temperature', 'humidity', 'wind_speed',
       'visibility', 'solar_radiation', 'rainfall', 'snowfall', 'bin_holiday',
       'bin_functioning_day', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5',
       'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'seasons_Spring',
       'seasons_Summer', 'seasons_Winter', 'weekday_Monday',
       'weekday_Saturday', 'weekday_Sunday', 'weekday_Thursday',
       'weekday_Tuesday', 'weekday_Wednesday', 'month_August',
       'month_December', 'month_February', 'month_January', 'month_July',
       'month_June', 'month_March', 'month_May', 'month_November',
       'month_October', 'month_September'],
      dtype='object')

### Save csv

In [12]:
# Bundle the transformed splits under their split names and hand them to the
# project helper, which stores them as part '03'.
datasets = dict(train=train, val=val, test=test)

f.save_split_datasets(datasets=datasets, part='03')