# Feature Engineering

In [149]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

import joblib

import params as p

In [150]:
train = joblib.load(p.TTV_DATA_PATH + f'02_{p.DATA_SAVE_NAME}_train.pkl')
test = joblib.load(p.TTV_DATA_PATH + f'02_{p.DATA_SAVE_NAME}_test.pkl')
val = joblib.load(p.TTV_DATA_PATH + f'02_{p.DATA_SAVE_NAME}_val.pkl')

In [151]:
train.shape, test.shape, val.shape

((5256, 14), (1752, 14), (1752, 14))

### Filter funcitoning day

In [152]:
def filter_functioning_day(df: pd.DataFrame):

    df = df[df.loc[:, 'functioning_day'] == 'Yes']

    return df

### Dealing with other categorical features

`holiday`: map to a binary features

`seasons`: Create dummy columns for each season. Remove one season for linear independence.

### Turn `holiday` into a binary feature

In [153]:
def holiday_to_binary(df: pd.DataFrame):

    df.loc[:, 'bin_holiday'] = np.where(df.loc[:, 'holiday'] == 'Holiday', 1, 0)

    return df

### Create Features from Date

In [154]:
def extract_date_features(df: pd.DataFrame):

    df.loc[:, 'date_weekday'] = df.loc[:, 'date'].dt.weekday
    df.loc[:, 'date_month'] = df.loc[:, 'date'].dt.month

    return df

### Drop extra features

In [155]:
def drop_extra_features(df: pd.DataFrame):

    df.drop(columns=['date', 'dew_point_temperature', 'functioning_day', 'holiday'], inplace= True)

    return df

### Create dummy features for `seasons`

In [156]:
def create_dummy_features(df: pd.DataFrame):

    df = pd.get_dummies(df)

    # Drop to maintain linear independence
    df.drop(columns = ['seasons_Winter'], inplace=True)

    return df

### Make Pipeline

In [157]:
pipeline = Pipeline(
    [
        ("filter_functioning_day", FunctionTransformer(filter_functioning_day)),
        ("make_holiday_binary", FunctionTransformer(holiday_to_binary)),
        ("extract_date_features", FunctionTransformer(extract_date_features)),
        ("drop_extra_features", FunctionTransformer(drop_extra_features)),
        ("create_dummy_features", FunctionTransformer(create_dummy_features)),
    ]
)

In [158]:
train = pipeline.transform(train)
test = pipeline.transform(test)
val = pipeline.transform(val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Check data

In [161]:
train.head()

Unnamed: 0,rented_bike_count,hour,temperature,humidity,wind_speed,visibility,solar_radiation,rainfall,snowfall,bin_holiday,date_weekday,date_month,seasons_Autumn,seasons_Spring,seasons_Summer
3919,220,7,13.7,98,1.6,468,0.1,0.0,0.0,0,6,5,0,1,0
6606,249,6,20.2,66,1.0,2000,0.0,0.0,0.0,0,4,2,1,0,0
2567,422,23,6.4,59,1.1,961,0.0,0.0,0.0,0,5,3,0,1,0
7618,847,10,13.3,47,0.5,818,1.67,0.0,0.0,0,6,10,1,0,0
3715,1890,19,15.5,46,2.6,1782,0.34,0.0,0.0,0,3,4,0,1,0


### save csv

In [162]:
joblib.dump(train, p.TTV_DATA_PATH + f'03_{p.DATA_SAVE_NAME}_train.pkl')
joblib.dump(test, p.TTV_DATA_PATH + f'03_{p.DATA_SAVE_NAME}_test.pkl')
joblib.dump(val, p.TTV_DATA_PATH + f'03_{p.DATA_SAVE_NAME}_val.pkl')

['./data/ttv_data/03_data_val.pkl']