##### Feature Engineering - Modifying, deleting, or combining existing raw features to create new features.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Read the daily births CSV, parsing the first column as datetimes, and keep
# that freshly loaded frame untouched as the raw reference.
df_raw = pd.read_csv('daily-total-female-births-CA.csv', parse_dates=[0])
# Work on an independent copy so the engineered columns never touch df_raw.
df = df_raw.copy()

In [16]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,date,births
0,1959-01-01,35
1,1959-01-02,32
2,1959-01-03,30
3,1959-01-04,31
4,1959-01-05,44


In [17]:
# Print column dtypes and non-null counts (checks that 'date' was parsed as a datetime).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
date      365 non-null datetime64[ns]
births    365 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 5.8 KB


In [21]:
# Date-part features: split the parsed timestamp into calendar components
# using the .dt accessor (columns are created in year, month, day order).
df['year'], df['month'], df['day'] = (
    df['date'].dt.year,
    df['date'].dt.month,
    df['date'].dt.day,
)

In [23]:
# Preview the first five rows to check the new year/month/day columns.
df.head(n=5)

Unnamed: 0,date,births,year,month,day
0,1959-01-01,35,1959,1,1
1,1959-01-02,32,1959,1,2
2,1959-01-03,30,1959,1,3
3,1959-01-04,31,1959,1,4
4,1959-01-05,44,1959,1,5


For more details: https://pandas.pydata.org/pandas-docs/stable/reference/series.html#datetimelike-properties

#### Lag Features

- Lag Features: Values at previous time steps. While forecasting this month's sales, it makes sense to consider last month's sales too. (For daily data: the same day of the previous week, i.e. a 7-day lag.)

In [27]:
# Lag features: the value observed a fixed number of rows (days) earlier.
# shift(k) moves the series down by k rows, so the first k rows become NaN.

# lag1: births on the previous day.
df['lag1'] = df['births'].shift(1)

# lag2: births on the same weekday one week earlier (7-day lag).
# A 365-step shift is not usable here: the frame has exactly 365 rows, so
# shifting by 365 would leave every value in the column NaN.
df['lag2'] = df['births'].shift(7)

In [28]:
# Preview the first five rows to check the lag columns (leading rows are NaN).
df.head(n=5)

Unnamed: 0,date,births,year,month,day,lag1,lag2
0,1959-01-01,35,1959,1,1,,
1,1959-01-02,32,1959,1,2,35.0,
2,1959-01-03,30,1959,1,3,32.0,
3,1959-01-04,31,1959,1,4,30.0,
4,1959-01-05,44,1959,1,5,31.0,


For more details: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.shift.html#pandas.Series.shift

#### Window Features

- Window Features: Average values over a set of past time periods (e.g., the previous 7 days' average, or the same day of the week).


- Rolling Window: for every value, the window covers its previous seven days. 
    * for the 10th - average from the 3rd to the 9th, 
    * for the 11th - average from the 4th to the 10th.
    

- Expanding Window: The start date of the window remains the same for every average; only the end moves forward. 
  * for the 10th - average from the 3rd to the 9th.
  * for the 11th - average from the 3rd to the 10th.

a) Rolling Window

In [35]:
# Rolling mean over a window of 2 rows. The window ends at, and includes, the
# current row, so the first row has no value (NaN).
# NOTE(review): to average strictly past values only, shift the series by one
# step before rolling.
df['Roll_mean'] = df['births'].rolling(2).mean()

In [36]:
# Preview the first five rows to check the rolling-mean column.
df.head(n=5)

Unnamed: 0,date,births,year,month,day,lag1,lag2,Roll_mean
0,1959-01-01,35,1959,1,1,,,
1,1959-01-02,32,1959,1,2,35.0,,33.5
2,1959-01-03,30,1959,1,3,32.0,,31.0
3,1959-01-04,31,1959,1,4,30.0,,30.5
4,1959-01-05,44,1959,1,5,31.0,,37.5


In [37]:
# Rolling maximum over a window of 3 rows. The window ends at, and includes,
# the current row, so the first two rows have no value (NaN).
df['Roll_max'] = df['births'].rolling(3).max()

In [38]:
# Preview the first five rows to check the rolling-max column.
df.head(n=5)

Unnamed: 0,date,births,year,month,day,lag1,lag2,Roll_mean,Roll_max
0,1959-01-01,35,1959,1,1,,,,
1,1959-01-02,32,1959,1,2,35.0,,33.5,
2,1959-01-03,30,1959,1,3,32.0,,31.0,35.0
3,1959-01-04,31,1959,1,4,30.0,,30.5,32.0
4,1959-01-05,44,1959,1,5,31.0,,37.5,44.0



For more details: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.rolling.html#pandas.Series.rolling

b) Expanding Window

In [39]:
# Expanding maximum: the largest value seen from the first row up to and
# including the current row (a value is produced from the very first row).
df['Expand_max'] = df['births'].expanding(min_periods=1).max()

In [40]:
# Preview the first five rows to check the expanding-max column.
df.head(n=5)

Unnamed: 0,date,births,year,month,day,lag1,lag2,Roll_mean,Roll_max,Expand_max
0,1959-01-01,35,1959,1,1,,,,,35.0
1,1959-01-02,32,1959,1,2,35.0,,33.5,,35.0
2,1959-01-03,30,1959,1,3,32.0,,31.0,35.0,35.0
3,1959-01-04,31,1959,1,4,30.0,,30.5,32.0,35.0
4,1959-01-05,44,1959,1,5,31.0,,37.5,44.0,44.0


In [41]:
# Expanding mean: the average of all values from the first row up to and
# including the current row (a value is produced from the very first row).
df['Expand_avg'] = df['births'].expanding(min_periods=1).mean()

In [42]:
# Preview the first five rows to check the expanding-mean column.
df.head(n=5)

Unnamed: 0,date,births,year,month,day,lag1,lag2,Roll_mean,Roll_max,Expand_max,Expand_avg
0,1959-01-01,35,1959,1,1,,,,,35.0,35.0
1,1959-01-02,32,1959,1,2,35.0,,33.5,,35.0,33.5
2,1959-01-03,30,1959,1,3,32.0,,31.0,35.0,35.0,32.333333
3,1959-01-04,31,1959,1,4,30.0,,30.5,32.0,35.0,32.0
4,1959-01-05,44,1959,1,5,31.0,,37.5,44.0,44.0,34.4


For more details: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.expanding.html#pandas.Series.expanding