<a href="https://colab.research.google.com/github/lennart194/thesis-code/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install tsfresh



In [2]:
from google.colab import drive

# Mount Google Drive; the CSV files read below live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

from tsfresh.feature_extraction import extract_features, MinimalFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series

In [4]:
# Load the three input tables from Google Drive:
# the full set, its feature part (X) and its target part (Y).
final_set, final_set_X, final_set_Y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/M5/final_set.csv',
        '/content/drive/MyDrive/M5/final_set_X.csv',
        '/content/drive/MyDrive/M5/final_set_Y.csv',
    )
)

## Encoding of the 'item_id' column



In [5]:
# Encode 'item_id' as integer codes in both frames.
#
# The encoder is fitted ONCE on the item ids of both frames and then only
# applied with transform(). Re-fitting it per frame (fit_transform twice) would
# yield two independent mappings that agree only if both frames contain exactly
# the same set of item ids, and labenc.classes_ would describe the last fit only.
all_item_ids = pd.concat(
    [final_set['item_id'], final_set_X['item_id']],
    ignore_index=True,
).astype(object)

labenc = LabelEncoder()
labenc.fit(all_item_ids)

final_set_X['item_id'] = labenc.transform(final_set_X['item_id'].astype(object))
final_set['item_id'] = labenc.transform(final_set['item_id'].astype(object))

# week window

## Create a new dataset 'lag_demand_set_week' for week features



In [7]:
# Only the date, the item and its demand are needed for the week-window features.
week_input_columns = ['date', 'item_id', 'demand']
lag_demand_set_week = final_set.loc[:, week_input_columns]

## Create demand features for seven_day window


*   Create a rolling demand data set with seven-day windows (df_rolled_week)
*   Extract the MinimalFCParameters features for df_rolled_week (extracted_week)





In [67]:
# Roll the demand series per item with a fixed timeshift of 6
# (min_timeshift == max_timeshift, so every window has the same length).
# NOTE(review): each window presumably ends at - and includes - its own date, so
# the features would contain the same-day demand that is later used as the
# target; confirm whether a shift is needed to avoid leakage.
week_timeshift = 6
df_rolled_week = roll_time_series(
    lag_demand_set_week,
    column_id="item_id",
    column_sort="date",
    min_timeshift=week_timeshift,
    max_timeshift=week_timeshift,
)

# Compute the MinimalFCParameters feature set of 'demand' for every window.
extracted_week = extract_features(
    df_rolled_week,
    column_id='id',
    column_sort='date',
    column_value='demand',
    default_fc_parameters=MinimalFCParameters(),
)

Rolling: 100%|██████████| 1941/1941 [02:34<00:00, 12.58it/s]
Feature Extraction: 100%|██████████| 96729/96729 [01:29<00:00, 1075.63it/s]




*   reset the index and change the names back to 'date' and 'item_id'
*   sort by those two columns



In [68]:
# Turn the index into regular columns ('level_0' / 'level_1' after the reset),
# give them their original names back and order the rows by date, then item.
extracted_week = (
    extracted_week
    .reset_index()
    .rename(columns={'level_0':'item_id', 'level_1':'date'})
    .sort_values(by=['date', 'item_id'])
)



*   add '_week' to all columns except for 'item_id' and 'date'
*   drop 'demand__length_week'



In [74]:
# Tag every column with the window name ...
extracted_week.columns = [f"{name}_week" for name in extracted_week.columns]

# ... then restore the two key columns and discard the window-length feature.
extracted_week = (
    extracted_week
    .rename(columns={'item_id_week':'item_id', 'date_week':'date'})
    .drop(columns=['demand__length_week'])
)

# twoweek window (analogous to the week window)

In [25]:
# Same three input columns as for the week window, taken again from final_set.
twoweek_input_columns = ['date', 'item_id', 'demand']
lag_demand_set_twoweek = final_set[twoweek_input_columns]

In [33]:
# Roll the demand series per item with a fixed timeshift of 13
# (min_timeshift == max_timeshift, so every window has the same length).
# NOTE(review): as with the week window, each window presumably includes the
# demand of its own date; confirm this does not leak the target.
twoweek_timeshift = 13
df_rolled_twoweek = roll_time_series(
    lag_demand_set_twoweek,
    column_id="item_id",
    column_sort="date",
    min_timeshift=twoweek_timeshift,
    max_timeshift=twoweek_timeshift,
)

# Compute the MinimalFCParameters feature set of 'demand' for every window.
extracted_twoweek = extract_features(
    df_rolled_twoweek,
    column_id='id',
    column_sort='date',
    column_value='demand',
    default_fc_parameters=MinimalFCParameters(),
)

Rolling: 100%|██████████| 1941/1941 [02:41<00:00, 12.02it/s]
Feature Extraction: 100%|██████████| 96379/96379 [01:28<00:00, 1084.81it/s]


In [34]:
# Move the index into columns, rename 'level_0' / 'level_1' to the key names
# and sort the rows by date and item.
twoweek_key_names = {'level_0':'item_id', 'level_1':'date'}
extracted_twoweek = (
    extracted_twoweek
    .reset_index()
    .rename(columns=twoweek_key_names)
    .sort_values(by=['date', 'item_id'])
)

In [57]:
# Append the window name to every column label (key columns included).
extracted_twoweek.columns = [f"{name}_twoweek" for name in extracted_twoweek.columns]

In [64]:
# Restore the names of the two key columns and discard the window-length feature.
extracted_twoweek = (
    extracted_twoweek
    .rename(columns={'item_id_twoweek':'item_id', 'date_twoweek':'date'})
    .drop(columns=['demand__length_twoweek'])
)

## Merging





*   Merge the 18 window features
*   Merge with the whole final_set (inner join, so the demand values of rows without window features are dropped, too)





In [76]:
# Both joins match rows on the same key pair and keep only keys present on both sides.
merge_keys = ['date', 'item_id']

# First combine the two feature tables, then attach them to the full data set.
extracted_final = pd.merge(extracted_twoweek, extracted_week, on=merge_keys, how='inner')
ultimative_set = pd.merge(final_set, extracted_final, on=merge_keys, how='inner')



*   create new ultimative sets



In [85]:
# Target: a one-column frame holding the demand.
ultimative_set_Y = ultimative_set.loc[:, ['demand']]

# Features: everything except the demand, indexed by date.
features_without_target = ultimative_set.drop(columns='demand')
ultimative_set_X = features_without_target.set_index('date')



*   save the sets as csv



In [87]:
# Write the combined set and its Y / X parts to Google Drive, using the
# to_csv defaults (the index is written as well).
for frame, csv_path in (
    (ultimative_set, '/content/drive/MyDrive/M5/ultimative_set.csv'),
    (ultimative_set_Y, '/content/drive/MyDrive/M5/ultimative_set_Y.csv'),
    (ultimative_set_X, '/content/drive/MyDrive/M5/ultimative_set_X.csv'),
):
    frame.to_csv(csv_path)