<a href="https://colab.research.google.com/github/lennart194/thesis-code/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tsfresh



In [None]:
from google.colab import drive

# Make Google Drive available under /content/drive; the CSV files read
# below are loaded from this mount point.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

from tsfresh.feature_extraction import extract_features, MinimalFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh import select_features

In [None]:
# Load the three prepared tables from the mounted Drive folder.
# The generator yields one DataFrame per path, in the order of the names
# on the left-hand side.
final_set, final_set_X, final_set_Y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/M5/final_set.csv',
        '/content/drive/MyDrive/M5/final_set_X.csv',
        '/content/drive/MyDrive/M5/final_set_Y.csv',
    )
)

## Encoding of the 'item_id' column



In [None]:
final_set_X['item_id'] = final_set_X['item_id'].astype(object)

# Fit ONE encoder on the item ids of both frames and reuse it for both.
# Fitting separately per frame (fit_transform twice) only yields matching
# integer codes when both frames contain exactly the same set of items;
# fitting on the union guarantees that an item gets the same code in
# final_set_X and final_set.
labenc = LabelEncoder()
labenc.fit(pd.concat([final_set_X['item_id'], final_set['item_id']], ignore_index=True))
final_set_X['item_id'] = labenc.transform(final_set_X['item_id'])
final_set['item_id'] = labenc.transform(final_set['item_id'])

# week window

## Create a new dataset 'lag_demand_set_week' for week features



In [None]:
# Work on an explicit copy of the three needed columns, so that adding
# columns to this frame never writes into a slice of final_set (which makes
# pandas emit a SettingWithCopyWarning) and final_set itself stays untouched.
lag_demand_set_week = final_set[['date', 'item_id', 'demand']].copy()

## Create demand features for the seven-day window


*   Create a rolling demand data set with a maximum time shift of seven days (df_rolled_week)
*   Extract the features defined by MinimalFCParameters from df_rolled_week (extracted_week)





In [None]:
# Cut the demand history of every item into rolling sub-series, ordered by date.
# NOTE(review): with max_timeshift=7 tsfresh appears to build windows of up to
# 8 rows (the current row plus 7 shifts) - confirm this is the intended
# "seven day" window.
df_rolled_week = roll_time_series(
    lag_demand_set_week,
    column_id="item_id",
    column_sort="date",
    max_timeshift=7,
)

# Compute the minimal tsfresh feature set of 'demand' for each rolled window;
# roll_time_series stores the window identifier in the 'id' column.
extracted_week = extract_features(
    df_rolled_week,
    column_id='id',
    column_sort='date',
    column_value='demand',
    default_fc_parameters=MinimalFCParameters(),
)

Rolling: 100%|██████████| 1941/1941 [03:03<00:00, 10.59it/s]
Feature Extraction: 100%|██████████| 97029/97029 [01:31<00:00, 1056.54it/s]


## Re-index the extracted features and lag_demand_set_week (both need the same index for filtering)



*   set the index of extracted_week to a combination of date and item




In [None]:
# Replace the two-level index produced by the feature extraction with a
# single string key "<date>_<item_id>".
extracted_week = (
    extracted_week
    .reset_index()  # index levels become the columns 'level_0' and 'level_1'
    .rename(columns={'level_0': 'item_id', 'level_1': 'date'})
    .assign(
        date_item=lambda frame: frame['date'].astype(str) + '_' + frame['item_id'].astype(str)
    )
    .drop(columns=['item_id', 'date'])
    .set_index('date_item')
)



*   set the index of lag_demand_set_week to a combination of date and item
*   convert it into a pandas series



In [None]:
# Build the "<date>_<item_id>" key in the same format as the index of
# extracted_week, so features and target can be aligned row by row.
date_item_key = (
    lag_demand_set_week['date'].astype(str)
    + '_'
    + lag_demand_set_week['item_id'].astype(str)
)

# .assign returns a new frame instead of writing a column into the existing
# one, which avoids pandas' SettingWithCopyWarning. The index is named
# 'date_item' (previously misspelled 'data_item') to match extracted_week.
# Selecting ['demand'] always yields a Series, whereas .squeeze() would
# collapse a one-row frame to a scalar.
lag_demand_set_week = (
    lag_demand_set_week
    .assign(date_item=date_item_key)
    .set_index('date_item')['demand']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## filter relevant features for week

*   use the select_features function



In [None]:
# Keep only the extracted features that tsfresh judges relevant for the
# demand target; X and y are matched through their shared date/item index.
features_filtered_week = select_features(
    X=extracted_week,
    y=lag_demand_set_week,
)



*   show the correlation of the filtered features with demand and choose the features to keep




In [None]:
# Turn the demand target into a one-column DataFrame. pd.DataFrame(...)
# accepts both a Series and a DataFrame, so this cell can be re-run safely;
# Series.to_frame() would raise AttributeError on the second run because the
# variable is already a DataFrame by then.
lag_demand_set_week = pd.DataFrame(lag_demand_set_week)

# Attach the target next to the selected features (aligned on the shared
# index) and display the pairwise correlation matrix.
features_filtered_week['demand'] = lag_demand_set_week['demand']
features_filtered_week.corr()

Unnamed: 0,demand__sum_values,demand__median,demand__mean,demand__standard_deviation,demand__variance,demand__root_mean_square,demand__maximum,demand__absolute_maximum,demand__minimum,demand__length,demand
demand__sum_values,1.0,0.99182,0.999047,0.774949,0.572738,0.996405,0.962473,0.962473,0.928868,0.036975,0.898358
demand__median,0.99182,1.0,0.99286,0.742468,0.542326,0.986624,0.938297,0.938297,0.919202,0.003559,0.889607
demand__mean,0.999047,0.99286,1.0,0.774133,0.572361,0.997264,0.962695,0.962695,0.93074,0.005234,0.899181
demand__standard_deviation,0.774949,0.742468,0.774133,1.0,0.849141,0.816115,0.898944,0.898944,0.538308,0.034505,0.698104
demand__variance,0.572738,0.542326,0.572361,0.849141,1.0,0.617148,0.702553,0.702553,0.350379,0.013201,0.508908
demand__root_mean_square,0.996405,0.986624,0.997264,0.816115,0.617148,1.0,0.977567,0.977567,0.910196,0.007276,0.897756
demand__maximum,0.962473,0.938297,0.962695,0.898944,0.702553,0.977567,1.0,1.0,0.843194,0.018906,0.86894
demand__absolute_maximum,0.962473,0.938297,0.962695,0.898944,0.702553,0.977567,1.0,1.0,0.843194,0.018906,0.86894
demand__minimum,0.928868,0.919202,0.93074,0.538308,0.350379,0.910196,0.843194,0.843194,1.0,-0.01454,0.844417
demand__length,0.036975,0.003559,0.005234,0.034505,0.013201,0.007276,0.018906,0.018906,-0.01454,1.0,0.006552


# month window