In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime
import sklearn
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the four raw CSV inputs in one pass; the order of the paths matches the
# order of the names being assigned.
calendar, sales_train_val, sales_train_eval, sell_prices = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/calendar.csv',
        '../data/sales_train_validation.csv',
        '../data/sales_train_evaluation.csv',
        '../data/sell_prices.csv',
    )
)

In [3]:
# Warning: only the first 10 distinct item_ids (in order of appearance in the
# evaluation file) are kept, to make the rest of the notebook quick to iterate on.
first_item_ids = sales_train_eval['item_id'].unique()[:10]
test_item10 = list(first_item_ids)

# Build both row masks before reassigning either frame.
keep_val_rows = sales_train_val['item_id'].isin(test_item10)
keep_eval_rows = sales_train_eval['item_id'].isin(test_item10)
sales_train_val = sales_train_val[keep_val_rows]
sales_train_eval = sales_train_eval[keep_eval_rows]

# Sanity check: both frames should report the same number of distinct items.
print(f"number of unique type of itemIDs in eval data: {sales_train_eval['item_id'].nunique()}")
print(f"number of unique type of itemIDs in val data: {sales_train_val['item_id'].nunique()}")

number of unique type of itemIDs in eval data: 10
number of unique type of itemIDs in val data: 10


### 01 Preprocessing Data
#### 1.1 Calendar Data
- Convert date to datetime format - Yes
- convert the following variables to numeric: d  - No
- LabelEncoding the following variable: weekday/is_holiday - No
- Create Column "is_holiday" - Yes
- Warning: consider removing event_name_1&2 + event_type_1&2  !!!! - Yes
- Warning: consider label encoding  - No

In [4]:
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
calendar.info()
display(
    calendar.head().T,        # first rows, transposed so every column is visible
    calendar.describe().T,    # summary statistics of the numeric columns
    calendar.isnull().sum(),  # missing-value count per column
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          1969 non-null   object
 1   wm_yr_wk      1969 non-null   int64 
 2   weekday       1969 non-null   object
 3   wday          1969 non-null   int64 
 4   month         1969 non-null   int64 
 5   year          1969 non-null   int64 
 6   d             1969 non-null   object
 7   event_name_1  162 non-null    object
 8   event_type_1  162 non-null    object
 9   event_name_2  5 non-null      object
 10  event_type_2  5 non-null      object
 11  snap_CA       1969 non-null   int64 
 12  snap_TX       1969 non-null   int64 
 13  snap_WI       1969 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 215.5+ KB


None

Unnamed: 0,0,1,2,3,4
date,2011-01-29,2011-01-30,2011-01-31,2011-02-01,2011-02-02
wm_yr_wk,11101,11101,11101,11101,11101
weekday,Saturday,Sunday,Monday,Tuesday,Wednesday
wday,1,2,3,4,5
month,1,1,1,2,2
year,2011,2011,2011,2011,2011
d,d_1,d_2,d_3,d_4,d_5
event_name_1,,,,,
event_type_1,,,,,
event_name_2,,,,,


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
wm_yr_wk,1969.0,11347.086338,155.277043,11101.0,11219.0,11337.0,11502.0,11621.0
wday,1969.0,3.997461,2.001141,1.0,2.0,4.0,6.0,7.0
month,1969.0,6.325546,3.416864,1.0,3.0,6.0,9.0,12.0
year,1969.0,2013.288471,1.580198,2011.0,2012.0,2013.0,2015.0,2016.0
snap_CA,1969.0,0.330117,0.470374,0.0,0.0,0.0,1.0,1.0
snap_TX,1969.0,0.330117,0.470374,0.0,0.0,0.0,1.0,1.0
snap_WI,1969.0,0.330117,0.470374,0.0,0.0,0.0,1.0,1.0


date               0
wm_yr_wk           0
weekday            0
wday               0
month              0
year               0
d                  0
event_name_1    1807
event_type_1    1807
event_name_2    1964
event_type_2    1964
snap_CA            0
snap_TX            0
snap_WI            0
dtype: int64

In [5]:
# Quick look at the categorical calendar columns. Each label names the column
# that is actually printed underneath it.
print("Unique Values of weekday column", calendar['weekday'].unique())
print("________________________________")
print("Count of event_name_1:")
print(calendar["event_name_1"].value_counts())
print("________________________________")
print("Count of event_type_1:")
print(calendar["event_type_1"].value_counts())


Unique Values of weekday column ['Saturday' 'Sunday' 'Monday' 'Tuesday' 'Wednesday' 'Thursday' 'Friday']
________________________________
Count of event_name_1&2:
event_name_1
SuperBowl              6
Pesach End             6
Ramadan starts         6
ValentinesDay          6
NBAFinalsEnd           6
NBAFinalsStart         6
MemorialDay            6
Mother's day           6
Purim End              6
StPatricksDay          6
LentWeek2              6
LentStart              6
PresidentsDay          6
MartinLutherKingDay    5
OrthodoxChristmas      5
EidAlAdha              5
NewYear                5
Chanukah End           5
Christmas              5
Thanksgiving           5
VeteransDay            5
IndependenceDay        5
Halloween              5
ColumbusDay            5
LaborDay               5
Eid al-Fitr            5
Cinco De Mayo          5
OrthodoxEaster         5
Easter                 5
Father's day           4
Name: count, dtype: int64
________________________________
Count of event_

In [6]:
# Convert 'date' column to datetime format
calendar['date'] = pd.to_datetime(calendar['date'])

# Create the variable "is_holiday": 1 when ANY of the four event columns is
# populated for that day, else 0. All four columns are checked; the previous
# expression tested event_name_1 twice and never looked at event_name_2.
event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
calendar['is_holiday'] = calendar[event_cols].notna().any(axis=1).astype(int)

# Drop the raw event columns now that they are summarised by is_holiday.
calendar = calendar.drop(columns=event_cols)

In [7]:
# Re-inspect the calendar after preprocessing.
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
calendar.info()
display(
    calendar.head().T,        # first rows, transposed so every column is visible
    calendar.describe().T,    # summary statistics
    calendar.isnull().sum(),  # missing-value count per column
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        1969 non-null   datetime64[ns]
 1   wm_yr_wk    1969 non-null   int64         
 2   weekday     1969 non-null   object        
 3   wday        1969 non-null   int64         
 4   month       1969 non-null   int64         
 5   year        1969 non-null   int64         
 6   d           1969 non-null   object        
 7   snap_CA     1969 non-null   int64         
 8   snap_TX     1969 non-null   int64         
 9   snap_WI     1969 non-null   int64         
 10  is_holiday  1969 non-null   int64         
dtypes: datetime64[ns](1), int64(8), object(2)
memory usage: 169.3+ KB


None

Unnamed: 0,0,1,2,3,4
date,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,2011-02-01 00:00:00,2011-02-02 00:00:00
wm_yr_wk,11101,11101,11101,11101,11101
weekday,Saturday,Sunday,Monday,Tuesday,Wednesday
wday,1,2,3,4,5
month,1,1,1,2,2
year,2011,2011,2011,2011,2011
d,d_1,d_2,d_3,d_4,d_5
snap_CA,0,0,0,1,1
snap_TX,0,0,0,1,0
snap_WI,0,0,0,0,1


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,1969.0,2013-10-09 00:00:00,2011-01-29 00:00:00,2012-06-04 00:00:00,2013-10-09 00:00:00,2015-02-13 00:00:00,2016-06-19 00:00:00,
wm_yr_wk,1969.0,11347.086338,11101.0,11219.0,11337.0,11502.0,11621.0,155.277043
wday,1969.0,3.997461,1.0,2.0,4.0,6.0,7.0,2.001141
month,1969.0,6.325546,1.0,3.0,6.0,9.0,12.0,3.416864
year,1969.0,2013.288471,2011.0,2012.0,2013.0,2015.0,2016.0,1.580198
snap_CA,1969.0,0.330117,0.0,0.0,0.0,1.0,1.0,0.470374
snap_TX,1969.0,0.330117,0.0,0.0,0.0,1.0,1.0,0.470374
snap_WI,1969.0,0.330117,0.0,0.0,0.0,1.0,1.0,0.470374
is_holiday,1969.0,0.082275,0.0,0.0,0.0,0.0,1.0,0.274853


date          0
wm_yr_wk      0
weekday       0
wday          0
month         0
year          0
d             0
snap_CA       0
snap_TX       0
snap_WI       0
is_holiday    0
dtype: int64

#### 1.2 Sell Prices Data
The store and item IDs together with the sales price of the item as a weekly average.

In [8]:
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
sell_prices.info()
display(
    sell_prices.head(),          # first rows
    sell_prices.describe().T,    # summary statistics of the numeric columns
    sell_prices.isnull().sum(),  # missing-value count per column
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   store_id    object 
 1   item_id     object 
 2   wm_yr_wk    int64  
 3   sell_price  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 208.8+ MB


None

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
wm_yr_wk,6841121.0,11382.943423,148.610026,11101.0,11247.0,11411.0,11517.0,11621.0
sell_price,6841121.0,4.410952,3.408814,0.01,2.18,3.47,5.84,107.32


store_id      0
item_id       0
wm_yr_wk      0
sell_price    0
dtype: int64

#### 1.3 Sales_train_validation
- Note: the maximum wm_yr_wk in the calendar is 11621, i.e. 4 weeks beyond the last week (11617) present in sales_train_evaluation, so the calendar covers the 28 days we are going to predict
- Note: 1941 days in evaluation data, while 1913 days in validation data
- Pivot the dataframe d1 to d1913 to rows - Yes

In [9]:
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
sales_train_val.info()
display(
    "shape", sales_train_val.shape,
    sales_train_val.head(),          # first rows (wide format: one column per day)
    sales_train_val.describe().T,    # per-day summary statistics
    sales_train_val.isnull().sum(),  # missing-value count per column
)

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 27450
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 1.5+ MB


None

'shape'

(100, 1919)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
d_1,100.0,1.08,2.791329,0.0,0.0,0.0,0.25,16.0
d_2,100.0,0.85,2.328458,0.0,0.0,0.0,0.00,15.0
d_3,100.0,0.83,3.629439,0.0,0.0,0.0,0.00,33.0
d_4,100.0,0.82,2.426204,0.0,0.0,0.0,0.00,13.0
d_5,100.0,0.78,5.336325,0.0,0.0,0.0,0.00,53.0
...,...,...,...,...,...,...,...,...
d_1909,100.0,0.78,1.617955,0.0,0.0,0.0,1.00,10.0
d_1910,100.0,1.03,2.235865,0.0,0.0,0.0,1.00,15.0
d_1911,100.0,0.77,1.632096,0.0,0.0,0.0,1.00,12.0
d_1912,100.0,1.30,2.346284,0.0,0.0,1.0,1.25,15.0


id          0
item_id     0
dept_id     0
cat_id      0
store_id    0
           ..
d_1909      0
d_1910      0
d_1911      0
d_1912      0
d_1913      0
Length: 1919, dtype: int64

In [10]:
# Reshape wide -> long: every day column d_1..d_1913 becomes one row per
# (series, day), with the day label in 'd' and the unit count in 'sales'.
val_id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
val_day_cols = sales_train_val.loc[:, 'd_1':'d_1913'].columns  # label slice is inclusive
sales_train_val_T = pd.melt(
    sales_train_val,
    id_vars=val_id_cols,
    value_vars=val_day_cols,
    var_name='d',
    value_name='sales',
)
display(
    sales_train_val_T.head(),
    sales_train_val_T.tail(),
    sales_train_val_T.shape,
)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
191295,HOBBIES_1_006_WI_3_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,WI_3,WI,d_1913,2
191296,HOBBIES_1_007_WI_3_validation,HOBBIES_1_007,HOBBIES_1,HOBBIES,WI_3,WI,d_1913,0
191297,HOBBIES_1_008_WI_3_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,WI_3,WI,d_1913,3
191298,HOBBIES_1_009_WI_3_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,WI_3,WI,d_1913,0
191299,HOBBIES_1_010_WI_3_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,WI_3,WI,d_1913,0


(191300, 8)

#### 1.4 Sales_train_evaluation
- Pivot the dataframe d1 to d1941 to rows - Yes

In [11]:
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
sales_train_eval.info()
display(
    sales_train_eval.head(),          # first rows (wide format: one column per day)
    sales_train_eval.describe().T,    # per-day summary statistics
    sales_train_eval.isnull().sum(),  # missing-value count per column
)

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 27450
Columns: 1947 entries, id to d_1941
dtypes: int64(1941), object(6)
memory usage: 1.5+ MB


None

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
d_1,100.0,1.08,2.791329,0.0,0.0,0.0,0.25,16.0
d_2,100.0,0.85,2.328458,0.0,0.0,0.0,0.00,15.0
d_3,100.0,0.83,3.629439,0.0,0.0,0.0,0.00,33.0
d_4,100.0,0.82,2.426204,0.0,0.0,0.0,0.00,13.0
d_5,100.0,0.78,5.336325,0.0,0.0,0.0,0.00,53.0
...,...,...,...,...,...,...,...,...
d_1937,100.0,0.98,2.169485,0.0,0.0,0.0,1.00,14.0
d_1938,100.0,0.91,2.270118,0.0,0.0,0.0,1.00,18.0
d_1939,100.0,1.07,2.085326,0.0,0.0,0.0,1.00,13.0
d_1940,100.0,1.79,4.684587,0.0,0.0,0.0,2.00,40.0


id          0
item_id     0
dept_id     0
cat_id      0
store_id    0
           ..
d_1937      0
d_1938      0
d_1939      0
d_1940      0
d_1941      0
Length: 1947, dtype: int64

In [12]:
# Reshape wide -> long: every day column d_1..d_1941 becomes one row per
# (series, day), with the day label in 'd' and the unit count in 'sales'.
eval_id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
eval_day_cols = sales_train_eval.loc[:, 'd_1':'d_1941'].columns  # label slice is inclusive
sales_train_eval_T = pd.melt(
    sales_train_eval,
    id_vars=eval_id_cols,
    value_vars=eval_day_cols,
    var_name='d',
    value_name='sales',
)
display(
    sales_train_eval_T.head(),
    sales_train_eval_T.tail(),
    sales_train_eval_T.shape,
)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
194095,HOBBIES_1_006_WI_3_evaluation,HOBBIES_1_006,HOBBIES_1,HOBBIES,WI_3,WI,d_1941,0
194096,HOBBIES_1_007_WI_3_evaluation,HOBBIES_1_007,HOBBIES_1,HOBBIES,WI_3,WI,d_1941,1
194097,HOBBIES_1_008_WI_3_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,WI_3,WI,d_1941,0
194098,HOBBIES_1_009_WI_3_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,WI_3,WI,d_1941,1
194099,HOBBIES_1_010_WI_3_evaluation,HOBBIES_1_010,HOBBIES_1,HOBBIES,WI_3,WI,d_1941,1


(194100, 8)

In [13]:
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
sales_train_eval_T.info()
display(
    sales_train_eval_T.head(),          # first rows of the long-format frame
    sales_train_eval_T.describe().T,    # summary statistics of 'sales'
    sales_train_eval_T.isnull().sum(),  # missing-value count per column
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194100 entries, 0 to 194099
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        194100 non-null  object
 1   item_id   194100 non-null  object
 2   dept_id   194100 non-null  object
 3   cat_id    194100 non-null  object
 4   store_id  194100 non-null  object
 5   state_id  194100 non-null  object
 6   d         194100 non-null  object
 7   sales     194100 non-null  int64 
dtypes: int64(1), object(7)
memory usage: 11.8+ MB


None

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sales,194100.0,1.022009,2.879144,0.0,0.0,0.0,1.0,93.0


id          0
item_id     0
dept_id     0
cat_id      0
store_id    0
state_id    0
d           0
sales       0
dtype: int64

#### 1.5 Append Sales_train_validation and Sales_train_evaluation
- Derive column "d" for the following data assembling purpose 

In [14]:
# Report the (rows, columns) of both long-format frames before stacking them.
for shape_label, long_frame in (
    ("sales_train_val_T.shape: ", sales_train_val_T),
    ("sales_train_eval_T.shape: ", sales_train_eval_T),
):
    print(shape_label, long_frame.shape)

sales_train_val_T.shape:  (191300, 8)
sales_train_eval_T.shape:  (194100, 8)


In [15]:
# Concatenate sales_train_val_T and sales_train_eval_T
# Stacks the two long-format frames row-wise and renumbers the index from 0.
# NOTE(review): both inputs contain days d_1..d_1913 (their describe() output for
# d_1-d_5 is identical), and the rows shown differ only in the `id` suffix
# (`_validation` vs `_evaluation`). Stacking them therefore appears to repeat the
# sales of those days under two ids, which would double-weight them in any later
# aggregate. Confirm this is intended; sales_train_eval_T alone already spans
# d_1..d_1941.
sales_train = pd.concat([sales_train_val_T, sales_train_eval_T], ignore_index=True)
print("sales_train.shape: ", sales_train.shape)

sales_train.shape:  (385400, 8)


#### 1.6 Assemble sales_train with Calendar + Sell_prices
- Create column day, extract the day number from the 'd' column - Yes
- Derive var 'is_snap' based on state_id and snap_CA/TX/WI
- drop columns d, snap_CA/TX/WI column


In [16]:
# Attach calendar attributes (joined on the day label 'd'), then the weekly
# sell price (joined on store_id + item_id + wm_yr_wk). Both are left joins, so
# every sales row is kept even when no price row matches.
price_join_keys = ['store_id', 'item_id', 'wm_yr_wk']
sales_train2 = (
    sales_train
    .merge(calendar, how='left', on='d')
    .merge(sell_prices, how='left', on=price_join_keys)
)

In [17]:
# Extract the day number from the 'd' column (e.g. 'd_17' -> 17).
# The pattern is a raw string because '\d' is not a valid Python string escape
# (a SyntaxWarning on recent Python versions). expand=False makes extract()
# return a Series for the single capture group instead of a one-column DataFrame.
sales_train2['day'] = sales_train2['d'].str.extract(r'd_(\d+)', expand=False).astype(int)

# display(sales_train2.head())
# display(sales_train2.tail())

In [18]:
# Derive var 'is_snap': for each row take the SNAP flag of the row's own state
# (snap_CA for CA rows, snap_TX for TX rows, snap_WI for WI rows); rows from any
# other state get 0.
snap_column_by_state = {'CA': 'snap_CA', 'TX': 'snap_TX', 'WI': 'snap_WI'}

# One boolean mask per state, and the matching flag column in the same order.
conditions = [sales_train2['state_id'] == state for state in snap_column_by_state]
choices = [sales_train2[snap_col] for snap_col in snap_column_by_state.values()]

sales_train2['is_snap'] = np.select(conditions, choices, default=0)

In [19]:
# Remove the day label and the per-state SNAP flags; the 'day' and 'is_snap'
# columns derived from them in the previous cells are kept.
merged_helper_cols = ['d','snap_CA','snap_TX','snap_WI']
sales_train3 = sales_train2.drop(merged_helper_cols, axis=1)

In [20]:
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
sales_train3.info()
display(
    sales_train3.tail().T,        # last rows, transposed so every column is visible
    sales_train3.describe().T,    # summary statistics
    sales_train3.isnull().sum(),  # missing-value count per column
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385400 entries, 0 to 385399
Data columns (total 17 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   id          385400 non-null  object        
 1   item_id     385400 non-null  object        
 2   dept_id     385400 non-null  object        
 3   cat_id      385400 non-null  object        
 4   store_id    385400 non-null  object        
 5   state_id    385400 non-null  object        
 6   sales       385400 non-null  int64         
 7   date        385400 non-null  datetime64[ns]
 8   wm_yr_wk    385400 non-null  int64         
 9   weekday     385400 non-null  object        
 10  wday        385400 non-null  int64         
 11  month       385400 non-null  int64         
 12  year        385400 non-null  int64         
 13  is_holiday  385400 non-null  int64         
 14  sell_price  318508 non-null  float64       
 15  day         385400 non-null  int64         
 16  is

None

Unnamed: 0,385395,385396,385397,385398,385399
id,HOBBIES_1_006_WI_3_evaluation,HOBBIES_1_007_WI_3_evaluation,HOBBIES_1_008_WI_3_evaluation,HOBBIES_1_009_WI_3_evaluation,HOBBIES_1_010_WI_3_evaluation
item_id,HOBBIES_1_006,HOBBIES_1_007,HOBBIES_1_008,HOBBIES_1_009,HOBBIES_1_010
dept_id,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1
cat_id,HOBBIES,HOBBIES,HOBBIES,HOBBIES,HOBBIES
store_id,WI_3,WI_3,WI_3,WI_3,WI_3
state_id,WI,WI,WI,WI,WI
sales,0,1,0,1,1
date,2016-05-22 00:00:00,2016-05-22 00:00:00,2016-05-22 00:00:00,2016-05-22 00:00:00,2016-05-22 00:00:00
wm_yr_wk,11617,11617,11617,11617,11617
weekday,Sunday,Sunday,Sunday,Sunday,Sunday


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
sales,385400.0,1.021868,0.0,0.0,0.0,1.0,93.0,2.881376
date,385400.0,2013-09-18 01:13:13.980280320,2011-01-29 00:00:00,2012-05-24 00:00:00,2013-09-18 00:00:00,2015-01-13 00:00:00,2016-05-22 00:00:00,
wm_yr_wk,385400.0,11341.193565,11101.0,11217.0,11334.0,11450.0,11617.0,151.644252
wday,385400.0,3.997405,1.0,2.0,4.0,6.0,7.0,2.000649
month,385400.0,6.346134,1.0,3.0,6.0,9.0,12.0,3.448952
year,385400.0,2013.229372,2011.0,2012.0,2013.0,2015.0,2016.0,1.544804
is_holiday,385400.0,0.080955,0.0,0.0,0.0,0.0,1.0,0.272766
sell_price,318508.0,3.39423,0.42,1.64,2.97,4.34,9.58,2.322188
day,385400.0,964.050856,1.0,482.0,964.0,1446.0,1941.0,556.365708
is_snap,385400.0,0.329528,0.0,0.0,0.0,1.0,1.0,0.470042


id                0
item_id           0
dept_id           0
cat_id            0
store_id          0
state_id          0
sales             0
date              0
wm_yr_wk          0
weekday           0
wday              0
month             0
year              0
is_holiday        0
sell_price    66892
day               0
is_snap           0
dtype: int64

### 02 Feature Engineering 
#### 2.1 - DateTime Features
- Existing: year, month, wm_yr_wk, wday, day
- Drop: weekday, date
- Create: is_weekend 0-(wday in 3,4,5,6), 1-(wday in 7), 2-(wday in 1,2) 
#### 2.2 - ID Features
- Existing: id, item_id, dept_id, cat_id, store_id, state_id  
- Create: state_item_id
          store_item_id
#### 2.3 - Price Features
- "mean_store_item_price", "min_store_item_price", "max_store_item_price", "std_store_item_price"
- "mean_yr_store_item_price", "min_yr_store_item_price", "max_yr_store_item_price", 
"std_yr_store_item_price"
- "mean_m_store_item_price", "min_m_store_item_price", "max_m_store_item_price", 
"std_m_store_item_price"
----------------------------
- "mean_state_item_price", "min_state_item_price", "max_state_item_price", "std_state_item_price"
- "mean_yr_state_item_price", "min_yr_state_item_price", "max_yr_state_item_price", 
"std_yr_state_item_price"
- "mean_m_state_item_price", "min_m_state_item_price", "max_m_state_item_price", 
"std_m_state_item_price"
----------------------------
- "mean_item_price", "min_item_price", "max_item_price", "std_item_price"
- "mean_yr_item_price", "min_yr_item_price", "max_yr_item_price", "std_yr_item_price"
- "mean_m_item_price", "min_m_item_price", "max_m_item_price", "std_m_item_price"
- "mean_w_item_price", "min_w_item_price", "max_w_item_price", "std_w_item_price"

#### 2.4 - Sales Features

In [21]:
#### 2.1 - DateTime Features
# 'weekday' and 'date' are removed here; year, month, wm_yr_wk, wday and day
# remain as the date-related columns.
sales_train4 = sales_train3.drop(['weekday','date'], axis=1)

# create is_weekend: 0-(wday in 3,4,5,6), 1-(wday in 7), 2-(wday in 1,2) 
# Map day to is_weekend categories
def categorize_day(day):
    """Map a `wday` value to the three-level `is_weekend` code.

    Returns 0 for wday 3-6, 1 for wday 7, 2 for wday 1-2, and None for any
    other value.
    """
    # (code, wday values that map to it), checked in the same order as before.
    code_table = (
        (0, (3, 4, 5, 6)),
        (1, (7,)),
        (2, (1, 2)),
    )
    for code, wdays in code_table:
        if day in wdays:
            return code
    return None

# Element-wise lookup of the is_weekend code for every row's wday.
sales_train4['is_weekend'] = sales_train4['wday'].map(categorize_day)

In [22]:
#### 2.2 - ID Features
# Composite keys "<state_id>_<item_id>" and "<store_id>_<item_id>", used as
# grouping keys by the price aggregations.
item_key = sales_train4['item_id'].astype(str)
sales_train4['state_item_id'] = sales_train4['state_id'].astype(str).str.cat(item_key, sep="_")
sales_train4['store_item_id'] = sales_train4['store_id'].astype(str).str.cat(item_key, sep="_")

In [23]:
# First five rows, transposed so that all columns can be read as rows.
sales_train4.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,HOBBIES_1_001_CA_1_validation,HOBBIES_1_002_CA_1_validation,HOBBIES_1_003_CA_1_validation,HOBBIES_1_004_CA_1_validation,HOBBIES_1_005_CA_1_validation
item_id,HOBBIES_1_001,HOBBIES_1_002,HOBBIES_1_003,HOBBIES_1_004,HOBBIES_1_005
dept_id,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1
cat_id,HOBBIES,HOBBIES,HOBBIES,HOBBIES,HOBBIES
store_id,CA_1,CA_1,CA_1,CA_1,CA_1
state_id,CA,CA,CA,CA,CA
sales,0,0,0,0,0
wm_yr_wk,11101,11101,11101,11101,11101
wday,1,1,1,1,1
month,1,1,1,1,1


In [24]:
#### 2.3 
# "mean_store_item_price", "min_store_item_price", "max_store_item_price", "std_store_item_price"
# "mean_yr_store_item_price", "min_yr_store_item_price", "max_yr_store_item_price", "std_yr_store_item_price"
# "mean_m_store_item_price", "min_m_store_item_price", "max_m_store_item_price", "std_m_store_item_price"

# - "mean_state_item_price", "min_state_item_price", "max_state_item_price", "std_state_item_price"
# - "mean_yr_state_item_price", "min_yr_state_item_price", "max_yr_state_item_price", 
# "std_yr_state_item_price"
# - "mean_m_state_item_price", "min_m_state_item_price", "max_m_state_item_price", 
# "std_m_state_item_price"

def add_aggregated_column(df, group_by_cols, agg_col, agg_func, new_col_name):
    """
    Adds an aggregated column to the dataframe by performing groupby and merge.

    The aggregate of `agg_col` is computed once per group and broadcast back to
    every row of that group through a left merge, so the result has one row per
    input row, in the same order, with a fresh RangeIndex.

    The call is idempotent: if `new_col_name` already exists in `df` (for example
    because the notebook cell was re-run), the old column is replaced instead of
    the merge silently producing `<name>_x` / `<name>_y` columns.

    Parameters:
        df (pd.DataFrame): The original dataframe. It is not modified.
        group_by_cols (list or str): Column(s) to group by.
        agg_col (str): Column to aggregate.
        agg_func (str): Aggregation function (e.g., 'mean', 'max', 'min', 'std').
        new_col_name (str): Name of the new column to be added.

    Returns:
        pd.DataFrame: Dataframe with the new aggregated column added.

    Raises:
        ValueError: If `new_col_name` is one of the grouping columns.
    """
    # Accept a single column name as well as a list of names.
    keys = [group_by_cols] if isinstance(group_by_cols, str) else list(group_by_cols)
    if new_col_name in keys:
        raise ValueError(
            f"new_col_name {new_col_name!r} must not be one of the group-by columns {keys}"
        )

    # One row per group: the key columns plus the aggregated value.
    group = df.groupby(keys).agg({agg_col: agg_func}).reset_index()
    group = group.rename(columns={agg_col: new_col_name})

    # Drop a stale copy of the target column (if any) AFTER aggregating, so the
    # merge cannot create suffixed duplicates.
    base = df.drop(columns=[new_col_name], errors='ignore')
    return base.merge(group, on=keys, how='left')

# List of aggregations
# Each entry of `aggregations` is (group-by columns, column to aggregate,
# aggregation function, output column name). The list is generated from a compact
# table: for every grouping, the four output names are given in the order of
# `price_stats` (mean, max, min, std).
price_stats = ('mean', 'max', 'min', 'std')

price_feature_groups = [
    # --- store x item ---
    (['store_item_id'],
     ('mean_store_item_price', 'max_store_item_price',
      'min_store_item_price', 'std_store_item_price')),
    (['year', 'store_item_id'],
     ('mean_yr_store_item_price', 'max_yr_store_item_price',
      'min_yr_store_item_price', 'std_yr_store_item_price')),
    (['month', 'store_item_id'],
     ('mean_m_store_item_price', 'max_m_store_item_price',
      'min_m_store_item_price', 'std_m_store_item_price')),
    # --- state x item ---
    (['state_item_id'],
     ('mean_state_item_price', 'max_state_item_price',
      'min_state_item_price', 'std_state_item_price')),
    (['year', 'state_item_id'],
     ('mean_yr_state_item_price', 'max_yr_state_item_price',
      'min_yr_state_item_price', 'std_yr_state_item_price')),
    (['month', 'state_item_id'],
     ('mean_m_state_item_price', 'max_m_state_item_price',
      'min_m_state_item_price', 'std_m_state_item_price')),
    # --- item ---
    (['item_id'],
     ('mean_item_price', 'max_item_price',
      'min_item_price', 'std_item_price')),
    (['year', 'item_id'],
     ('mean_yr_item_price', 'max_yr_item_price',
      'min_yr_item_price', 'std_yr_item_price')),
    (['month', 'item_id'],
     ('mean_m_item_price', 'max_m_item_price',
      'min_m_item_price', 'std_m_item_price')),
    (['wm_yr_wk', 'item_id'],
     ('mean_w_item_price', 'max_w_item_price',
      'min_w_item_price', 'std_w_item_price')),
]

aggregations = [
    # list(...) gives every entry its own key list, as in a hand-written table.
    (list(group_cols), 'sell_price', stat, feature_name)
    for group_cols, feature_names in price_feature_groups
    for stat, feature_name in zip(price_stats, feature_names)
]

# Apply all aggregations
# Each spec is (group_by_cols, agg_col, agg_func, new_col_name), unpacked
# positionally; every pass adds one feature column to sales_train4.
for agg_spec in aggregations:
    sales_train4 = add_aggregated_column(sales_train4, *agg_spec)


In [25]:
# Spot-check the store x item price features on rows that have a sell price.
store_price_cols = [
    "mean_store_item_price",
    "min_store_item_price", "max_store_item_price",
    "std_store_item_price", "sell_price",
]
sales_train4.loc[sales_train4['sell_price'].notnull(), store_price_cols].head()

Unnamed: 0,mean_store_item_price,min_store_item_price,max_store_item_price,std_store_item_price,sell_price
7,0.476222,0.42,0.5,0.019962,0.46
8,1.76466,1.56,1.77,0.033063,1.56
9,2.981624,2.97,3.17,0.046801,3.17
13,4.522304,4.34,4.64,0.146499,4.34
18,1.76466,1.56,1.77,0.033063,1.56


In [26]:
# Spot-check the yearly store x item price features on rows that have a sell price.
yr_store_price_cols = [
    "mean_yr_store_item_price",
    "min_yr_store_item_price", "max_yr_store_item_price",
    "std_yr_store_item_price", "sell_price",
]
sales_train4.loc[sales_train4['sell_price'].notnull(), yr_store_price_cols].head()

Unnamed: 0,mean_yr_store_item_price,min_yr_store_item_price,max_yr_store_item_price,std_yr_store_item_price,sell_price
7,0.484214,0.42,0.5,0.030255,0.46
8,1.739466,1.56,1.77,0.074081,1.56
9,3.036469,2.97,3.17,0.094281,3.17
13,4.34,4.34,4.34,0.0,4.34
18,1.739466,1.56,1.77,0.074081,1.56


In [27]:
# Spot-check the monthly store x item price features on rows that have a sell price.
m_store_price_cols = [
    "mean_m_store_item_price",
    "min_m_store_item_price", "max_m_store_item_price",
    "std_m_store_item_price", "sell_price",
]
sales_train4.loc[sales_train4['sell_price'].notnull(), m_store_price_cols].head()

Unnamed: 0,mean_m_store_item_price,min_m_store_item_price,max_m_store_item_price,std_m_store_item_price,sell_price
7,0.476329,0.46,0.5,0.017579,0.46
8,1.766013,1.56,1.77,0.028706,1.56
9,2.973797,2.97,3.17,0.027339,3.17
13,4.516582,4.34,4.64,0.14786,4.34
18,1.766013,1.56,1.77,0.028706,1.56


In [None]:
# Scratch example: a tiny frame to check what a per-year mean of `price` looks like.
aaa = pd.DataFrame(
    {
        "year": [2001, 2001, 2001, 2002, 2002, 2002],
        "month": [1, 1, 2, 1, 1, 2],
        "price": [100, 200, 300, 100, 200, 300],
    }
)
aaa.groupby(['year'])[["price"]].mean()