In [76]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [77]:
calendar = pd.read_csv('../data/calendar.csv')
sales_train_val = pd.read_csv('../data/sales_train_validation.csv')
sales_train_eval = pd.read_csv('../data/sales_train_evaluation.csv')
sell_prices = pd.read_csv('../data/sell_prices.csv')

sales_train_val = sales_train_val[sales_train_val['item_id']=='HOBBIES_1_001']
sales_train_eval = sales_train_eval[sales_train_eval['item_id']=='HOBBIES_1_001']

### 01 Preprocessing Data
#### 1.1 Calendar Data

In [79]:
display(calendar.info())
display(calendar.head().T)
display(calendar.describe().T)
display(calendar.isnull().sum().T)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          1969 non-null   object
 1   wm_yr_wk      1969 non-null   int64 
 2   weekday       1969 non-null   object
 3   wday          1969 non-null   int64 
 4   month         1969 non-null   int64 
 5   year          1969 non-null   int64 
 6   d             1969 non-null   object
 7   event_name_1  162 non-null    object
 8   event_type_1  162 non-null    object
 9   event_name_2  5 non-null      object
 10  event_type_2  5 non-null      object
 11  snap_CA       1969 non-null   int64 
 12  snap_TX       1969 non-null   int64 
 13  snap_WI       1969 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 215.5+ KB


None

Unnamed: 0,0,1,2,3,4
date,2011-01-29,2011-01-30,2011-01-31,2011-02-01,2011-02-02
wm_yr_wk,11101,11101,11101,11101,11101
weekday,Saturday,Sunday,Monday,Tuesday,Wednesday
wday,1,2,3,4,5
month,1,1,1,2,2
year,2011,2011,2011,2011,2011
d,d_1,d_2,d_3,d_4,d_5
event_name_1,,,,,
event_type_1,,,,,
event_name_2,,,,,


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
wm_yr_wk,1969.0,11347.086338,155.277043,11101.0,11219.0,11337.0,11502.0,11621.0
wday,1969.0,3.997461,2.001141,1.0,2.0,4.0,6.0,7.0
month,1969.0,6.325546,3.416864,1.0,3.0,6.0,9.0,12.0
year,1969.0,2013.288471,1.580198,2011.0,2012.0,2013.0,2015.0,2016.0
snap_CA,1969.0,0.330117,0.470374,0.0,0.0,0.0,1.0,1.0
snap_TX,1969.0,0.330117,0.470374,0.0,0.0,0.0,1.0,1.0
snap_WI,1969.0,0.330117,0.470374,0.0,0.0,0.0,1.0,1.0


date               0
wm_yr_wk           0
weekday            0
wday               0
month              0
year               0
d                  0
event_name_1    1807
event_type_1    1807
event_name_2    1964
event_type_2    1964
snap_CA            0
snap_TX            0
snap_WI            0
dtype: int64

In [80]:
# Convert 'date' column to datetime format
calendar['date'] = pd.to_datetime(calendar['date'])
# drop weekday 
calendar = calendar.drop(columns=['weekday']) 

In [81]:
# define a function to do the following steps for columns: event_name_1, event_type_1, event_name_2, event_type_2
def process_event_column(column, calendardata):
    # Get unique values for the event_name column and the associated date
    temp_df = calendardata[[column, 'date']].dropna().sort_values(by=[column, 'date']).reset_index(drop=True)
    
    # Pivot the data: event names become columns, and dates remain as index
    temp_df = temp_df.pivot(index='date', columns=column, values=column)
    
    # Fill NaN values with 0, and existing values with 1
    temp_df = temp_df.fillna(0).applymap(lambda x: 1 if x != 0 else 0)

    # Merge the resulting pivoted DataFrame back to the original calendar data by date
    calendardata = calendardata.merge(temp_df, on='date', how='left')

    # Drop the original column after merge
    calendardata = calendardata.drop(columns=[column])

    # Return the updated calendar data and the new column names
    return calendardata, temp_df.columns.tolist()


# apply the function to the columns
calendar, event_name_1_columns = process_event_column(column='event_name_1', calendardata=calendar)
calendar, event_type_1_columns = process_event_column(column='event_type_1', calendardata=calendar)
calendar, event_name_2_columns = process_event_column(column='event_name_2', calendardata=calendar)
calendar, event_type_2_columns = process_event_column(column='event_type_2', calendardata=calendar)

print("event_name_1_columns: ", event_name_1_columns)
print("event_type_1_columns: ", event_type_1_columns)
print("event_name_2_columns: ", event_name_2_columns)
print("event_type_2_columns: ", event_type_2_columns)

event_name_1_columns:  ['Chanukah End', 'Christmas', 'Cinco De Mayo', 'ColumbusDay', 'Easter', 'Eid al-Fitr', 'EidAlAdha', "Father's day", 'Halloween', 'IndependenceDay', 'LaborDay', 'LentStart', 'LentWeek2', 'MartinLutherKingDay', 'MemorialDay', "Mother's day", 'NBAFinalsEnd', 'NBAFinalsStart', 'NewYear', 'OrthodoxChristmas', 'OrthodoxEaster', 'Pesach End', 'PresidentsDay', 'Purim End', 'Ramadan starts', 'StPatricksDay', 'SuperBowl', 'Thanksgiving', 'ValentinesDay', 'VeteransDay']
event_type_1_columns:  ['Cultural', 'National', 'Religious', 'Sporting']
event_name_2_columns:  ['Cinco De Mayo', 'Easter', "Father's day", 'OrthodoxEaster']
event_type_2_columns:  ['Cultural', 'Religious']


In [83]:
# calendar.loc[:,event_name_1_columns].fillna(0).applymap(lambda x: 1 if x > 0 else 0)

# event_name_1_columns

calendar.head().T


Unnamed: 0,0,1,2,3,4
date,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,2011-02-01 00:00:00,2011-02-02 00:00:00
wm_yr_wk,11101,11101,11101,11101,11101
wday,1,2,3,4,5
month,1,1,1,2,2
year,2011,2011,2011,2011,2011
d,d_1,d_2,d_3,d_4,d_5
snap_CA,0,0,0,1,1
snap_TX,0,0,0,1,0
snap_WI,0,0,0,0,1
Chanukah End,,,,,


In [92]:
calendar.columns

Index(['date', 'wm_yr_wk', 'wday', 'month', 'year', 'd', 'snap_CA', 'snap_TX',
       'snap_WI', 'Chanukah End', 'Christmas', 'Cinco De Mayo_x',
       'ColumbusDay', 'Easter_x', 'Eid al-Fitr', 'EidAlAdha', 'Father's day_x',
       'Halloween', 'IndependenceDay', 'LaborDay', 'LentStart', 'LentWeek2',
       'MartinLutherKingDay', 'MemorialDay', 'Mother's day', 'NBAFinalsEnd',
       'NBAFinalsStart', 'NewYear', 'OrthodoxChristmas', 'OrthodoxEaster_x',
       'Pesach End', 'PresidentsDay', 'Purim End', 'Ramadan starts',
       'StPatricksDay', 'SuperBowl', 'Thanksgiving', 'ValentinesDay',
       'VeteransDay', 'Cultural_x', 'National', 'Religious_x', 'Sporting',
       'Cinco De Mayo_y', 'Easter_y', 'Father's day_y', 'OrthodoxEaster_y',
       'Cultural_y', 'Religious_y'],
      dtype='object')

#### 1.2 Sell Prices Data
The store and item IDs together with the sales price of the item as a weekly average.

In [39]:
display(sell_prices.info()) 
display(sell_prices.head()) 
display(sell_prices.describe().T)
display(sell_prices.isnull().sum().T)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   store_id    object 
 1   item_id     object 
 2   wm_yr_wk    int64  
 3   sell_price  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 208.8+ MB


None

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
wm_yr_wk,6841121.0,11382.943423,148.610026,11101.0,11247.0,11411.0,11517.0,11621.0
sell_price,6841121.0,4.410952,3.408814,0.01,2.18,3.47,5.84,107.32


store_id      0
item_id       0
wm_yr_wk      0
sell_price    0
dtype: int64

#### 1.3 Sales_train_validation

In [None]:
display(sales_train_val.info())
display(sales_train_val.head())
display(sales_train_val.describe().T)
display(sales_train_val.isnull().sum().T)

In [None]:
# Pivot the dataframe d1 to d1913 to rows
sales_train_val_T = sales_train_val.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    value_vars=sales_train_val.columns[sales_train_val.columns.get_loc('d_1'):sales_train_val.columns.get_loc('d_1913')+1],
    var_name='d',
    value_name='sales'
)
display(sales_train_val_T.head())
display(sales_train_val_T.tail())

#### 1.4 Sales_train_evaluation

In [None]:
display(sales_train_eval.info())
display(sales_train_eval.head())
display(sales_train_eval.describe().T)
display(sales_train_eval.isnull().sum().T)


In [None]:
# Pivot the dataframe d1 to d1913 to rows
sales_train_eval_T = sales_train_eval.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    value_vars=sales_train_eval.columns[sales_train_eval.columns.get_loc('d_1'):sales_train_eval.columns.get_loc('d_1941')+1],
    var_name='d',
    value_name='sales'
)
display(sales_train_eval_T.head())
display(sales_train_eval_T.tail())

#### 1.5 Sales_train_validation and Sales_train_evaluation

In [None]:
print("sales_train_val_T.shape: ", sales_train_val_T.shape)
print("sales_train_eval_T.shape: ", sales_train_eval_T.shape)


In [None]:
# Concatenate sales_train_val_T and sales_train_eval_T
sales_train = pd.concat([sales_train_val_T, sales_train_eval_T], ignore_index=True)
print("sales_train.shape: ", sales_train.shape)


In [None]:
# extract the day number from the 'd' column
sales_train['day'] = sales_train['d'].str.extract('d_(\d+)').astype(int)
display(sales_train.head())
display(sales_train.tail())

#### 1.6 Merge sales_train with Calendar + Sell_prices


In [None]:
# merge sales_train with calendar on 'd' + sell_prices on 'wm_yr_wk'
sales_train2 = sales_train.merge(calendar, on='d').merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'])
# drop the 'd' column
sales_train = sales_train2.drop(columns=['d'])
# calculate daily revenue n 
sales_train['revenue'] = sales_train['sales'] * sales_train['sell_price']
sales_train['']
display(sales_train.head().T)


### 02 EDA + Feature Engineering
#### 2.1 Sales Features
##### 2.1.1 EDA

In [None]:


sales_train['annual_sales']= sales_train['sales'].groupby(sales_train['year']).transform('sum')

# # Create the bar plot for sales over the years
plt.bar(sales_train['year'], sales_train['annual_sales'], color='lightblue', edgecolor='black', label='Sales')

# # Overlay a line plot to show the trend
plt.plot(sales_train['year'], sales_train['annual_sales'], color='red', marker='o', label='Trend')

# Set x-ticks to be integers
plt.xticks(sales_train['year'].unique())

# # Add labels and title
plt.xlabel('Year')
plt.ylabel('Anual Sales')
plt.title('Sales Over the Years')

In [None]:
def plot_sales_bytime(period='year'):
    #set up plot size
    plt.figure(figsize=(16, 8))
    
    temp = sales_train['sales'].groupby(sales_train[period]).sum().reset_index()
    print(temp)

    # Create the bar plot for sales over the years
    plt.bar(temp[period], temp['sales'], color='lightblue', edgecolor='black', label='Sales')

    # Overlay a line plot to show the trend
    plt.plot(temp[period], temp['sales'], color='red', marker='o', label='Trend')

    # # Set x-ticks to be integers
    plt.xticks(temp[period].unique())

    # # # Add labels and title
    plt.xlabel(period)
    plt.ylabel(period+'Sales')
    plt.title('Sales Over the ' + period.capitalize())

plot_sales_bytime(period='year')

In [None]:
plot_sales_bytime(period='month')

In [None]:
plot_sales_bytime(period='wm_yr_wk')

In [None]:
sales_train[(11325 <sales_train['wm_yr_wk']) & (sales_train['wm_yr_wk'] < 11520)]['wm_yr_wk'].unique()

In [None]:
sales_train[(sales_train['year']==2014) & (sales_train['wm_yr_wk'] != 11349)]['wm_yr_wk'].min()

In [None]:
plot_sales_bytime(period='wday')

In [None]:
plot_sales_bytime(period='wday')

In [None]:
plot_sales_bytime(period='day')

##### 2.1.2 Feature Engineering

In [None]:
sales_train.head().T