In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the four raw CSV inputs from ../data; the names on the left line up
# positionally with the paths listed on the right.
calendar, sales_train_val, sales_train_eval, sell_prices = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/calendar.csv',
        '../data/sales_train_validation.csv',
        '../data/sales_train_evaluation.csv',
        '../data/sell_prices.csv',
    )
)

### 01 Preprocessing Data
#### 1.1 Calendar Data

In [None]:
# Structural overview of the calendar table.
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" under the report.
calendar.info()
display(calendar.head().T)
display(calendar.describe().T)
# Missing-value count per column (a Series, so no transpose is needed).
display(calendar.isnull().sum())

In [None]:
# Parse 'date' into datetime values and discard the 'weekday' column in a
# single chained expression.
calendar = (
    calendar
    .assign(date=pd.to_datetime(calendar['date']))
    .drop(columns=['weekday'])
)

In [None]:
# define a function to do the following steps for columns: event_name_1, event_type_1, event_name_2, event_type_2
def process_event_column(column, calendardata):
    """Replace an event column with one 0/1 indicator column per distinct event.

    Parameters
    ----------
    column : str
        Name of the column in ``calendardata`` to encode; missing values mean
        "no event on that date".
    calendardata : pandas.DataFrame
        Calendar table containing ``column`` and a ``'date'`` column.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        A new frame with ``column`` removed and the indicator columns added,
        plus the names those indicator columns carry in the returned frame.

    Notes
    -----
    * Dates without any event get 0 in every indicator column (a plain left
      merge would leave NaN there and turn the columns into floats).
    * If an indicator name already exists in ``calendardata``, pandas' default
      merge suffixes apply: the existing column becomes ``<name>_x`` and the
      new one ``<name>_y``. The returned list reports the ``_y`` name so it
      always refers to columns that exist in the returned frame.
    * ``pivot`` raises ``ValueError`` if the same (date, event) pair occurs
      more than once.
    """
    # Keep only the rows that actually carry an event in `column`.
    events = calendardata[[column, 'date']].dropna()
    if events.empty:
        # Nothing to encode: just drop the (all-missing) source column.
        return calendardata.drop(columns=[column]), []

    # One column per distinct event value, indexed by date; a cell is non-null
    # only where that event occurs on that date.
    indicators = events.pivot(index='date', columns=column, values=column)
    indicators = indicators.notna().astype('int64')

    # Work out the names the indicator columns will have after the merge.
    clashes = set(indicators.columns) & set(calendardata.columns)
    new_columns = [
        f'{name}_y' if name in clashes else name
        for name in indicators.columns
    ]

    # Left merge keeps every calendar row; `indicators` is keyed by its
    # 'date' index. Suffixes are spelled out to match pandas' defaults.
    merged = calendardata.merge(
        indicators, on='date', how='left', suffixes=('_x', '_y')
    )

    # Dates with no event had no match in `indicators`: turn NaN into 0 and
    # restore the integer dtype.
    merged = merged.fillna({name: 0 for name in new_columns})
    merged = merged.astype({name: 'int64' for name in new_columns})

    # The source column is fully represented by the indicators now.
    merged = merged.drop(columns=[column])
    return merged, new_columns


# Encode each of the four event columns in turn. Every call consumes the
# calendar produced by the previous one, so the order below matters.
# NOTE(review): if the *_2 columns share values with the *_1 columns, the
# merge inside process_event_column will suffix the clashing names; confirm
# the resulting column names against the data.
calendar, event_name_1_columns = process_event_column('event_name_1', calendar)
calendar, event_type_1_columns = process_event_column('event_type_1', calendar)
calendar, event_name_2_columns = process_event_column('event_name_2', calendar)
calendar, event_type_2_columns = process_event_column('event_type_2', calendar)

# print("event_name_1_columns: ", event_name_1_columns)
# print("event_type_1_columns: ", event_type_1_columns)
# print("event_name_2_columns: ", event_name_2_columns)
# print("event_type_2_columns: ", event_type_2_columns)

#### 1.2 Sell Prices Data
This table contains the store and item IDs together with each item's sales price, given as a weekly average.

In [None]:
# Structural overview of the sell-prices table.
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" under the report.
sell_prices.info()
display(sell_prices.head())
display(sell_prices.describe().T)
# Missing-value count per column (a Series, so no transpose is needed).
display(sell_prices.isnull().sum())

#### 1.3 Sales_train_validation

In [None]:
# Structural overview of the wide-format validation sales table.
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" under the report.
sales_train_val.info()
display(sales_train_val.head())
display(sales_train_val.describe().T)
# Missing-value count per column (a Series, so no transpose is needed).
display(sales_train_val.isnull().sum())

In [None]:
# Reshape wide -> long: every column from 'd_1' through 'd_1913' (inclusive,
# by position) is unpivoted into a ('d', 'sales') pair, with the identifier
# columns repeated on each resulting row.
sales_train_val_T = pd.melt(
    sales_train_val,
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    value_vars=sales_train_val.columns[
        sales_train_val.columns.get_loc('d_1'):sales_train_val.columns.get_loc('d_1913') + 1
    ],
    var_name='d',
    value_name='sales',
)
display(sales_train_val_T.head(), sales_train_val_T.tail())

#### 1.4 Sales_train_evaluation

In [None]:
# Structural overview of the wide-format evaluation sales table.
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" under the report.
sales_train_eval.info()
display(sales_train_eval.head())
display(sales_train_eval.describe().T)
# Missing-value count per column (a Series, so no transpose is needed).
display(sales_train_eval.isnull().sum())


In [None]:
# Reshape wide -> long: every column from 'd_1' through 'd_1941' (inclusive,
# by position) is unpivoted into a ('d', 'sales') pair, with the identifier
# columns repeated on each resulting row.
sales_train_eval_T = pd.melt(
    sales_train_eval,
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    value_vars=sales_train_eval.columns[
        sales_train_eval.columns.get_loc('d_1'):sales_train_eval.columns.get_loc('d_1941') + 1
    ],
    var_name='d',
    value_name='sales',
)
display(sales_train_eval_T.head(), sales_train_eval_T.tail())

#### 1.5 Sales_train_validation and Sales_train_evaluation

In [None]:
# Report (rows, columns) of each long-format frame before they are stacked.
print("sales_train_val_T.shape: ", sales_train_val_T.shape)
print("sales_train_eval_T.shape: ", sales_train_eval_T.shape)


In [None]:
# Stack the two long-format frames row-wise under a fresh 0..n-1 index.
# NOTE(review): both frames were melted starting at 'd_1', so days
# d_1..d_1913 are present in each of them; confirm this overlap is intended
# and does not double-count sales downstream.
sales_train = pd.concat(
    (sales_train_val_T, sales_train_eval_T),
    axis=0,
    ignore_index=True,
)
print("sales_train.shape: ", sales_train.shape)


In [None]:
# Extract the numeric day index from labels of the form 'd_<number>'.
# The pattern is a raw string so that `\d` reaches the regex engine verbatim;
# in a plain literal it is an invalid escape sequence, which Python reports
# as a DeprecationWarning (a SyntaxWarning from 3.12 on).
# expand=False makes the single capture group come back as a Series rather
# than a one-column DataFrame.
sales_train['day'] = sales_train['d'].str.extract(r'd_(\d+)', expand=False).astype(int)
display(sales_train.head())
display(sales_train.tail())

#### 1.6 Merge sales_train with Calendar + Sell_prices


In [14]:
# Attach the calendar columns via the day label 'd', then the prices via
# (store_id, item_id, wm_yr_wk). Both joins use merge's default inner join,
# so rows without a match on either side are dropped.
sales_train2 = pd.merge(sales_train, calendar, on='d')
sales_train2 = pd.merge(sales_train2, sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'])
# The day label 'd' is not carried forward into sales_train.
sales_train = sales_train2.drop('d', axis=1)
# Daily revenue = sales * sell_price.
sales_train['revenue'] = sales_train['sales'].mul(sales_train['sell_price'])
display(sales_train.head().T)


### 02 EDA + Feature Engineering
#### 2.1 Sales Features
##### 2.1.1 EDA

In [None]:
# Total of the 'sales' column for each value of 'year'.
# NOTE(review): `aaa` is a placeholder name; rename once its later uses are known.
aaa = sales_train.groupby('year')['sales'].sum()

In [None]:
# Distinct values of the 'year' column, in order of first appearance.
sales_train['year'].unique()