In [1]:
# "magic commands" to enable autoreload of the imported packages
%load_ext autoreload
# %reload_ext autoreload
%autoreload 2

import os

# Data Manipulation
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = 100
pd.options.display.float_format = '{:.2f}'.format
# pd.options.display.precision 

# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Default import behaviour
%matplotlib inline

import pickle
import warnings

#settings
warnings.filterwarnings("ignore")

In [2]:
# Load data
train_df = pd.read_csv("../raw_data/train.csv")
test_df = pd.read_csv("../raw_data/test.csv")

stores_df = pd.read_csv("../raw_data/stores.csv")
transactions_df = pd.read_csv("../raw_data/transactions.csv")
holiday_events_df = pd.read_csv("../raw_data/holidays_events.csv")
oil_df = pd.read_csv("../raw_data/oil.csv")

# Make copies
train_df = train_df.copy()
test_df = test_df.copy()

stores_df = stores_df.copy()
transactions_df = transactions_df.copy()
holiday_events_df = holiday_events_df.copy()
oil_df = oil_df.copy()

In [3]:
# Convert the date columns to datetime 
train_df["date"] = pd.to_datetime(train_df["date"])
test_df['date'] = pd.to_datetime(test_df['date'])
transactions_df["date"] = pd.to_datetime(transactions_df["date"])
holiday_events_df['date'] = pd.to_datetime(holiday_events_df['date'])
oil_df['date'] = pd.to_datetime(oil_df['date'])

In [4]:
test_df = test_df.drop(columns=["id"])
train_df = train_df.drop(columns=["id"])

## train_df and transactions_df

In [5]:
# Transactions will be one of the features
sales_transactions_df = pd.merge(train_df.groupby(["date", "store_nbr"]).sales.sum().reset_index(), 
                                 transactions_df, 
                                 how = "left")
sales_transactions_df 

Unnamed: 0,date,store_nbr,sales,transactions
0,2013-01-01,1,0.00,
1,2013-01-01,2,0.00,
2,2013-01-01,3,0.00,
3,2013-01-01,4,0.00,
4,2013-01-01,5,0.00,
...,...,...,...,...
90931,2017-08-15,50,16879.12,2804.00
90932,2017-08-15,51,20154.56,1573.00
90933,2017-08-15,52,18600.05,2255.00
90934,2017-08-15,53,8208.19,932.00


In [6]:
sales_transactions_df = sales_transactions_df.copy()
sales_transactions_df["year"] = sales_transactions_df.date.dt.year
sales_transactions_df["month"] = sales_transactions_df.date.dt.month
sales_transactions_df.head(3)

Unnamed: 0,date,store_nbr,sales,transactions,year,month
0,2013-01-01,1,0.0,,2013,1
1,2013-01-01,2,0.0,,2013,1
2,2013-01-01,3,0.0,,2013,1


In [7]:
# Check the average monthly transactions
average_monthly_transactions_df = transactions_df.set_index("date").resample("M")[["transactions"]].mean().reset_index()
average_monthly_transactions_df["year"] = average_monthly_transactions_df.date.dt.year
average_monthly_transactions_df.head(3)
#.resample(): Resample time-series data. Convenience method for frequency conversion and resampling of time series. 
# Here, we get the average of transactions for each month

Unnamed: 0,date,transactions,year
0,2013-01-31,1657.9,2013
1,2013-02-28,1684.48,2013
2,2013-03-31,1724.18,2013


In [8]:
all_=pd.merge(sales_transactions_df, stores_df, how="left")

## holiday_events_df

In [9]:
holiday_events_df = holiday_events_df.drop(columns=['description', 'transferred'])
holiday_events_df.head(3)

Unnamed: 0,date,type,locale,locale_name
0,2012-03-02,Holiday,Local,Manta
1,2012-04-01,Holiday,Regional,Cotopaxi
2,2012-04-12,Holiday,Local,Cuenca


## oil_df

In [10]:
print(oil_df.shape)
oil_df = oil_df.rename(columns={'dcoilwtico': 'oil_price'})
oil_df.head(3)

(1218, 2)


Unnamed: 0,date,oil_price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97


In [11]:
# DEAL WITH MISSING VALUES - OPTION 1: interpolate

oil_df = oil_df.set_index("date").oil_price.resample("D").sum().reset_index()
print(oil_df.shape)
oil_df.head(10)

(1704, 2)


Unnamed: 0,date,oil_price
0,2013-01-01,0.0
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-05,0.0
5,2013-01-06,0.0
6,2013-01-07,93.2
7,2013-01-08,93.21
8,2013-01-09,93.08
9,2013-01-10,93.81


In [12]:
# DEAL WITH MISSING VALUES - OPTION 1: interpolate
# 2-interpolate
oil_df["oil_price"] = np.where(oil_df["oil_price"] == 0, np.nan, oil_df["oil_price"])
#np.where(condition, [x,y,]): Return elements chosen from `x` or `y`= depending on `condition`.
#(returns an array with elements from x where condition is True, and elements from y elsewhere.)

oil_df["oil_price_interpolated"] = oil_df.oil_price.interpolate()
# .interpolate: Fill NaN values using an interpolation method (method='linear')

oil_df

Unnamed: 0,date,oil_price,oil_price_interpolated
0,2013-01-01,,
1,2013-01-02,93.14,93.14
2,2013-01-03,92.97,92.97
3,2013-01-04,93.12,93.12
4,2013-01-05,,93.15
...,...,...,...
1699,2017-08-27,,46.82
1700,2017-08-28,46.40,46.40
1701,2017-08-29,46.46,46.46
1702,2017-08-30,45.96,45.96


In [13]:
# DEAL WITH MISSING VALUES - OPTION 1: interpolate
# 4-plot
p = oil_df.melt(id_vars=['date']+list(oil_df.keys()[5:]), var_name='Legend')
px.line(p.sort_values(["Legend", "date"], ascending = [False, True]), x='date', y='value', color='Legend',title = "Daily Oil Price" )

# merge datasets

### Merge train_df with store_df, holiday_events_df, and oil_df

In [14]:
# 1 - Merge train_df and store_df
merged_df = pd.merge(train_df, stores_df, how="left", on="store_nbr")    
merged_df.head(2)

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster
0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13
1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13


In [15]:
# 2 - Merge train_df, store_df, holiday_events_df
merged_df = pd.merge(merged_df, holiday_events_df, how="left", 
                     left_on="date", right_on="date")  
merged_df = merged_df.rename(columns = {"type_x": "type_stores", "type_y": "type_holiday"})
merged_df.head(2)

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type_stores,cluster,type_holiday,locale,locale_name
0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,Holiday,National,Ecuador
1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,Holiday,National,Ecuador


In [16]:
# 3 - Merge train_df, store_df, holiday_events_df, oil_df
merged_df = pd.merge(merged_df, oil_df, how="left", left_on="date", right_on="date")    

In [17]:
# Replace missing values (NaN) 
merged_df["type_holiday"] = merged_df["type_holiday"].replace(np.nan, "Not Holiday")
merged_df["locale"] = merged_df["locale"].replace(np.nan, "Not Holiday")
merged_df["locale_name"] = merged_df["locale_name"].replace(np.nan, "Not Holiday")

# NOT SURE ABOUT OIL_PRICE!!! replace with 0 for now
merged_df["oil_price"] = merged_df["oil_price"].replace(np.nan, 0)
merged_df["oil_price_interpolated"] = merged_df["oil_price"].replace(np.nan, 0)

### Merge merged_df with transactions_df

In [18]:

# Convert cluster datatype from int to object
merged_df["cluster"] = merged_df["cluster"].astype('object')

In [19]:
merged_df

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type_stores,cluster,type_holiday,locale,locale_name,oil_price,oil_price_interpolated
0,2013-01-01,1,AUTOMOTIVE,0.00,0,Quito,Pichincha,D,13,Holiday,National,Ecuador,0.00,0.00
1,2013-01-01,1,BABY CARE,0.00,0,Quito,Pichincha,D,13,Holiday,National,Ecuador,0.00,0.00
2,2013-01-01,1,BEAUTY,0.00,0,Quito,Pichincha,D,13,Holiday,National,Ecuador,0.00,0.00
3,2013-01-01,1,BEVERAGES,0.00,0,Quito,Pichincha,D,13,Holiday,National,Ecuador,0.00,0.00
4,2013-01-01,1,BOOKS,0.00,0,Quito,Pichincha,D,13,Holiday,National,Ecuador,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,2017-08-15,9,POULTRY,438.13,0,Quito,Pichincha,B,6,Holiday,Local,Riobamba,47.57,47.57
3054344,2017-08-15,9,PREPARED FOODS,154.55,1,Quito,Pichincha,B,6,Holiday,Local,Riobamba,47.57,47.57
3054345,2017-08-15,9,PRODUCE,2419.73,148,Quito,Pichincha,B,6,Holiday,Local,Riobamba,47.57,47.57
3054346,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.00,8,Quito,Pichincha,B,6,Holiday,Local,Riobamba,47.57,47.57


#### Option 1: Group merged_df by store_nbr and date >> merge with transactions_df

In [20]:
def join_unique_values(arr):
    return ', '.join(np.unique(arr.astype(str)))

# Apply updated custom aggregation function to categorical columns in merged_stores_df DataFrame
merged_stores_df = merged_df.groupby(["store_nbr", "date"]).agg({'sales': np.sum,
                                                                 'onpromotion': np.sum, 
                                                                 'city': join_unique_values, 
                                                                 'state': join_unique_values, 
                                                                 'type_stores': join_unique_values, 
                                                                 'cluster': join_unique_values, 
                                                                 'type_holiday': join_unique_values, 
                                                                 'locale': join_unique_values, 
                                                                 'locale_name': join_unique_values, 
                                                                 'oil_price':'mean', 
                                                                 'oil_price_interpolated':'mean'})


In [21]:
merged_stores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,sales,onpromotion,city,state,type_stores,cluster,type_holiday,locale,locale_name,oil_price,oil_price_interpolated
store_nbr,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2013-01-01,0.00,0,Quito,Pichincha,D,13,Holiday,National,Ecuador,0.00,0.00
1,2013-01-02,7417.15,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,93.14,93.14
1,2013-01-03,5873.24,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,92.97,92.97
1,2013-01-04,5919.88,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,93.12,93.12
1,2013-01-05,6318.79,0,Quito,Pichincha,D,13,Work Day,National,Ecuador,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
54,2017-08-11,8513.83,166,El Carmen,Manabi,C,3,Transfer,National,Ecuador,48.81,48.81
54,2017-08-12,9139.68,138,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,0.00,0.00
54,2017-08-13,14246.83,173,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,0.00,0.00
54,2017-08-14,11882.99,126,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,47.59,47.59


In [22]:
# 2-Merge merged_df and transactions_df (the train dataset is grouped by store_nbr and date)
all_df = pd.merge(merged_stores_df.reset_index(), 
                  transactions_df, 
                  how="left")
all_df

Unnamed: 0,store_nbr,date,sales,onpromotion,city,state,type_stores,cluster,type_holiday,locale,locale_name,oil_price,oil_price_interpolated,transactions
0,1,2013-01-01,0.00,0,Quito,Pichincha,D,13,Holiday,National,Ecuador,0.00,0.00,
1,1,2013-01-02,7417.15,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,93.14,93.14,2111.00
2,1,2013-01-03,5873.24,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,92.97,92.97,1833.00
3,1,2013-01-04,5919.88,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,93.12,93.12,1863.00
4,1,2013-01-05,6318.79,0,Quito,Pichincha,D,13,Work Day,National,Ecuador,0.00,0.00,1509.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,54,2017-08-11,8513.83,166,El Carmen,Manabi,C,3,Transfer,National,Ecuador,48.81,48.81,768.00
90932,54,2017-08-12,9139.68,138,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,0.00,0.00,903.00
90933,54,2017-08-13,14246.83,173,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,0.00,0.00,1054.00
90934,54,2017-08-14,11882.99,126,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,47.59,47.59,818.00


In [23]:
# drop rows with NA 
all_df.dropna(subset=['transactions'], inplace=True)

In [24]:
all_df

Unnamed: 0,store_nbr,date,sales,onpromotion,city,state,type_stores,cluster,type_holiday,locale,locale_name,oil_price,oil_price_interpolated,transactions
1,1,2013-01-02,7417.15,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,93.14,93.14,2111.00
2,1,2013-01-03,5873.24,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,92.97,92.97,1833.00
3,1,2013-01-04,5919.88,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,93.12,93.12,1863.00
4,1,2013-01-05,6318.79,0,Quito,Pichincha,D,13,Work Day,National,Ecuador,0.00,0.00,1509.00
5,1,2013-01-06,2199.09,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,0.00,0.00,520.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,54,2017-08-11,8513.83,166,El Carmen,Manabi,C,3,Transfer,National,Ecuador,48.81,48.81,768.00
90932,54,2017-08-12,9139.68,138,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,0.00,0.00,903.00
90933,54,2017-08-13,14246.83,173,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,0.00,0.00,1054.00
90934,54,2017-08-14,11882.99,126,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,47.59,47.59,818.00


In [25]:
# make pickle file for OPTION 1
all_df.to_pickle(f'../models/all_df.pickle')