In [1]:
%pip install -Uq upgini
import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
# visualization tools
from matplotlib import pyplot as plt, style
style.use('seaborn-darkgrid')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px
from tqdm import tqdm

import gc
gc.enable()
from warnings import filterwarnings, simplefilter
filterwarnings('ignore')
simplefilter('ignore')

In [29]:
# Date boundaries of the training and test periods (inclusive, ISO strings).
train_start = "2013-01-01"
train_end = "2017-08-15"
test_start = "2017-08-16"
test_end = "2017-08-31"

# Load the training sales. `infer_datetime_format` is intentionally not passed:
# it is deprecated in pandas 2.x and ISO-formatted dates are parsed the same
# way without it.
train = pd.read_csv('../input/store-sales-time-series-forecasting/train.csv',
                    parse_dates = ['date'],
                    dtype = {'store_nbr' : 'category',
                             'family' : 'category'},
                    usecols = ['date', 'store_nbr', 'family', 'sales'])
# The date level is kept as a timestamp (it is not converted to a daily Period).
# After indexing, `sales` is the only remaining column.
train = train.set_index(['date', 'store_nbr', 'family']).sort_index()
print(train.shape)
train.head()

In [30]:
# Load the test rows. `infer_datetime_format` is intentionally not passed: it is
# deprecated in pandas 2.x and ISO-formatted dates are parsed the same way
# without it.
# NOTE(review): unlike the train cell, store_nbr/family are not read with
# dtype 'category' here -- confirm that downstream joins/alignments expect this.
test = pd.read_csv('../input/store-sales-time-series-forecasting/test.csv',
                   parse_dates = ['date'])
# Index like `train`, but keep the rows ordered by the `id` column.
test = test.set_index(['date', 'store_nbr', 'family']).sort_values('id')
print(test.shape)
test.head()

In [31]:
# Daily calendar spanning the train and test periods, carrying a 7-observation
# rolling mean of the oil price.
calendar = pd.DataFrame(index = pd.date_range(train_start, test_end))
oil = pd.read_csv('../input/store-sales-time-series-forecasting/oil.csv',
                  parse_dates = ['date'],
                  index_col = 'date')
# rolling(7) is NaN until 7 non-missing observations are inside the window.
oil['avg_oil'] = oil['dcoilwtico'].rolling(7).mean()
calendar = calendar.join(oil['avg_oil'])
# Assign the forward-filled column back explicitly. The previous
# `calendar['avg_oil'].fillna(method='ffill', inplace=True)` was a chained
# in-place call, which does not update `calendar` under pandas Copy-on-Write,
# and `fillna(method=...)` itself is deprecated.
calendar['avg_oil'] = calendar['avg_oil'].ffill()
# Drop the leading days that still have no rolling average to carry forward.
calendar = calendar.dropna()

In [34]:
# Oil table on a complete daily index over train + test: days absent from the
# oil file become NaN rows, which are then filled by interpolation
# (pandas' default method).
df_oil = (
    oil.loc[train_start:test_end]
       .reindex(index=pd.date_range(train_start, test_end),
                columns=["dcoilwtico", "avg_oil"])
       .astype(np.float64)
       .interpolate()
)
df_oil.tail()

In [33]:
import statsmodels.api as sm

# Cross-correlation between the daily oil price and the mean daily sales of
# every product family, plotted for lags on both sides of zero.
st = "2013-01-10"
# The cell used `ed` everywhere but only ever defined `en`, so it raised a
# NameError on a fresh kernel. Define `ed`; keep `en` as an alias in case other
# cells refer to it.
ed = "2017-08-15"
en = ed

y_oil = df_oil.loc[st:ed]["dcoilwtico"]
# Mean sales over all stores for each (date, family) pair.
data = train.groupby(["date", "family"]).mean()
familys = data.index.get_level_values("family").unique()
plot_size = 30

# Full daily index for the analysis window; loop-invariant, so built once.
ccf_days = pd.date_range(st, ed)
# Signed lags for the concatenated CCF below: the left half holds lags
# plot_size-1 .. 0 (reversed), the right half lags 1 .. plot_size, i.e.
# -(plot_size-1) .. plot_size. The previous arange(-plot_size, plot_size)
# drew every stem one position too far to the left.
x_axis = np.arange(-plot_size + 1, plot_size + 1)

for family in familys:
    # Daily mean sales of this family, indexed by date.
    family_sales = data.xs(family, level="family")["sales"]
    family_sales.index = pd.to_datetime(family_sales.index)
    # The sales data has no rows for Christmas; put those days on the daily
    # index and treat them as zero sales.
    y_family = family_sales.reindex(ccf_days).fillna(0).to_numpy().reshape(-1, )

    # sm.tsa.ccf returns values for lags 0, 1, 2, ...; swapping the arguments
    # gives the other side of the lag axis.
    ccf_xy = sm.tsa.ccf(y_family, y_oil)[1:plot_size+1]
    ccf_yx = sm.tsa.ccf(y_oil, y_family)[:plot_size]
    ccf = np.concatenate([ccf_yx[::-1], ccf_xy])

    fig = plt.figure(figsize=(6, 3), dpi=120)
    ax = fig.add_subplot(111)
    ax.stem(x_axis, ccf)
    ax.set_xlim([-plot_size, plot_size])
    ax.set_ylim([-1, 1])
    ax.set_xlabel("lag (days)")
    ax.set_title(f"{family} vs Oil")
    plt.show()
    # One figure per family: release it so figures do not pile up in memory.
    plt.close(fig)

In [36]:
# Add the oil data to the feature table.
# Copy the two oil columns straight from df_oil: this keeps them float64,
# whereas filling a pre-allocated empty frame can leave object-dtype columns.
add_f = df_oil.loc[train_start:test_end, ["dcoilwtico", "avg_oil"]].copy()
add_f.head()