In [None]:
import pandas as pd
import numpy as np
import recruit_utils

import pdvega
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf, kpss
from sklearn import cluster

In [None]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

In [None]:
# Import data into pandas data frames
# NOTE(review): `recruit_utils` is a project-local module; what `data` and
# `stores` contain is not visible in this notebook — confirm against
# recruit_utils.import_data().
data, stores = recruit_utils.import_data()

In [None]:
# Transform data and stores into train and test data frames
# NOTE(review): the feature columns used further down (e.g. 'log_visitors',
# 'visitor_diff', 'cluster', 'dow') are presumably built inside
# recruit_utils.create_train_test — verify there.
train, test = recruit_utils.create_train_test(data, stores)

In [None]:
# Inspect the training data: preview the first rows
train.head()

In [None]:
# (rows, columns) of the training frame
train.shape

In [None]:
# Summary statistics for the training frame
train.describe()

In [None]:
# Inspect the test data: preview the first rows
test.head()

In [None]:
# (rows, columns) of the test frame
test.shape

In [None]:
# Summary statistics for the test frame
test.describe()

In [None]:
# Side-by-side distributions of the raw and the log-scaled visitor columns
fig, axs = plt.subplots(figsize=(12, 6), ncols=2)
raw_panel, log_panel = axs
sns.distplot(train['visitors'], ax=raw_panel)
sns.distplot(train['log_visitors'], ax=log_panel)

In [None]:
# Main predictor variables for the visualizations, split by how they are plotted.

# Columns treated as categories (count plots / facets)
categorical_vars = [
    'dow', 'wom', 'year', 'month', 'day', 'day_of_week', 'holiday_flg',
    'air_genre_name', 'air_area_name', 'air_store_id2', 'cluster',
]

# Columns treated as continuous (pair plots against the targets)
numeric_vars = [
    'min_visitors', 'mean_visitors', 'median_visitors', 'max_visitors',
    'count_observations',
    'rs1_x', 'rv1_x', 'rs2_x', 'rv2_x',
    'rs1_y', 'rv1_y', 'rs2_y', 'rv2_y',
    'total_reserv_sum', 'total_reserv_mean', 'total_reserv_dt_diff_mean',
]

In [None]:
# Response columns to compare against the numeric predictors
target = ['visitors', 'log_visitors', 'visitor_diff', 'log_visitor_diff']

# Kept as a plain column list because the following cell previews train[subset]
subset = target + numeric_vars

# One row of regression panels per response column. Restricting y_vars to the
# current target makes every iteration draw a different figure; without it the
# loop variable is unused and the same full pair grid is rendered four times.
for y in target:
    sns.pairplot(train[subset], x_vars=numeric_vars, y_vars=[y], kind="reg")
    plt.show()

In [None]:
# Preview the target + numeric predictor columns (`subset` comes from the pair-plot cell)
train[subset].head()

In [None]:
# Stack train and test into one frame, tagging every row with the set it came
# from so later plots can facet on `data_set`. assign() works on a copy, so
# neither source frame is modified and no temporaries are left behind.
combined = pd.concat([
    train.assign(data_set='Train'),
    test.assign(data_set='Test'),
])

In [None]:
# `plot_df` is not assigned anywhere earlier in the notebook, so on a fresh
# kernel this cell raised NameError. Build it here instead.
# NOTE(review): assumes the intended frame was the training rows restricted to
# `categorical_vars`; high-cardinality columns (e.g. 'air_store_id2') will
# produce very wide count plots — confirm or trim the list.
plot_df = train[[col for col in categorical_vars if col in train.columns]]

# One count plot per column, each on its own figure
for col in plot_df.columns:
    plt.figure()
    sns.countplot(x=col, data=plot_df)

In [None]:
# Narrow the categorical list to the columns compared between train and test
categorical_vars = [
    'year',
    'month',
    'holiday_flg',
    'air_genre_name',
    'air_area_name',
    'cluster',
]

In [None]:
# seaborn renamed factorplot to catplot in 0.9; use whichever this install has.
count_facets = getattr(sns, 'catplot', None) or sns.factorplot

# Category counts for each variable, with Train and Test side by side.
for var in categorical_vars:
    # Figure-level seaborn plots create their own figure, so no plt.figure()
    # call is made here (it only produced an empty extra figure per variable).
    count_facets(x=var, col='data_set', data=combined, kind="count")

In [None]:
# Histogram grid (50 bins each) of the training frame's columns
train.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# Scatter of the training rows by coordinates; both marker size and colour
# are driven by the `mean_visitors` column.
train.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=train["mean_visitors"],
    c=train["mean_visitors"],
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    label="# of visitors",
)
plt.legend()

In [None]:
# Scatter of the test rows by coordinates. The points need a label: calling
# plt.legend() with no labelled artist only emits a warning and draws nothing.
test.plot(kind="scatter", x="longitude", y="latitude",
          alpha=0.4, label="test set")
plt.legend()

In [None]:
# Columns carried into the time-series view of the training data
ts_columns = ['visit_date', 'air_area_name', 'air_genre_name', 'cluster',
              'day_of_week', 'holiday_flg', 'visitors', 'visitor_diff',
              'log_visitors', 'log_visitor_diff']

# Work on a copy so `train` is left untouched
ts = train[ts_columns].copy()

# Index.to_datetime() has been removed from pandas; pd.to_datetime() is the
# supported conversion and is a no-op if the column is already datetime.
ts['visit_date'] = pd.to_datetime(ts['visit_date'])
ts = ts.set_index('visit_date')

In [None]:
# Order the rows by the visit_date index
ts = ts.sort_index()

In [None]:
# One line per area: visitors against the visit_date index
area_names = np.unique(ts.air_area_name)
for area in area_names:
    in_area = ts.air_area_name == area
    plt.plot(ts.loc[in_area, 'visitors'])

plt.show()

In [None]:
# One line per genre: visitors against the visit_date index
for genre in np.unique(ts.air_genre_name):
    plt.plot(ts[ts.air_genre_name == genre]['visitors'], label=genre)

# The labels were being set but never displayed; show them the same way the
# cluster, day-of-week and holiday plots do.
plt.legend(loc='best')
plt.show()

In [None]:
# One labelled line per cluster: visitors against the visit_date index
cluster_ids = np.unique(ts.cluster)
for clus in cluster_ids:
    in_cluster = ts.cluster == clus
    plt.plot(ts.loc[in_cluster, 'visitors'], label=clus)

plt.legend(loc='best')
plt.show()

In [None]:
# One labelled line per day_of_week value: visitors against the visit_date index
weekdays = np.unique(ts.day_of_week)
for dow in weekdays:
    on_weekday = ts.day_of_week == dow
    plt.plot(ts.loc[on_weekday, 'visitors'], label=dow)

plt.legend(loc='best')
plt.show()

In [None]:
# One labelled line per holiday_flg value: visitors against the visit_date index
holiday_flags = np.unique(ts.holiday_flg)
for hol in holiday_flags:
    has_flag = ts.holiday_flg == hol
    plt.plot(ts.loc[has_flag, 'visitors'], label=hol)

plt.legend(loc='best')
plt.show()

In [None]:
# Collapse the series to one row per visit date. Only numeric columns are
# averaged: mean() over the string columns raises in recent pandas. The
# statement is safe to re-run on an already-aggregated frame, and later cells
# rely on `ts` being this daily frame.
ts = ts.select_dtypes(include=[np.number]).groupby(level=0).mean()
ts_dates = pd.to_datetime(ts.index).to_pydatetime()
ts_visitors = ts.log_visitors.values  # .as_matrix() was removed in pandas 1.0

# Moving average via convolution with a flat window. mode='same' pads with
# zeros, so roughly the first and last window_size // 2 points are pulled
# towards zero.
window_size = 16
window = np.ones(window_size) / float(window_size)
ts_avg = np.convolve(ts_visitors, window, 'same')

# The bokeh names `figure` / `show` are never imported in this notebook, so
# the chart is drawn with matplotlib, which is.
fig, ax = plt.subplots(figsize=(12, 5))
ax.scatter(ts_dates, ts_visitors, s=16, color='darkgrey', alpha=0.5,
           label='daily mean')
ax.plot(ts_dates, ts_avg, color='navy', label='%d-day average' % window_size)
ax.set_title("Log Visitors %d Day Average" % window_size)
ax.set_xlabel('Date')
ax.set_ylabel('Mean log visitors')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Make sure `ts` holds one row per visit date. Only numeric columns are
# averaged: mean() over string columns raises in recent pandas. Re-running
# this on an already-aggregated frame leaves it unchanged.
ts = ts.select_dtypes(include=[np.number]).groupby(level=0).mean()
ts_dates = pd.to_datetime(ts.index).to_pydatetime()
ts_visitors = ts.log_visitor_diff.values  # .as_matrix() was removed in pandas 1.0

# Moving average via convolution with a flat window. mode='same' pads with
# zeros, so roughly the first and last window_size // 2 points are pulled
# towards zero.
window_size = 16
window = np.ones(window_size) / float(window_size)
ts_avg = np.convolve(ts_visitors, window, 'same')

# The bokeh names `figure` / `show` are never imported in this notebook, so
# the chart is drawn with matplotlib, which is. Title and axis label name the
# series that is actually plotted (the differenced log visitors).
fig, ax = plt.subplots(figsize=(12, 5))
ax.scatter(ts_dates, ts_visitors, s=16, color='darkgrey', alpha=0.5,
           label='daily mean')
ax.plot(ts_dates, ts_avg, color='navy', label='%d-day average' % window_size)
ax.set_title("Log Visitor Diff %d Day Average" % window_size)
ax.set_xlabel('Date')
ax.set_ylabel('Mean log visitor diff')
ax.legend(loc='upper left')
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries, label):
    
    #Determing rolling statistics
    rolmean = pd.rolling_mean(timeseries, window=14)
    rolstd = pd.rolling_std(timeseries, window=14)

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries[label], autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)

In [None]:
# Rolling statistics and ADF test for the per-date mean of log_visitors
test_stationarity(ts[['log_visitors']], 'log_visitors')

In [None]:
# Rolling statistics and ADF test for the per-date mean of log_visitor_diff
test_stationarity(ts[['log_visitor_diff']], 'log_visitor_diff')

In [None]:
# KPSS test on log_visitors for 1..25 lags. The lag count is passed
# positionally because its keyword was renamed from `lags` to `nlags` across
# statsmodels releases; 'c' is the default regression argument.
for i in range(1, 26):
    print("KPSS test for lag = ", i, kpss(ts.log_visitors, 'c', i))

In [None]:
# KPSS test on log_visitor_diff for 1..25 lags. The lag count is passed
# positionally because its keyword was renamed from `lags` to `nlags` across
# statsmodels releases; 'c' is the default regression argument.
for i in range(1, 26):
    print("KPSS test for lag = ", i, kpss(ts.log_visitor_diff, 'c', i))

In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox

# Ljung-Box autocorrelation test on log_visitors with lags=25
acorr_ljungbox(ts.log_visitors, lags=25)

In [None]:
# Ljung-Box autocorrelation test on log_visitor_diff with lags=25
acorr_ljungbox(ts.log_visitor_diff, lags=25)

In [None]:
# Autocorrelation of log_visitors up to lag 50. plot_acf returns the Figure;
# the trailing semicolon stops Jupyter from echoing it as a second copy.
plot_acf(ts.log_visitors, lags=50);

In [None]:
# Autocorrelation of log_visitor_diff up to lag 50. plot_acf returns the
# Figure; the trailing semicolon stops Jupyter from echoing it as a second copy.
plot_acf(ts.log_visitor_diff, lags=50);

In [None]:
# Partial autocorrelation of log_visitors up to lag 50. plot_pacf returns the
# Figure; the trailing semicolon stops Jupyter from echoing it as a second copy.
plot_pacf(ts.log_visitors, lags=50);

In [None]:
# Partial autocorrelation of log_visitor_diff up to lag 50. plot_pacf returns
# the Figure; the trailing semicolon stops Jupyter from echoing it as a second copy.
plot_pacf(ts.log_visitor_diff, lags=50);

In [None]:
# Mean log_visitors at each `day` value with a LOWESS trend, one hue per `dow`.
# NOTE(review): `combined` also holds the test rows — confirm they carry
# usable log_visitors values. lmplot returns a FacetGrid, not an Axes,
# despite the variable name.
ax = sns.lmplot(x="day", y="log_visitors", x_estimator=np.mean, data=combined, hue="dow", lowess=True)

In [None]:
# Mean visitors at each `month` value with a LOWESS trend, one hue per `dow`.
# NOTE(review): `combined` also holds the test rows — confirm they carry
# usable visitors values. lmplot returns a FacetGrid, not an Axes, despite
# the variable name.
ax = sns.lmplot(x="month", y="visitors", x_estimator=np.mean, data=combined, hue="dow", lowess=True)

In [None]:
# Mean visitors at each `day` value with a LOWESS trend, one hue per holiday_flg.
# NOTE(review): `combined` also holds the test rows — confirm they carry
# usable visitors values. lmplot returns a FacetGrid, not an Axes, despite
# the variable name.
ax = sns.lmplot(x="day", y="visitors", x_estimator=np.mean, data=combined, hue="holiday_flg", lowess=True)

In [None]:
# Visitors per `dow` value in the training data; y-axis limited to 0-40
box_ax = sns.boxplot(data=train, x="dow", y="visitors")
box_ax.set_ylim(0, 40)

In [None]:
# Visitors per holiday_flg value in the training data; y-axis limited to 0-40
box_ax = sns.boxplot(data=train, x="holiday_flg", y="visitors")
box_ax.set_ylim(0, 40)

In [None]:
# Visitors per air_genre_name in the training data; y-axis limited to 0-100
box_ax = sns.boxplot(data=train, x="air_genre_name", y="visitors")
box_ax.set_ylim(0, 100)