In [1]:
import pandas as pd
import numpy as np

train_df = pd.read_csv("../sales_train.csv.gz")
test_df  = pd.read_csv("../test.csv.gz")

categories = pd.read_csv('../item_categories.csv')
items = pd.read_csv('../items.csv')

In [2]:
%store -r __prepare_data

In [3]:
%%time
__prepare_data

CPU times: user 1min 27s, sys: 41.5 s, total: 2min 9s
Wall time: 2min 9s


I saw the date filed has the month and day fields mixed up in the format for some entries, hence I reassign date using date_block_num

In [4]:
from calendar import monthrange

# January, 2013 is date_block_num==0
timeline = pd.DataFrame({'begin_date': pd.date_range(start='2013-01-01',end='2015-11-01', freq='MS'),
                         'date_block_num': range(35)} )
    
timeline['n_days'] = timeline.begin_date.apply(lambda date: monthrange(date.year,date.month)[1])

Let us for now work with zero-suppressed dataframe of monthly sales

In [5]:
monthly = train_df.groupby(["item_id","shop_id","date_block_num"]).item_cnt_day.sum()
monthly = monthly.reset_index()
monthly.columns = ["item_id","shop_id","date_block_num","target"]

In [6]:
%%time
from statsmodels.tsa.seasonal import seasonal_decompose

#import matplotlib.pyplot as plt
#%matplotlib inline

shop_monthly_sales = monthly.groupby(["shop_id","date_block_num"]).target.sum().reset_index("shop_id")

extrapolate_shop = pd.DataFrame()

for shop in shop_monthly_sales.shop_id.unique():

#    print(shop)

    sales = shop_monthly_sales[shop_monthly_sales.shop_id==shop]
    sales_ts = pd.merge(timeline[0:-1], sales, how='left', on=['date_block_num'])

    sales_ts = sales_ts.fillna(0)

    ts = pd.Series(sales_ts.target.values, index=sales_ts["begin_date"])
    decomposition = seasonal_decompose(ts, extrapolate_trend=1, freq=12)

    trend    = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid

#    plt.plot(trend, label='Trend')
    extrapolate_shop = pd.concat([extrapolate_shop,
                             pd.DataFrame({
                                 'date_block_num':  range(34),
                                 'shop_id':         [shop]*34,
                                 'shop_trend':      trend.values,
                                 'shop_seasonal':   seasonal.values,
                                 'shop_forecast':   trend.values + seasonal.values
                             })])

prev_month = extrapolate_shop[extrapolate_shop.date_block_num==33][['shop_id','shop_trend']]
prev_year  = extrapolate_shop[extrapolate_shop.date_block_num==(34-12)][['shop_id','shop_seasonal']]

month_34 = pd.merge(prev_month, prev_year, how='inner', on=['shop_id'])
month_34['date_block_num'] = 34
month_34['shop_forecast'] = month_34['shop_trend'] + month_34['shop_seasonal']

extrapolate_shop = pd.concat([extrapolate_shop,month_34])

CPU times: user 588 ms, sys: 96.9 ms, total: 685 ms
Wall time: 701 ms


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [7]:
all_data = pd.merge(all_data, extrapolate_shop, how='left', on=['date_block_num','shop_id'])

In [8]:
train = all_data[(all_data.date_block_num>12)&(all_data.date_block_num<34)].fillna(0)

predictors += ['shop_trend', 'shop_seasonal', 'shop_forecast'] 

X_train = train[train.date_block_num <  33][predictors]
X_valid = train[train.date_block_num == 33][predictors]
y_train = train[train.date_block_num <  33].target
y_valid = train[train.date_block_num == 33].target

In [9]:
%%time
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

model1 = RandomForestRegressor(n_estimators=64, max_features=3, n_jobs=4)
model1.fit(X_train, y_train)
ypred = model1.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, ypred))
print("RMSE: %f" % (rmse))

RMSE: 0.963735
CPU times: user 31min 17s, sys: 28.1 s, total: 31min 45s
Wall time: 9min


In [10]:
from tabulate import tabulate
headers = ["name", "score"]
values = sorted(zip(X_valid.columns, model1.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt="plain"))

name                                              score
total_monthly_item_sales_lag_1                0.391623
total_monthly_shop_category_sales_lag_1       0.0849974
total_monthly_shop_subcategory_sales_lag_1    0.0710883
target_lag_1                                  0.0638946
total_monthly_category_sales_lag_1            0.0418803
target_lag_2                                  0.0413936
total_monthly_shop_supercategory_sales_lag_1  0.0380284
shop_forecast                                 0.0367884
total_monthly_subcategory_sales_lag_1         0.0330769
shop_trend                                    0.0329517
total_monthly_shop_sales_lag_1                0.0289756
target_lag_3                                  0.0256922
shop_seasonal                                 0.0252852
total_monthly_supercategory_sales_lag_1       0.0247985
target_lag_6                                  0.0244406
target_lag_12                                 0.0198182
total_monthly_sales_lag_1                     0.0

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor
full_model = RandomForestRegressor(n_estimators=64, max_features=3, n_jobs=6) #len(predictors)/2
full_model.fit(train[predictors], train.target)

In [None]:
month_34 = all_data[all_data.date_block_num == 34].fillna(0)
month_34['item_cnt_month'] = full_model.predict(month_34[predictors])
test = pd.merge(test_df,month_34, on=["shop_id","item_id"]).fillna(0)

test = test.loc[:,['ID', 'item_cnt_month']]
test.set_index("ID", inplace=True)
test["item_cnt_month"] = test["item_cnt_month"].round().astype("int64")

test[test.item_cnt_month>20] = 20
test.to_csv("shop_trends.csv")

Now let us do the same for items

In [19]:
%%time
#import matplotlib.pyplot as plt
#%matplotlib inline

item_monthly_sales = monthly.groupby(["item_id","date_block_num"]).target.sum().reset_index("item_id")

extrapolate_item = pd.DataFrame()

for item in item_monthly_sales.item_id.unique():

#    print(item)
    
    sales = item_monthly_sales[item_monthly_sales.item_id==item]
    sales_ts = pd.merge(timeline[0:-1], sales, how='left', on=['date_block_num'])

    sales_ts = sales_ts.fillna(0)

    ts = pd.Series(sales_ts.target.values, index=sales_ts["begin_date"])
    decomposition = seasonal_decompose(ts, extrapolate_trend=1, freq=12)

    trend    = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid

#    plt.plot(trend, label='Trend')
    extrapolate_item = pd.concat([extrapolate_item,
                             pd.DataFrame({
                                 'date_block_num':  range(34),
                                 'item_id':         [item]*34,
                                 'item_trend':      trend.values,
                                 'item_seasonal':   seasonal.values,
                                 'item_forecast':   trend.values + seasonal.values
                             })])

prev_month = extrapolate_item[extrapolate_item.date_block_num==33][['item_id','item_trend']]
prev_year  = extrapolate_item[extrapolate_item.date_block_num==(34-12)][['item_id','item_seasonal']]

month_34 = pd.merge(prev_month, prev_year, how='inner', on=['item_id'])
month_34['date_block_num'] = 34
month_34['item_forecast'] = month_34['item_trend'] + month_34['item_seasonal']

extrapolate_item = pd.concat([extrapolate_item,month_34])
extrapolate_item.shape

CPU times: user 4min 25s, sys: 1min 35s, total: 6min
Wall time: 6min 1s


(741438, 5)

In [23]:
import gc
del month_34
gc.collect()

1410

In [26]:
all_data = pd.merge(all_data, extrapolate_item, how='left', on=['date_block_num','item_id'])

CPU times: user 4.76 s, sys: 7.94 s, total: 12.7 s
Wall time: 14 s


In [27]:
train = all_data[(all_data.date_block_num>12)&(all_data.date_block_num<34)].fillna(0)

predictors += ['item_trend', 'item_seasonal', 'item_forecast'] 

X_train = train[train.date_block_num <  33][predictors]
X_valid = train[train.date_block_num == 33][predictors]
y_train = train[train.date_block_num <  33].target
y_valid = train[train.date_block_num == 33].target

In [32]:
%%time
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
model2 = RandomForestRegressor(n_estimators=64, max_features=4, n_jobs=4)
model2.fit(X_train, y_train)
ypred = model2.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, ypred))
print("RMSE: %f" % (rmse))

RMSE: 0.941626
CPU times: user 9.15 s, sys: 9.35 s, total: 18.5 s
Wall time: 7.02 s


In [30]:
%%time
from sklearn.ensemble import RandomForestRegressor
full_model2 = RandomForestRegressor(n_estimators=64, max_features=4, n_jobs=6) #len(predictors)/2
full_model2.fit(train[predictors], train.target)

CPU times: user 53min 45s, sys: 29.1 s, total: 54min 15s
Wall time: 14min 14s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=4, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=6,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [31]:
month_34 = all_data[all_data.date_block_num == 34].fillna(0)
month_34['item_cnt_month'] = full_model2.predict(month_34[predictors])
test = pd.merge(test_df, month_34, on=["shop_id","item_id"]).fillna(0)

test = test.loc[:,['ID', 'item_cnt_month']]
test.set_index("ID", inplace=True)
test["item_cnt_month"] = test["item_cnt_month"].round().astype("int64")

test[test.item_cnt_month>20] = 20
test.to_csv("shop_item_trends.csv")