In [1]:
from datetime import date, timedelta
import calendar as ca
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

In [2]:
print('Loading Data')
df_train = pd.read_csv('train.csv',dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
    parse_dates=["date"], skiprows=range(1, 101688780) )
# read data starting from 2017-1-1

Loading Data


In [3]:
df_test = pd.read_csv("test.csv", dtype={'onpromotion': bool}, parse_dates=["date"]).set_index(['store_nbr', 'item_nbr', 'date'])

In [4]:
items = pd.read_csv("items.csv").set_index("item_nbr")

In [5]:
promo_2017_train = df_train.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack().fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [6]:
promo_2017_test = df_test[["onpromotion"]].unstack().fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [7]:
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [8]:
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

In [9]:
df_2017 = df_train.set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack().fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

In [10]:
items = items.reindex(df_2017.index.get_level_values(1))

In [12]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [13]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })

    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
        
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values 
        
    for i in range(1,15,1):
        X['day_{}_2017'.format(i)] = get_timespan(df_2017, t2017, i, 1).values.ravel()
        
    for i in range(2,15,1):
        X['mean_{}_2017'.format(i)] = get_timespan(df_2017, t2017, i, i).mean(axis=1).values
        
    X['mean_30_2017'.format(i)] = get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values
    X['mean_60_2017'.format(i)] = get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values
    X['mean_140_2017'.format(i)] = get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values

    if is_train:
        y = df_2017[pd.date_range(t2017, periods=16)].values
        return X, y
        
    return X

In [14]:
t2017 = date(2017, 7, 5)
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

In [21]:
val_pred = []
test_pred = []
model = [0] * 16

for i in range(16):
    ols = linear_model.LinearRegression(normalize=True)
    model[i] = ols.fit(X_train, y_train[:,i],sample_weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1)
    val_pred.append(model[i].predict(X_val))
    test_pred.append(model[i].predict(X_test))

In [22]:
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose())**0.5)

Validation mse: 0.612940416523


In [23]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(y_test, index=df_2017.index, columns=pd.date_range("2017-08-16", periods=16)
                       ).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [24]:
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 10000)
submission.shape

(3370464, 2)

In [25]:
submission.to_csv('linear_regression.csv', float_format='%.4f', index=None)