In [1]:
from datetime import date, timedelta,datetime

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
df_train = pd.read_csv(
    'train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)


In [3]:
df_test = pd.read_csv(
    "test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

In [4]:
items = pd.read_csv(
    "items.csv",
).set_index("item_nbr")

In [23]:
df_2017 = df_train.loc[df_train.date>=pd.datetime(2017,1,1)]
del df_train

In [6]:
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train


In [7]:
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)


In [9]:
#祝日と平日での売り上げの違いからむをさくせい

In [8]:
items = items.reindex(df_2017.index.get_level_values(1))

In [9]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]


In [11]:
def get_nearwd(date,b_date):
    date_list = pd.date_range(date-timedelta(140),periods= 21,freq="7D").date
    result = date_list[date_list <= b_date][-1]
    return result

In [32]:
t2017 = date(2017, 5, 31)

datetime.date(2017, 5, 31)

In [33]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values,
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values

    for i in [1,2,3,4,5,6,7,14,15]:#+14,15で5.10
        
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [34]:
t2017 = date(2017, 5, 31)

In [None]:
print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)


In [17]:
print("Training and predicting models...")
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

Training and predicting models...


In [17]:
#lgbmを回してimportanceを見て低いものを削除する
#holidayとpaydayで重みづけをする

In [45]:
cate_vars = []
dtrain = lgb.Dataset(
    X_train,label=y_train[:,1],
    categorical_feature=cate_vars,
    weight=pd.concat([items["perishable"]]*6)*0.25+1
)


In [46]:
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves = 31,
                        n_estimators=100)

In [47]:
gbm.fit(X_train, y_train[:,1],
        verbose=0)

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
       max_bin=255, max_depth=-1, min_child_samples=10, min_child_weight=5,
       min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
       objective='regression', random_state=0, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=50000,
       subsample_freq=1)

In [48]:
fti = gbm.feature_importances_

In [49]:
print("Feature Importances:")
for i,feat in enumerate(X_train.columns):
    print('\t{0:10s} : {1:>12.4f}'.format(feat, fti[i]))

Feature Importances:
	day_1_2017 :     204.0000
	mean_140_2017 :      39.0000
	mean_14_2017 :     131.0000
	mean_30_2017 :     103.0000
	mean_3_2017 :     181.0000
	mean_60_2017 :     129.0000
	mean_7_2017 :     201.0000
	promo_140_2017 :     103.0000
	promo_14_2017 :     222.0000
	promo_60_2017 :      89.0000
	mean_4_dow0_2017 :      51.0000
	mean_20_dow0_2017 :      41.0000
	mean_4_dow1_2017 :     116.0000
	mean_20_dow1_2017 :     215.0000
	mean_4_dow2_2017 :      67.0000
	mean_20_dow2_2017 :      90.0000
	mean_4_dow3_2017 :      48.0000
	mean_20_dow3_2017 :      37.0000
	mean_4_dow4_2017 :      64.0000
	mean_20_dow4_2017 :     148.0000
	mean_4_dow5_2017 :      49.0000
	mean_20_dow5_2017 :      30.0000
	mean_4_dow6_2017 :      49.0000
	mean_20_dow6_2017 :      53.0000
	promo_0    :      93.0000
	promo_1    :     220.0000
	promo_2    :      48.0000
	promo_3    :      26.0000
	promo_4    :      35.0000
	promo_5    :      14.0000
	promo_6    :      19.0000
	promo_7    :      26.0000
	pr

In [25]:
MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))


Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.328335	valid_1's l2: 0.324624
[200]	training's l2: 0.324725	valid_1's l2: 0.323579
[300]	training's l2: 0.322159	valid_1's l2: 0.32326
[400]	training's l2: 0.319869	valid_1's l2: 0.323172
[500]	training's l2: 0.317876	valid_1's l2: 0.323176
Early stopping, best iteration is:
[450]	training's l2: 0.31885	valid_1's l2: 0.323147
mean_14_2017: 1655794.82
mean_7_2017: 1474817.06
mean_3_2017: 125928.26
day_1_2017: 101520.17
mean_20_dow0_2017: 83068.13
mean_4_dow0_2017: 49264.82
mean_30_2017: 34367.88
mean_60_2017: 19344.35
promo_7: 14102.71
mean_4_dow5_2017: 6823.59
mean_20_dow4_2017: 6120.66
mean_140_2017: 5711.17
promo_1: 4925.96
promo_140_2017: 4337.28
promo_60_2017: 4290.65
mean_4_dow6_2017: 4063.05
mean_4_dow2_2017: 3838.38
mean_20_dow2_2017: 3467.35
mean_4_dow1_2017: 3232.66
promo_14_2017: 3222.69
mean_20_dow6_2017: 3103.28
mean_4_dow4_2017: 3028.64
mean_4_dow3_2017: 2857.09
mean_20_dow3_2017: 2852.01


Step 8
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.33459	valid_1's l2: 0.392004
[200]	training's l2: 0.330494	valid_1's l2: 0.391163
[300]	training's l2: 0.327613	valid_1's l2: 0.390735
[400]	training's l2: 0.325121	valid_1's l2: 0.390685
[500]	training's l2: 0.32301	valid_1's l2: 0.39054
mean_14_2017: 1213292.65
mean_30_2017: 1078613.01
mean_7_2017: 626277.87
promo_7: 170998.62
mean_20_dow0_2017: 151764.03
mean_60_2017: 77186.31
mean_4_dow0_2017: 64353.20
promo_14_2017: 24668.67
mean_3_2017: 20195.61
promo_14: 18733.53
day_1_2017: 17725.20
promo_140_2017: 11370.00
promo_60_2017: 10986.75
promo_3: 8566.04
mean_20_dow2_2017: 8381.58
mean_20_dow4_2017: 5900.98
mean_4_dow5_2017: 5804.80
promo_5: 5602.46
mean_140_2017: 4687.60
promo_6: 4536.02
mean_20_dow1_2017: 3893.91
mean_4_dow6_2017: 3841.96
promo_15: 3735.07
mean_4_dow1_2017: 3402.73
mean_4_dow2_2017: 3351.95
mean_20_dow3_2017: 3191.55
mean_20_dow6_2017: 3115.86
mean_20_dow5_2017: 3040.00
mean_

Step 15
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.349724	valid_1's l2: 0.350793
[200]	training's l2: 0.345585	valid_1's l2: 0.349966
Early stopping, best iteration is:
[239]	training's l2: 0.344446	valid_1's l2: 0.349809
mean_30_2017: 1429769.90
mean_14_2017: 748132.48
mean_7_2017: 451512.79
mean_20_dow0_2017: 242070.78
promo_14: 207809.98
mean_60_2017: 147143.54
mean_4_dow0_2017: 52889.14
promo_14_2017: 22536.18
promo_7: 20139.70
day_1_2017: 14551.68
promo_60_2017: 12216.99
mean_20_dow2_2017: 12014.22
mean_3_2017: 10991.33
promo_140_2017: 9456.85
mean_20_dow4_2017: 5544.21
promo_15: 4662.91
mean_4_dow2_2017: 4358.08
mean_140_2017: 4147.26
mean_20_dow1_2017: 3531.21
promo_2: 2589.88
mean_4_dow5_2017: 2580.30
mean_20_dow6_2017: 2284.31
mean_20_dow3_2017: 2018.60
promo_6: 2013.95
mean_4_dow1_2017: 2004.16
mean_4_dow6_2017: 1962.10
mean_20_dow5_2017: 1752.49
mean_4_dow4_2017: 1686.20
mean_4_dow3_2017: 1521.42
promo_4: 1004.02
promo_3: 929.11
prom

In [85]:

y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")


In [87]:
df_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1
1,96995,2017-08-16,0.181577
1,96995,2017-08-17,0.186332
1,96995,2017-08-18,0.218165
1,96995,2017-08-19,0.240331
1,96995,2017-08-20,0.237360
1,96995,2017-08-21,0.213629
1,96995,2017-08-22,0.201206
1,96995,2017-08-23,0.179157
1,96995,2017-08-24,0.191265
1,96995,2017-08-25,0.239770


In [27]:

df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb4.csv', float_format='%.4f', index=None)

In [35]:
holiday = pd.read_csv("holiday_substrace.csv")
del(holiday["Unnamed: 0"])
holiday["diffrence"] = holiday["unit_sales"]
del(holiday["unit_sales"])
holiday["diffrence"] = holiday["diffrence"].apply(lambda x : -x)

In [42]:
pred = df_preds.reset_index() 

In [48]:
pred = pd.merge(pred,holiday,how="left",on=["store_nbr","item_nbr"])

In [52]:
pred.columns

Index(['store_nbr', 'item_nbr', 'date', 'unit_sales', 'diffrence'], dtype='object')

In [54]:
pred[pred.date == datetime(2017,8,24)]["unit_sales"] += pred[pred.date == datetime(2017,8,24)]["diffrence"] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [56]:
pred["unit_sales"] = pred["unit_sales"].apply(lambda x:x if 0<x else 0)

In [58]:
del(pred["diffrence"])

In [88]:
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
df_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,96995,2017-08-16,0.181577
1,96995,2017-08-17,0.186332
1,96995,2017-08-18,0.218165
1,96995,2017-08-19,0.240331
1,96995,2017-08-20,0.237360
1,96995,2017-08-21,0.213629
1,96995,2017-08-22,0.201206
1,96995,2017-08-23,0.179157
1,96995,2017-08-24,0.191265
1,96995,2017-08-25,0.239770


In [59]:
pred.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb8.csv', float_format='%.4f', index=None)

ValueError: Length of new names must be 1, got 3

In [98]:
del(df_test["onpromotion"])

In [101]:
df_test = df_test.reset_index()

In [113]:
sub

Unnamed: 0,unit_sales,id
0,0.181577,125497040.0
1,0.186332,125707694.0
2,0.218165,125918348.0
3,0.240331,126129002.0
4,0.237360,126339656.0
5,0.213629,126550310.0
6,0.201206,126760964.0
7,0.179157,126971618.0
8,0.191265,127182272.0
9,0.239770,127392926.0


In [112]:
sub = pd.merge(pred,df_test,how="left",on=["store_nbr","item_nbr","date"])
del(sub["store_nbr"])
del(sub["item_nbr"])
del(sub["date"])

In [114]:
sub["unit_sales"] = np.clip(np.expm1(sub["unit_sales"]), 0, 1000)
sub.to_csv('lgb9.csv', float_format='%.4f', index=None)

PermissionError: [Errno 13] Permission denied

In [None]:
}