In [3]:
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings
from datetime import datetime

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
warnings.filterwarnings('ignore')


In [4]:
item_cat = pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv")
items=pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
train = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
shops = pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv")
test=pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")

In [5]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [6]:
items = items.merge(item_cat, on='item_category_id', how='left')
train=train.merge(shops, on="shop_id", how="left")
######  merged all files into train like this
train = train.merge(items, on='item_id', how='left')

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2935849 entries, 0 to 2935848
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   date                object 
 1   date_block_num      int64  
 2   shop_id             int64  
 3   item_id             int64  
 4   item_price          float64
 5   item_cnt_day        float64
 6   shop_name           object 
 7   item_name           object 
 8   item_category_id    int64  
 9   item_category_name  object 
dtypes: float64(2), int64(4), object(4)
memory usage: 246.4+ MB


In [8]:
for_merchant = train.groupby("shop_id").agg({"item_cnt_day":"sum"}).reset_index()
train = train.merge(for_merchant, on="shop_id",how="left")
train = train.rename(columns=lambda x: x.replace('item_cnt_day_y', 'total_sales_per_shop'))
train = train.rename(columns=lambda x: x.replace('item_cnt_day_x', 'item_cnt_day'))

In [9]:
train["total_paid"] = train["item_price"]*train["item_cnt_day"]
total_paids = train.groupby("shop_id").agg({"total_paid":"sum"}).reset_index()
train = train.merge(total_paids, on="shop_id",how="left")

In [10]:
train["date"] = pd.to_datetime(train["date"])
train.shape

(2935849, 13)

In [11]:
train["date"].min(), train["date"].max()

(Timestamp('2013-01-01 00:00:00'), Timestamp('2015-12-10 00:00:00'))

In [12]:
##delete shop_name, item_name, item_category_name
drop_list = ["shop_name", "item_name", "item_category_name"]
train.drop(drop_list, axis=1, inplace=True)

In [13]:
#for id in train.shop_id.unique():
    #plt.figure(figsize=(15, 15))
   # plt.subplot(3, 1, 1, title = str(id) + ' 2013-2014 total_sales_per_shop')
    #train[(train.shop_id == id) &( train.date >= "2013-01-01" )& (train.date < "2014-01-01")]["item_cnt_day"].plot()
    #plt.xlabel('')
    #plt.subplot(3, 1, 2,title = str(id) + ' 2015-2016 Transaction')
    #train[(train.shop_id == id) &( train.date >= "2014-01-01" )& (train.date < "2015-11-10")]["item_cnt_day"].plot()
    #plt.xlabel('')
    #plt.show()

def create_date_features(df, date_column):
    df['month'] = df[date_column].dt.month
    df['day_of_month'] = df[date_column].dt.day
    df['day_of_year'] = df[date_column].dt.dayofyear
    df['week_of_year'] = df[date_column].dt.weekofyear
    df['day_of_week'] = df[date_column].dt.dayofweek
    df['year'] = df[date_column].dt.year
    df["is_wknd"] = df[date_column].dt.weekday // 4
    df['is_month_start'] =df[date_column].dt.is_month_start.astype(int)
    df['is_month_end'] = df[date_column].dt.is_month_end.astype(int)
    df['quarter'] = df[date_column].dt.quarter
    df['is_quarter_start'] = df[date_column].dt.is_quarter_start.astype(int)
    df['is_quarter_end'] = df[date_column].dt.is_quarter_end.astype(int)
    df['is_year_start'] = df[date_column].dt.is_year_start.astype(int)
    df['is_year_end'] = df[date_column].dt.is_year_end.astype(int)
    return df

In [14]:
train = create_date_features(train, "date")

In [15]:
# Lag/Shifted Features


def random_noise(dataframe):
    return np.random.normal(scale=1.6, size=(len(dataframe),))

def lag_features(dataframe, lags):
    for lag in lags:
        dataframe['sales_lag_' + str(lag)] = dataframe.groupby(["shop_id"])['total_sales_per_shop'].transform(
            lambda x: x.shift(lag)) + random_noise(dataframe)
    return dataframe


train = lag_features(train, [30,60,91,92,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,
                       350,351,352,352,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,
                       538,539,540,541,542,
                       718,719,720,721,722])




In [16]:
# Rolling Mean Features


def roll_mean_features(dataframe, windows):
    for window in windows:
        dataframe['sales_roll_mean_' + str(window)] = dataframe.groupby("shop_id")['total_sales_per_shop']. \
                                                          transform(
            lambda x: x.shift(1).rolling(window=window, min_periods=10, win_type="triang").mean()) + random_noise(
            dataframe)
    return dataframe

train = roll_mean_features(train, [30,60,91,92,178,179,180,181,182,359,360,361,449,450,451,539,540,541,629,630,631,720])



In [17]:
# Exponentially Weighted Mean Features


def ewm_features(dataframe, alphas, lags):
    for alpha in alphas:
        for lag in lags:
            dataframe['sales_ewm_alpha_' + str(alpha).replace(".", "") + "_lag_" + str(lag)] = \
                dataframe.groupby("shop_id")['total_sales_per_shop'].transform(lambda x: x.shift(lag).ewm(alpha=alpha).mean())
    return dataframe

alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [30,60,91,92,178,179,180,181,182,359,360,361,449,450,451,539,540,541,629,630,631,720]

train = ewm_features(train, alphas, lags)

In [18]:
train["is_black_friday"] = 0
train.loc[train["date"].isin(["2013-11-22","2013-11-23","2014-11-29","2014-11-30"]) ,"is_black_friday"]=1

train["is_summer_solstice"] = 0
train.loc[train["date"].isin(["2013-06-19","2013-06-20","2013-06-21","2013-06-22",
                                    "2014-06-19","2014-06-20","2014-06-21","2014-06-22",]) ,"is_summer_solstice"]=1

In [19]:
train = pd.get_dummies(train, columns=['shop_id','day_of_week', 'month'])
train['total_sales_per_shop'] = np.log1p(train["total_sales_per_shop"].values)


In [20]:
def smape(preds, target):
    n = len(preds)
    masked_arr = ~((preds == 0) & (target == 0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    smape_val = (200 * np.sum(num / denom)) / n
    return smape_val

def lgbm_smape(preds, train_data):
    labels = train_data.get_label()
    smape_val = smape(np.expm1(preds), np.expm1(labels))
    return 'SMAPE', smape_val, False

In [21]:
import re
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# 2015'nin 10.ayına kadar train seti.
train_ = train.loc[(train["date"] < "2015-10-01"), :]

# 2020'nin son 3 ayı validasyon seti.
val_ = train.loc[(train["date"] >= "2015-10-01"), :]

cols = [col for col in train.columns if col not in ['date', 'id', "total_sales_per_shop","total_Paid", "year" ]]

Y_train = train_['total_sales_per_shop']
X_train = train_[cols]

Y_val = val_['total_sales_per_shop']
X_val = val_[cols]

In [22]:
lgb_params = {'metric': {'mae'},
              'num_leaves': 10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'num_boost_round': 1000,
              'early_stopping_rounds': 200,
              'nthread': -1}


lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

model = lgb.train(lgb_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  early_stopping_rounds=lgb_params['early_stopping_rounds'],
                  feval=lgbm_smape,
                  verbose_eval=100)

You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 200 rounds
[100]	training's l1: 0.0828169	training's SMAPE: 8.25976	valid_1's l1: 0.090701	valid_1's SMAPE: 8.84189
[200]	training's l1: 0.0117302	training's SMAPE: 1.17265	valid_1's l1: 0.0158717	valid_1's SMAPE: 1.53591
[300]	training's l1: 0.00225235	training's SMAPE: 0.225182	valid_1's l1: 0.00563644	valid_1's SMAPE: 0.534561
[400]	training's l1: 0.00115294	training's SMAPE: 0.115279	valid_1's l1: 0.00439322	valid_1's SMAPE: 0.414322
[500]	training's l1: 0.000995932	training's SMAPE: 0.099587	valid_1's l1: 0.00419387	valid_1's SMAPE: 0.395601
[600]	training's l1: 0.000946525	training's SMAPE: 0.0946486	valid_1's l1: 0.00413832	valid_1's SMAPE: 0.390343
[700]	training's l1: 0.00091599	training's SMAPE: 0.0915964	valid_1's l1: 0.00411668	valid_1's SMAPE: 0.388439
[800]	training's l1: 0.000887533	training's SMAPE: 0.0887513	valid_1's l1: 0.00409762	valid_1's SMAPE: 0.386549
[90