In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import cross_val_score, TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

import matplotlib.pyplot as plt

# Relative directory prefix prepended to every input CSV file name below.
DATA = 'data/'

In [3]:
# Load the training sales records and the rows that predictions are made for.
train_df, test_df = (
    pd.read_csv(DATA + file_name)
    for file_name in ('sales_train.csv.gz', 'test.csv.gz')
)

In [6]:
# Load the item, item-category and shop lookup tables.
items_df, item_categories_df, shops_df = (
    pd.read_csv(DATA + file_name)
    for file_name in ('items.csv', 'item_categories.csv', 'shops.csv')
)

In [5]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [7]:
# Preview the three lookup tables. display() is called explicitly because,
# by default, a cell only echoes its final expression, which would hide the
# first two previews.
display(items_df.head())
display(item_categories_df.head())
display(shops_df.head())

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [65]:
# Feature columns selected for the model. The variant that also includes
# 'item_price' is kept on the next line for reference but is disabled.
# columns = ['shop_id', 'item_id', 'item_price', 'item_category_id']
columns = ['shop_id', 'item_id', 'item_category_id']

In [66]:
# Left-join the item table onto every training row (bringing in the item's
# columns such as item_category_id), then keep only the feature columns.
X_train_df = pd.merge(train_df, items_df, how="left", on='item_id')[columns]

In [67]:
# Preview the assembled feature frame.
X_train_df.head()

Unnamed: 0,shop_id,item_id,item_category_id
0,59,22154,37
1,25,2552,58
2,25,2552,58
3,25,2554,58
4,25,2555,56


In [68]:
# 5-fold ordered splitter: every fold validates on rows positioned after its
# training rows.
# NOTE(review): this is only a time-based split if the rows are in
# chronological order - confirm.
cv = TimeSeriesSplit(n_splits=5)

In [69]:
# Keep only rows whose item_cnt_day lies in [0, 20] (both bounds inclusive).
# One shared mask is applied to features and target so they stay row-aligned.
in_range = train_df.item_cnt_day.between(0, 20)
X = X_train_df[in_range].values
y = train_df.loc[in_range, 'item_cnt_day'].values

In [99]:
def cross_val_score_lgb(X, y, cv, params, categorical_feature,
                        early_stopping_rounds=30, verbose_eval=10):
    """Cross-validate a LightGBM model and return the RMSE of every fold.

    For each (train, validation) index pair produced by ``cv.split(X, y)`` a
    booster is trained with ``lgb.train`` on the training rows, using the
    validation rows as the evaluation set, and is then scored on those same
    validation rows. Each fold's score is also printed.

    Parameters
    ----------
    X, y : indexable with integer index arrays (e.g. NumPy arrays)
        Feature matrix and target vector; they are sliced as ``X[idx]`` and
        ``y[idx]``.
    cv : object exposing ``split(X, y)``
        Yields ``(train_idx, valid_idx)`` pairs.
    params : dict
        Passed unchanged to ``lgb.train``.
    categorical_feature : list
        Passed unchanged to ``lgb.train`` as ``categorical_feature``.
    early_stopping_rounds : int, default 30
        Passed to ``lgb.train`` as ``early_stopping_rounds``.
    verbose_eval : int or bool, default 10
        Passed to ``lgb.train`` as ``verbose_eval``.

    Returns
    -------
    list of float
        One RMSE value per fold, in the order the folds were produced.
    """
    scores = []

    for train_idx, valid_idx in cv.split(X, y):

        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        # The validation set is built against the training set via `reference`.
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        gbm = lgb.train(
            params,
            lgb_train,
            valid_sets=lgb_eval,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
            categorical_feature=categorical_feature)

        # Score with the iteration count selected by early stopping.
        y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
        score = mean_squared_error(y_valid, y_pred)**0.5
        scores.append(score)
        print("RMSE score:", score)

    return scores

In [85]:
# LightGBM settings for the baseline run: 'gbdt' boosting with the 'rmse'
# objective.
params = {
    'boosting_type': 'gbdt',
    'objective': 'rmse',
    'verbose': 10
}
# Column positions 0-2 of X are declared categorical.
# NOTE(review): assumes X holds exactly the three columns listed in `columns`.
scores = cross_val_score_lgb(X, y, cv, params, [0, 1, 2])

New categorical_feature is [0, 1, 2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.900844
[20]	valid_0's rmse: 0.901138
[30]	valid_0's rmse: 0.904578
[40]	valid_0's rmse: 0.907524
Early stopping, best iteration is:
[15]	valid_0's rmse: 0.899118
RMSE score: 0.8991435187579074


New categorical_feature is [0, 1, 2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.892401
[20]	valid_0's rmse: 0.941754
[30]	valid_0's rmse: 0.98391
Early stopping, best iteration is:
[7]	valid_0's rmse: 0.884064
RMSE score: 0.8840526855367768


New categorical_feature is [0, 1, 2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.797692
[20]	valid_0's rmse: 0.830605
[30]	valid_0's rmse: 0.858965
Early stopping, best iteration is:
[8]	valid_0's rmse: 0.794545
RMSE score: 0.7767708807387204


New categorical_feature is [0, 1, 2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.954302
[20]	valid_0's rmse: 0.95146
[30]	valid_0's rmse: 0.955493
[40]	valid_0's rmse: 0.958034
Early stopping, best iteration is:
[14]	valid_0's rmse: 0.950631
RMSE score: 0.9506310842309332


New categorical_feature is [0, 1, 2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.853295
[20]	valid_0's rmse: 0.862271
[30]	valid_0's rmse: 0.872565
[40]	valid_0's rmse: 0.877711
Early stopping, best iteration is:
[11]	valid_0's rmse: 0.852643
RMSE score: 0.8505077947947103


In [101]:
# Show the per-fold RMSE list held in `scores`.
scores

[0.9105078055257387,
 0.8875610391910044,
 0.7622966866334614,
 0.940692447985174,
 0.8478149495025223]

In [64]:
# Show the per-fold RMSE list held in `scores`.
# NOTE(review): this cell's execution count (64) is lower than that of the
# cells around it, so its output presumably comes from an earlier run - confirm.
scores

[0.8974239382463092,
 0.8801299760857227,
 0.7718775002158844,
 0.9419799457872929,
 0.8462634747649465]

In [105]:
# Attach the target to the feature frame (mutates X_train_df in place).
# NOTE(review): `y` was built from rows filtered with between(0, 20), while
# X_train_df is built from all rows of train_df. On a top-to-bottom run the
# lengths therefore differ and this assignment fails - confirm the intended
# row alignment.
X_train_df['target'] = y

In [None]:
# NOTE(review): unfinished, never-executed cell (In [None]). No grouping keys
# are given, so pandas rejects this call as written; fill in the keys or
# delete the cell.
X_train_df.groupby([])

In [98]:
# Rebuild the feature matrix from every column currently in X_train_df.
# NOTE(review): unlike the earlier construction of X, no between(0, 20) row
# filter is applied here, and if the 'target' column has been added to
# X_train_df it becomes a model input (target leakage) - confirm.
X = X_train_df.values

In [100]:
# Repeat the cross-validation with column positions 0-3 of X declared
# categorical.
# NOTE(review): requires X to have a fourth column; which column that is
# depends on how X was last built - confirm.
scores = cross_val_score_lgb(X, y, cv, params, [0, 1, 2, 3])

New categorical_feature is [0, 1, 2, 3]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.912067
[20]	valid_0's rmse: 0.914769
[30]	valid_0's rmse: 0.92084
[40]	valid_0's rmse: 0.922985
Early stopping, best iteration is:
[13]	valid_0's rmse: 0.910512
RMSE score: 0.9105078055257387


New categorical_feature is [0, 1, 2, 3]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.897394
[20]	valid_0's rmse: 0.957531
[30]	valid_0's rmse: 1.0003
Early stopping, best iteration is:
[6]	valid_0's rmse: 0.887561
RMSE score: 0.8875610391910044


New categorical_feature is [0, 1, 2, 3]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.767237
[20]	valid_0's rmse: 0.777778
[30]	valid_0's rmse: 0.799185
[40]	valid_0's rmse: 0.809928
Early stopping, best iteration is:
[12]	valid_0's rmse: 0.76603
RMSE score: 0.7622966866334614


New categorical_feature is [0, 1, 2, 3]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.95319
[20]	valid_0's rmse: 0.941134
[30]	valid_0's rmse: 0.941475
[40]	valid_0's rmse: 0.94259
[50]	valid_0's rmse: 0.943327
Early stopping, best iteration is:
[23]	valid_0's rmse: 0.940694
RMSE score: 0.940692447985174


New categorical_feature is [0, 1, 2, 3]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.848614
[20]	valid_0's rmse: 0.856032
[30]	valid_0's rmse: 0.86598
[40]	valid_0's rmse: 0.870701
Early stopping, best iteration is:
[11]	valid_0's rmse: 0.848241
RMSE score: 0.8478149495025223


In [89]:
# Build the test feature matrix the same way as the training one: left-join
# the item table, keep the feature columns, convert to an array.
X_test = pd.merge(test_df, items_df, how='left', on='item_id')[columns].values

In [93]:
# Hold out the last 20% of rows for validation; shuffle=False keeps row order,
# so the validation rows are the trailing ones.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=False, test_size=0.2)

In [94]:
# Train the booster used for the submission on the hold-out split.
lgb_train = lgb.Dataset(X_train, y_train)

# The validation set is built against the training set via `reference`.
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Training stops once the validation metric has not improved for 30 rounds;
# progress is logged every 10 rounds. Column positions 0-2 are categorical.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    early_stopping_rounds=30,
    verbose_eval=10,
    categorical_feature=[0, 1, 2])

New categorical_feature is [0, 1, 2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
[10]	valid_0's rmse: 0.837778
[20]	valid_0's rmse: 0.840879
[30]	valid_0's rmse: 0.849376
[40]	valid_0's rmse: 0.853351
Early stopping, best iteration is:
[13]	valid_0's rmse: 0.835996


In [90]:
# Predict the test rows with `gbm`, the booster trained in the preceding
# training cell, limited to its early-stopped best iteration.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

In [91]:
# Load the sample submission file; it is used as the template that the
# predictions are written into.
submissions_df = pd.read_csv(DATA + "sample_submission.csv.gz")

In [92]:
# Store the predictions in the submission's target column. Bracket assignment
# is used because attribute-style assignment only updates a column when that
# column already exists; otherwise it silently sets a plain attribute.
submissions_df['item_cnt_month'] = y_pred
# NOTE(review): assumes the 'submissions/' directory already exists.
submissions_df.to_csv('submissions/simple_submission.csv', index=False)

In [22]:
# Manual cross-validation loop: trains one LightGBM booster per fold and
# records its validation RMSE. Overwrites the notebook-level `scores` and
# `params`.
# NOTE(review): this repeats the logic of cross_val_score_lgb with different
# settings (categorical columns set on the Datasets, no early stopping).
scores = []

params = {
    'boosting_type': 'gbdt',
    'objective': 'rmse',
    'verbose': 0
}

for train_idx, valid_idx in cv.split(X, y):
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]
    
    # Column positions 0-3 are declared categorical, so X needs at least four
    # columns here.
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=[0, 1, 2, 3])
    lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature=[0, 1, 2, 3], reference=lgb_train)
    
    # No early stopping is requested for this training call.
    gbm = lgb.train(params, lgb_train, valid_sets=lgb_eval)
    
    # NOTE(review): without early stopping, best_iteration presumably does not
    # select a validated round - confirm what predict() uses in that case.
    y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
    score = mean_squared_error(y_valid, y_pred) ** 0.5
    scores.append(score)
    print("RMSE score:", score)



[1]	valid_0's rmse: 1.92588
[2]	valid_0's rmse: 1.91169
[3]	valid_0's rmse: 1.89949
[4]	valid_0's rmse: 1.88828
[5]	valid_0's rmse: 1.88027
[6]	valid_0's rmse: 1.87234
[7]	valid_0's rmse: 1.86647
[8]	valid_0's rmse: 1.86034
[9]	valid_0's rmse: 1.85729
[10]	valid_0's rmse: 1.85485
[11]	valid_0's rmse: 1.85374
[12]	valid_0's rmse: 1.85242
[13]	valid_0's rmse: 1.85094
[14]	valid_0's rmse: 1.85044
[15]	valid_0's rmse: 1.84969
[16]	valid_0's rmse: 1.84723
[17]	valid_0's rmse: 1.84672
[18]	valid_0's rmse: 1.8472
[19]	valid_0's rmse: 1.84581
[20]	valid_0's rmse: 1.84573
[21]	valid_0's rmse: 1.84498
[22]	valid_0's rmse: 1.84414
[23]	valid_0's rmse: 1.8452
[24]	valid_0's rmse: 1.84588
[25]	valid_0's rmse: 1.84711
[26]	valid_0's rmse: 1.84741
[27]	valid_0's rmse: 1.84744
[28]	valid_0's rmse: 1.84802
[29]	valid_0's rmse: 1.84911
[30]	valid_0's rmse: 1.84968
[31]	valid_0's rmse: 1.84992
[32]	valid_0's rmse: 1.85058
[33]	valid_0's rmse: 1.85034
[34]	valid_0's rmse: 1.85128
[35]	valid_0's rmse: 1.85



[1]	valid_0's rmse: 1.75264
[2]	valid_0's rmse: 1.71884
[3]	valid_0's rmse: 1.69298
[4]	valid_0's rmse: 1.67931
[5]	valid_0's rmse: 1.66756
[6]	valid_0's rmse: 1.6617
[7]	valid_0's rmse: 1.6599
[8]	valid_0's rmse: 1.65894
[9]	valid_0's rmse: 1.66795
[10]	valid_0's rmse: 1.67299
[11]	valid_0's rmse: 1.68211
[12]	valid_0's rmse: 1.69045
[13]	valid_0's rmse: 1.70845
[14]	valid_0's rmse: 1.71831
[15]	valid_0's rmse: 1.72683
[16]	valid_0's rmse: 1.73347
[17]	valid_0's rmse: 1.73981
[18]	valid_0's rmse: 1.74717
[19]	valid_0's rmse: 1.75296
[20]	valid_0's rmse: 1.7581
[21]	valid_0's rmse: 1.76574
[22]	valid_0's rmse: 1.77006
[23]	valid_0's rmse: 1.77507
[24]	valid_0's rmse: 1.78436
[25]	valid_0's rmse: 1.79178
[26]	valid_0's rmse: 1.79562
[27]	valid_0's rmse: 1.80219
[28]	valid_0's rmse: 1.80819
[29]	valid_0's rmse: 1.81493
[30]	valid_0's rmse: 1.82114
[31]	valid_0's rmse: 1.82669
[32]	valid_0's rmse: 1.8331
[33]	valid_0's rmse: 1.83852
[34]	valid_0's rmse: 1.84258
[35]	valid_0's rmse: 1.8468



[1]	valid_0's rmse: 2.08269
[2]	valid_0's rmse: 2.06243
[3]	valid_0's rmse: 2.04575
[4]	valid_0's rmse: 2.03633
[5]	valid_0's rmse: 2.03024
[6]	valid_0's rmse: 2.03339
[7]	valid_0's rmse: 2.03493
[8]	valid_0's rmse: 2.0383
[9]	valid_0's rmse: 2.04028
[10]	valid_0's rmse: 2.046
[11]	valid_0's rmse: 2.05421
[12]	valid_0's rmse: 2.05476
[13]	valid_0's rmse: 2.06386
[14]	valid_0's rmse: 2.07249
[15]	valid_0's rmse: 2.07516
[16]	valid_0's rmse: 2.08018
[17]	valid_0's rmse: 2.0837
[18]	valid_0's rmse: 2.08947
[19]	valid_0's rmse: 2.09468
[20]	valid_0's rmse: 2.0972
[21]	valid_0's rmse: 2.09978
[22]	valid_0's rmse: 2.10173
[23]	valid_0's rmse: 2.1082
[24]	valid_0's rmse: 2.11315
[25]	valid_0's rmse: 2.11801
[26]	valid_0's rmse: 2.12326
[27]	valid_0's rmse: 2.12606
[28]	valid_0's rmse: 2.12806
[29]	valid_0's rmse: 2.12945
[30]	valid_0's rmse: 2.12882
[31]	valid_0's rmse: 2.13272
[32]	valid_0's rmse: 2.13583
[33]	valid_0's rmse: 2.1376
[34]	valid_0's rmse: 2.1371
[35]	valid_0's rmse: 2.13977
[3



[1]	valid_0's rmse: 3.01807
[2]	valid_0's rmse: 2.99831
[3]	valid_0's rmse: 2.98426
[4]	valid_0's rmse: 2.9745
[5]	valid_0's rmse: 2.96635
[6]	valid_0's rmse: 2.95969
[7]	valid_0's rmse: 2.95533
[8]	valid_0's rmse: 2.95152
[9]	valid_0's rmse: 2.9498
[10]	valid_0's rmse: 2.95028
[11]	valid_0's rmse: 2.95296
[12]	valid_0's rmse: 2.95117
[13]	valid_0's rmse: 2.95208
[14]	valid_0's rmse: 2.95613
[15]	valid_0's rmse: 2.95809
[16]	valid_0's rmse: 2.95975
[17]	valid_0's rmse: 2.96142
[18]	valid_0's rmse: 2.9631
[19]	valid_0's rmse: 2.96332
[20]	valid_0's rmse: 2.9639
[21]	valid_0's rmse: 2.96748
[22]	valid_0's rmse: 2.97118
[23]	valid_0's rmse: 2.9731
[24]	valid_0's rmse: 2.97772
[25]	valid_0's rmse: 2.97878
[26]	valid_0's rmse: 2.97997
[27]	valid_0's rmse: 2.98503
[28]	valid_0's rmse: 2.98451
[29]	valid_0's rmse: 2.98745
[30]	valid_0's rmse: 2.98755
[31]	valid_0's rmse: 2.98754
[32]	valid_0's rmse: 2.98859
[33]	valid_0's rmse: 2.99076
[34]	valid_0's rmse: 2.99018
[35]	valid_0's rmse: 2.993
[



[1]	valid_0's rmse: 4.40818
[2]	valid_0's rmse: 4.39865
[3]	valid_0's rmse: 4.39184
[4]	valid_0's rmse: 4.38668
[5]	valid_0's rmse: 4.38317
[6]	valid_0's rmse: 4.38036
[7]	valid_0's rmse: 4.37908
[8]	valid_0's rmse: 4.37886
[9]	valid_0's rmse: 4.37888
[10]	valid_0's rmse: 4.37908
[11]	valid_0's rmse: 4.37964
[12]	valid_0's rmse: 4.3806
[13]	valid_0's rmse: 4.38098
[14]	valid_0's rmse: 4.38194
[15]	valid_0's rmse: 4.38261
[16]	valid_0's rmse: 4.3834
[17]	valid_0's rmse: 4.38409
[18]	valid_0's rmse: 4.3855
[19]	valid_0's rmse: 4.38702
[20]	valid_0's rmse: 4.38767
[21]	valid_0's rmse: 4.38918
[22]	valid_0's rmse: 4.39076
[23]	valid_0's rmse: 4.39335
[24]	valid_0's rmse: 4.39512
[25]	valid_0's rmse: 4.39727
[26]	valid_0's rmse: 4.3987
[27]	valid_0's rmse: 4.39985
[28]	valid_0's rmse: 4.39981
[29]	valid_0's rmse: 4.40075
[30]	valid_0's rmse: 4.40142
[31]	valid_0's rmse: 4.40327
[32]	valid_0's rmse: 4.40466
[33]	valid_0's rmse: 4.40504
[34]	valid_0's rmse: 4.40532
[35]	valid_0's rmse: 4.4066

In [23]:
# Show the per-fold RMSE list produced by the manual loop.
scores

[1.8709421884079547,
 1.9316374620435401,
 2.0961574973756365,
 3.0215313294871438,
 4.41560019966994]