In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import tensorflow as tf

In [52]:
SALES_TRAIN_V2 = "data/sales_train_v2_fv2.csv"
SAMPLE_SUBMISSION = "data/sample_submission.csv"
TEST = "data/test_fv2.csv"
OUTPUT = "output"
FEATURES = [
    #'shop_id',
    #'item_id',
    'total_cat_cnt',
    'min_cat_cnt',
    'max_cat_cnt',
    'mean_cat_cnt',
    'std_cat_cnt',
    'min_cat_price',
    'max_cat_price',
    'mean_cat_price',
    'std_cat_price',
    'total_shop_cnt',
    'min_shop_cnt',
    'max_shop_cnt',
    'mean_shop_cnt',
    'std_shop_cnt',
    'min_shop_price',
    'max_shop_price',
    'mean_shop_price',
    'std_shop_price'
]

In [92]:
# Dev dataset
sales_train = pd.read_csv(SALES_TRAIN_V2)

# Test & sample
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
test = pd.read_csv(TEST)

In [7]:
def rmse(y_true, y_pred):
    '''
    Root mean squared error.
    Parameters
    ----------
    y_true: ndarray
        Ground truth
    y_pred: ndarray
        Array of predictions
    Returns
    -------
    rmsle: float
        Root mean squared error
    References
    ----------
    .. [1] https://www.kaggle.com/wiki/RootMeanSquaredError
    '''

    # Check shapes
    #y_true, y_pred = align_shape(y_true, y_pred)
    return np.sqrt(((y_true - y_pred)**2).mean())

In [8]:
sales_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,total_cat_cnt,min_cat_cnt,...,std_cat_price,total_shop_cnt,min_shop_cnt,max_shop_cnt,mean_shop_cnt,std_shop_cnt,min_shop_price,max_shop_price,mean_shop_price,std_shop_price
0,02.01.2013,0,59,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37,203284.0,-3.0,...,299.681484,46256.0,-1.0,100.0,1.166961,1.149310,0.10,32990.0,942.244232,1906.464127
1,03.01.2013,0,25,2552,899.00,1.0,DEEP PURPLE The House Of Blue Light LP,58,13631.0,-1.0,...,1051.678052,81734.0,-2.0,44.0,1.144077,0.752636,0.07,32990.0,818.345743,1711.236198
2,05.01.2013,0,25,2552,899.00,-1.0,DEEP PURPLE The House Of Blue Light LP,58,13631.0,-1.0,...,1051.678052,81734.0,-2.0,44.0,1.144077,0.752636,0.07,32990.0,818.345743,1711.236198
3,06.01.2013,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,13631.0,-1.0,...,1051.678052,81734.0,-2.0,44.0,1.144077,0.752636,0.07,32990.0,818.345743,1711.236198
4,15.01.2013,0,25,2555,1099.00,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,8281.0,-1.0,...,550.847877,78079.0,-2.0,58.0,1.122260,0.802511,0.10,32990.0,743.052339,1544.768269
5,10.01.2013,0,25,2564,349.00,1.0,DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,6611.0,-1.0,...,378.836804,48993.0,-2.0,80.0,1.163508,1.026188,0.10,32997.5,884.981227,1815.446620
6,02.01.2013,0,25,2565,549.00,1.0,DEEP PURPLE Stormbringer (фирм.),56,8281.0,-1.0,...,550.847877,78079.0,-2.0,58.0,1.122260,0.802511,0.10,32990.0,743.052339,1544.768269
7,04.01.2013,0,25,2572,239.00,1.0,DEFTONES Koi No Yokan,55,348591.0,-2.0,...,113.765602,63388.0,-1.0,637.0,1.823118,8.002811,4.00,27900.0,607.893809,833.904686
8,11.01.2013,0,25,2572,299.00,1.0,DEFTONES Koi No Yokan,55,348591.0,-2.0,...,113.765602,63388.0,-1.0,637.0,1.823118,8.002811,4.00,27900.0,607.893809,833.904686
9,03.01.2013,0,25,2573,299.00,3.0,DEL REY LANA Born To Die,55,348591.0,-2.0,...,113.765602,63388.0,-1.0,637.0,1.823118,8.002811,4.00,27900.0,607.893809,833.904686


In [9]:
# Generate cross validation split
# Here we generate random split and we don't care about temporal order
from sklearn.model_selection import KFold

In [10]:
# Training dataset
# def get_random_split():
#     '''Returns a random split of sales_train_v2 dataset'''
#     X = sales_train[['shop_id', 'item_id']].values
#     y = sales_train['item_cnt_day'].values
#     return train_test_split(X, y, test_size=.2)

In [93]:
kfold = KFold(n_splits=5, shuffle=True)

folds = {}
fold_num = 1

for train_ids, test_ids in kfold.split(range(len(sales_train))):
    train_ids, val_ids = train_test_split(train_ids, test_size=.2)
    folds[fold_num] = {"train": list(train_ids),
                      "test": list(test_ids),
                      "val": list(val_ids)}
    fold_num += 1

In [38]:
import json

with open("splits/unordered_folds.json", "w") as f:
    json.dump(folds, f)

In [109]:
# Normalizing features
normalized_sales_train=(sales_train[FEATURES]-sales_train[FEATURES].mean())/sales_train[FEATURES].std()
X_sales_train = normalized_sales_train.values
y_sales_train = sales_train['item_cnt_day'].values

## Experiment

In [110]:
# def get_training_dataset(idx, sales_train):
#     X = sales_train[FEATURES].loc[idx].values
#     y = sales_train['item_cnt_day'].loc[idx].values
#     return X, y

# def get_training_dataset(idx, sales_train):
#     return sales_train

In [111]:
X_check, y_check = get_training_dataset(range(10), sales_train)

In [112]:
X_check.shape

(10, 18)

### TensorFlow model

In [123]:
X  = tf.placeholder(tf.float32, [None, X_check.shape[1]])
y = tf.placeholder(tf.float32, [None, ])

In [124]:
with tf.device("/gpu:0"):
    net = tf.layers.dense(X, 30, activation=tf.nn.relu)
    net = tf.layers.dense(net, 10, activation=tf.nn.relu)
    net = tf.layers.dense(net, 1, activation=tf.identity)
    net = tf.clip_by_value(net, -20, 10)

In [125]:
# Loss function
loss = tf.reduce_mean(tf.sqrt(tf.reduce_mean((y - net)**2)))

In [126]:
opt = tf.train.AdamOptimizer()
opt_op = opt.minimize(loss)

In [127]:
init = tf.global_variables_initializer()

In [128]:
# Dry run
# with tf.Session() as sess:
#     sess.run(init) 
#     y_pred = sess.run(net, feed_dict = {X: X_check})
    

In [129]:
import tqdm
def batch_iterator(idx, batch_size):
    ln = len(idx)
    for i in range(0, ln, batch_size):
        yield idx[i:i + batch_size]

In [130]:
import time

In [131]:
N_EPOCHS = 25
BATCH_SIZE = 32

In [132]:
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run(init) 
    
    fold_results = []
    
    for fold in folds:
       
        train_idx = folds[fold]['train']
        val_idx = folds[fold]['val']
        test_idx = folds[fold]['test']
        
        print 
        print "Fold: ", fold, len(train_idx), len(val_idx), len(test_idx)
        train_idx = train_idx[:10000]
        
        
        train_loss_history = []
        val_loss_history = []

        for n in range(N_EPOCHS):
            start = time.time()
            train_batch_losses = []  
            #val_batch_losses = []
            
            # TRAINING
            for train_batch_idx in batch_iterator(train_idx, BATCH_SIZE):
                X_train, y_train = X_sales_train[train_batch_idx], y_sales_train[train_batch_idx]
                #print X_train.shape
                #print X_train.shape, y_train.shape
                y_pred, train_loss_value, _ = sess.run([net, loss, opt_op], feed_dict = {X: X_train, y: y_train})
                train_batch_losses.append(train_loss_value)
                
            # VALIDATION
            X_test, y_test = get_training_dataset(train_batch_idx, sales_train)
            y_pred, val_epoch_loss, = sess.run([net, loss], feed_dict = {X: X_test, y: y_test})
            #val_batch_losses.append(test_loss_value)
                
            end = time.time()  
            
            train_epoch_loss = np.mean(train_batch_losses)
            #val_epoch_loss = np.mean(val_batch_losses)
            
            train_loss_history.append(train_epoch_loss)
            val_loss_history.append(val_epoch_loss)
            
            print "Epoch: {} | {} s| train. loss: {} | val. loss: {}".format(n+1, end -  start, train_epoch_loss,
                                                                            val_epoch_loss)
        # TESTING
        X_test, y_test = get_training_dataset(train_batch_idx, sales_train)
        y_pred, test_loss_value, = sess.run([net, loss], feed_dict = {X: X_test, y: y_test})
  
        


Fold:  1 1878943 469736 587170
Epoch: 1 | 0.426625013351 s| train. loss: 1.1094379425 | val. loss: 3.29880213737
Epoch: 2 | 0.391461133957 s| train. loss: 0.874316036701 | val. loss: 3.29868650436
Epoch: 3 | 0.391869068146 s| train. loss: 0.874376773834 | val. loss: 3.29901504517
Epoch: 4 | 0.395262956619 s| train. loss: 0.874418199062 | val. loss: 3.29925489426
Epoch: 5 | 0.384027957916 s| train. loss: 0.874446570873 | val. loss: 3.29942512512
Epoch: 6 | 0.383312940598 s| train. loss: 0.874466478825 | val. loss: 3.29954648018
Epoch: 7 | 0.387673139572 s| train. loss: 0.874480605125 | val. loss: 3.29963231087
Epoch: 8 | 0.389658927917 s| train. loss: 0.874490737915 | val. loss: 3.29969453812
Epoch: 9 | 0.381841897964 s| train. loss: 0.874498069286 | val. loss: 3.29973840714
Epoch: 10 | 0.395797014236 s| train. loss: 0.874503314495 | val. loss: 3.29977011681
Epoch: 11 | 0.386109828949 s| train. loss: 0.874507009983 | val. loss: 3.29979276657
Epoch: 12 | 0.393099069595 s| train. loss: 0

## Kaggle submission

In [42]:
X = sales_train[['shop_id', 'item_id']].values
y = sales_train['item_cnt_day'].values

X, y = shuffle(X, y)

ctb = catboost.CatBoostRegressor(task_type='GPU', logging_level="Silent")
    
# Training
ctb.fit(X_test, y_test, cat_features=[0, 1])

<catboost.core._CatBoostBase at 0x7f45c24c5510>

In [45]:
test_ids = test['ID']
X_test = test[['shop_id', 'item_id']].values

In [46]:
res = ctb.predict(X_test)

In [58]:
submission = pd.DataFrame([test_ids, res]).T
submission.columns = ["ID", "item_cnt_month"]
submission.ID = submission.ID.astype(int)

In [62]:
submission.to_csv("baseline.csv", index=None)