In [1]:
import csv
import pickle
from datetime import datetime
from sklearn import preprocessing
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
import sys

def csv2dicts(csvfile):
    """Convert an iterable of CSV rows into a list of dictionaries.

    The first row is treated as the header: it is printed and its items
    become the dictionary keys. Every following row yields one dict that
    pairs the header names with that row's values. Pairs are built with
    ``zip``, so surplus items on either side are dropped.

    Args:
        csvfile: Iterable yielding rows as sequences (e.g. a ``csv.reader``).

    Returns:
        A list with one dict per data row; empty when there are no data rows.
    """
    rows = iter(csvfile)
    no_header = object()
    header = next(rows, no_header)
    if header is no_header:
        # Nothing to read at all: no header to print, no rows to convert.
        return []
    print(header)
    return [dict(zip(header, row)) for row in rows]


def set_nan_as_string(data, replace_str='0'):
    """Replace empty-string values with ``replace_str`` in every record.

    The records are modified in place and nothing is returned.

    Args:
        data: Indexable, mutable sequence of dicts (e.g. a list of rows).
        replace_str: Value written wherever a field equals ``''``.
    """
    for position, record in enumerate(data):
        # Collect the blank fields first, then overwrite them.
        blank_fields = [field for field, content in record.items() if content == '']
        for field in blank_fields:
            record[field] = replace_str
        # Store the (same) record back at its position, as the original did.
        data[position] = record

In [2]:
# Locations of the input CSV files. The paths are relative, so they are
# resolved against the directory the notebook is run from.
train_data_filename = "dataset/rossmann/train.csv"
store_data_filename = "dataset/rossmann/store.csv"
store_states_filename = 'dataset/rossmann/store_states.csv'

In [3]:
# Parse the training CSV into a list of dicts (one per row). The context
# manager closes the file as soon as parsing is finished; newline='' is the
# mode the csv module documents for files handed to csv.reader.
with open(train_data_filename, newline='') as csvfile:
    train_data = csv2dicts(csv.reader(csvfile, delimiter=','))

# Reverse the row order. NOTE(review): presumably the file is stored
# newest-first and this makes the records chronological -- confirm.
train_data = train_data[::-1]

print(train_data[:3])

['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
[{'Store': '1115', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1114', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1113', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}]


In [4]:
# Parse the store metadata and the store -> state mapping. Both files are
# closed as soon as they have been read.
with open(store_data_filename, newline='') as csvfile:
    store_data = csv2dicts(csv.reader(csvfile, delimiter=','))
with open(store_states_filename, newline='') as csvfile2:
    store_states_data = csv2dicts(csv.reader(csvfile2, delimiter=','))

# Blank store fields become the string '0'.
set_nan_as_string(store_data)

# Attach each store's state by matching on the 'Store' id rather than on
# row position, so a difference in row order between the two files cannot
# silently assign a store the wrong state. A store without a state entry
# raises KeyError instead of being mislabelled.
state_by_store = {row['Store']: row['State'] for row in store_states_data}
for val in store_data:
    val['State'] = state_by_store[val['Store']]

print(store_data[:2])

['Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
['Store', 'State']
[{'Store': '1', 'StoreType': 'c', 'Assortment': 'a', 'CompetitionDistance': '1270', 'CompetitionOpenSinceMonth': '9', 'CompetitionOpenSinceYear': '2008', 'Promo2': '0', 'Promo2SinceWeek': '0', 'Promo2SinceYear': '0', 'PromoInterval': '0', 'State': 'HE'}, {'Store': '2', 'StoreType': 'a', 'Assortment': 'a', 'CompetitionDistance': '570', 'CompetitionOpenSinceMonth': '11', 'CompetitionOpenSinceYear': '2007', 'Promo2': '1', 'Promo2SinceWeek': '13', 'Promo2SinceYear': '2010', 'PromoInterval': 'Jan,Apr,Jul,Oct', 'State': 'TH'}]


# Prepare features

In [5]:
# Seed Python's built-in `random` module; this does not seed NumPy's generator.
random.seed(42)

def feature_list(record):
    """Build the feature vector for one training record.

    Args:
        record: Dict of strings with the keys 'Date' (``YYYY-MM-DD``),
            'Store', 'DayOfWeek', 'Promo' and, optionally, 'Open'.

    Returns:
        A list ``[store_open, store_index, day_of_week, promo, year, month,
        day, state]``. The first seven items are ints; ``state`` is whatever
        the module-level ``store_data`` holds under 'State' for that store.

    Raises:
        ValueError: If 'Date', 'Store', 'DayOfWeek' or 'Promo' cannot be parsed.
        KeyError: If one of those required keys is missing.

    Note:
        The state is looked up positionally as ``store_data[store_index - 1]``,
        i.e. this assumes ``store_data`` is ordered by store id starting at 1
        -- TODO confirm against the store file.
    """
    dt = datetime.strptime(record['Date'], '%Y-%m-%d')
    store_index = int(record['Store'])
    day_of_week = int(record['DayOfWeek'])

    try:
        store_open = int(record['Open'])
    except (KeyError, ValueError, TypeError):
        # A missing, blank or non-numeric 'Open' flag is treated as "open".
        # Only these parsing failures are caught, so unrelated errors
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        store_open = 1

    promo = int(record['Promo'])

    return [store_open,
            store_index,
            day_of_week,
            promo,
            dt.year,
            dt.month,
            dt.day,
            store_data[store_index - 1]['State']
            ]

In [6]:
# Collect feature rows and sales targets, skipping records whose sales are
# '0' or whose 'Open' field is blank.
train_data_X, train_data_y = [], []

for record in train_data:
    if record['Sales'] == '0' or record['Open'] == '':
        continue
    fl = feature_list(record)
    train_data_X.append(fl)
    train_data_y.append(int(record['Sales']))

print("Number of train datapoints: ", len(train_data_y))
print(min(train_data_y), max(train_data_y))

Number of train datapoints:  844338
46 41551


In [7]:
# Turn the feature rows into an array and keep a second, independent copy
# (`full_X`) that later cells can read while `train_data_X` is overwritten.
train_data_X = np.array(train_data_X)
full_X = train_data_X.copy()
full_X.shape

(844338, 8)

In [8]:
# Fit one LabelEncoder per feature column and overwrite that column of
# train_data_X with its integer codes. The fitted encoders are collected in
# `les`, in column order.
les = []
for i in range(train_data_X.shape[1]):
    le = preprocessing.LabelEncoder()
    # Fit on full_X, the copy that this loop never modifies.
    le.fit(full_X[:, i])
    les.append(le)
    # NOTE(review): the codes are written back into the existing array, so
    # they are stored in that array's dtype until it is cast with astype(int).
    # Presumably that dtype is a string type because of the State column,
    # which would also make the encoder order values lexicographically
    # rather than numerically -- confirm.
    train_data_X[:, i] = le.transform(train_data_X[:, i])

In [9]:
# Cast the label-encoded features to integers and turn the targets into an array.
train_data_X = train_data_X.astype(int)
train_data_y = np.array(train_data_y)
# Artifacts available at this point: les and (train_data_X, train_data_y)

In [10]:
# Sanity check: first encoded feature row next to its sales target.
print(train_data_X[0], train_data_y[0])

[  0 109   1   0   0   0   0   7] 5961


In [11]:
# Seed NumPy's global random generator, which the np.random.* calls below use.
np.random.seed(123)

# NOTE(review): the reason for raising the recursion limit is not visible in
# this notebook -- presumably needed by code in the models module; confirm.
sys.setrecursionlimit(10000)

# Fraction of the records that goes into the training split (the rest is validation).
train_ratio = 0.9
# When True, rows are shuffled before the split.
shuffle_data = False
# When True, the features are one-hot encoded before the split.
one_hot_as_input = False
# When True, the features are replaced via embed_features() using the saved embeddings file.
embeddings_as_input = False
# When True, the embedding weights of the first fitted model are pickled after training.
save_embeddings = True
saved_embeddings_fname = "embeddings.pickle"  # set save_embeddings to True to create this file

(X, y) = train_data_X, train_data_y

num_records = len(X)
# Index at which the data is cut into training rows and validation rows.
train_size = int(train_ratio * num_records)

In [12]:
# Optionally shuffle the rows; features and targets use the same permutation.
if shuffle_data:
    print("Using shuffled data")
    # Use the `np` alias: the bare name `numpy` is not imported by this
    # notebook before this cell, so the previous `numpy.arange` raised
    # NameError whenever shuffle_data was enabled.
    sh = np.arange(X.shape[0])
    np.random.shuffle(sh)
    X = X[sh]
    y = y[sh]

# Optionally replace the integer codes by previously saved embeddings.
if embeddings_as_input:
    print("Using learned embeddings as input")
    # NOTE(review): embed_features is not defined in any earlier cell; it
    # presumably comes from the `models` module, which is only imported
    # further down -- move that import up before enabling this flag.
    X = embed_features(X, saved_embeddings_fname)

# Optionally one-hot encode every feature column (dense output).
if one_hot_as_input:
    print("Using one-hot encoding as input")
    # NOTE(review): newer scikit-learn releases renamed this keyword to
    # `sparse_output`; adjust if the environment is upgraded.
    enc = OneHotEncoder(sparse=False)
    enc.fit(X)
    X = enc.transform(X)

# Split by position: the first train_size rows train, the remainder validates.
X_train = X[:train_size]
X_val = X[train_size:]
y_train = y[:train_size]
y_val = y[train_size:]

print(X_train.shape, X_val.shape)

(759904, 8) (84434, 8)


In [13]:
def sample(X, y, n):
    """Draw ``n`` random rows from ``X`` together with the matching ``y`` entries.

    Row indices come from ``np.random.randint``, so they are drawn
    independently and the same row may be picked more than once.

    Args:
        X: 2-D array of feature rows.
        y: Array of targets aligned with the rows of ``X``.
        n: Number of rows to draw.

    Returns:
        Tuple ``(X_sampled, y_sampled)`` holding the selected rows and targets.
    """
    picked = np.random.randint(0, X.shape[0], size=n)
    return X[picked, :], y[picked]


# Replace the training split by 200000 rows drawn with sample().
# X_train / y_train are overwritten, so re-running this cell draws from the
# already-sampled data rather than from the original split.
X_train, y_train = sample(X_train, y_train, 200000)  # Simulate data sparsity
print("Number of samples used for training: " + str(y_train.shape[0]))

Number of samples used for training: 200000


In [23]:
# Display the sampled training targets.
y_train

array([ 5580,  2953,  6842, ...,  5025,  8875, 15263])

In [24]:
# Display the validation feature rows.
X_val

array([[  0,  89,   5, ...,   7,  11,   4],
       [  0,  88,   5, ...,   7,  11,   1],
       [  0,  87,   5, ...,   7,  11,   6],
       ...,
       [  0, 338,   4, ...,   9,  24,   6],
       [  0, 227,   4, ...,   9,  24,  11],
       [  0,   0,   4, ...,   9,  24,   4]])

In [15]:
from models import *

Using TensorFlow backend.


In [16]:
# Build five NN_with_EntityEmbedding instances from the same training and
# validation data and collect them in `models`.
# NOTE(review): constructing an instance presumably also trains it (the
# recorded output shows epoch logs per instance) -- confirm in the models module.
models = []

print("Fitting NN_with_EntityEmbedding...")
for i in range(5):
    models.append(NN_with_EntityEmbedding(X_train, y_train, X_val, y_val))

Fitting NN_with_EntityEmbedding...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.1002210694024205
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.09905058741363894
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.09908320907579495
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.10695848624747921
Train 

In [20]:
# print("Fitting NN...")
# for i in range(5):
#     models.append(NN(X_train, y_train, X_val, y_val))

# print("Fitting RF...")
# models.append(RF(X_train, y_train, X_val, y_val))

# print("Fitting KNN...")
# models.append(KNN(X_train, y_train, X_val, y_val))

# print("Fitting XGBoost...")
# models.append(XGBoost(X_train, y_train, X_val, y_val))


# Pickle the embedding matrices of the first fitted model.
if save_embeddings:
    model = models[0].model
    # Layers to export, in the order they are written to the pickle.
    embedding_layers = ('store_embedding', 'dow_embedding', 'year_embedding',
                        'month_embedding', 'day_embedding', 'state_embedding')
    embeddings = [model.get_layer(layer_name).get_weights()[0]
                  for layer_name in embedding_layers]
    # Keep the individual matrices available under their usual names.
    (store_embedding, dow_embedding, year_embedding,
     month_embedding, day_embedding, german_states_embedding) = embeddings
    with open(saved_embeddings_fname, 'wb') as f:
        pickle.dump(embeddings, f, pickle.HIGHEST_PROTOCOL)


def evaluate_models(models, X, y):
    """Mean absolute relative error of the averaged predictions of ``models``.

    Each model's ``guess(X)`` is evaluated, the predictions are averaged
    across models, and ``mean(|y - prediction| / y)`` is returned.

    Args:
        models: Non-empty iterable of objects exposing ``guess(X)``.
        X: Feature rows passed unchanged to every model's ``guess``.
        y: True targets; all must be strictly positive, since they are
            used as the denominator of the relative error.

    Returns:
        The mean absolute relative error as a float.

    Raises:
        ValueError: If ``models`` is empty.
        AssertionError: If any target is not strictly positive.
    """
    models = list(models)
    if not models:
        raise ValueError("evaluate_models needs at least one model")
    y = np.asarray(y)
    assert y.min() > 0, "targets must be strictly positive"
    # Uses the notebook's `np` alias; the bare name `numpy` is never
    # imported here and only resolved if a star import happened to leak it.
    guessed_sales = np.array([model.guess(X) for model in models])
    mean_sales = guessed_sales.mean(axis=0)
    relative_err = np.absolute((y - mean_sales) / y)
    result = np.sum(relative_err) / len(y)
    return result


# Score the averaged predictions of all fitted models, first on the
# (sub-sampled) training data and then on the held-out validation rows.
print("Evaluate combined models...")
print("Training error...")
r_train = evaluate_models(models, X_train, y_train)
print(r_train)

print("Validation error...")
r_val = evaluate_models(models, X_val, y_val)
print(r_val)

Evaluate combined models...
Training error...
0.0635817444220401
Validation error...
0.09371728533670369


In [21]:
# Start a fresh `models` list (the previously fitted models are no longer
# referenced by this name) and build five XGBoost instances from the same
# training and validation data.
models = []

print("Fitting XGBoost...")
for i in range(5):
    models.append(XGBoost(X_train, y_train, X_val, y_val))

Fitting NN_with_EntityEmbedding...
[0]	train-rmse:8.09749
[1]	train-rmse:7.9359
[2]	train-rmse:7.77765
[3]	train-rmse:7.62246
[4]	train-rmse:7.4705
[5]	train-rmse:7.3215
[6]	train-rmse:7.17552
[7]	train-rmse:7.03238
[8]	train-rmse:6.89226
[9]	train-rmse:6.75484
[10]	train-rmse:6.62021
[11]	train-rmse:6.48829
[12]	train-rmse:6.35892
[13]	train-rmse:6.23229
[14]	train-rmse:6.10812
[15]	train-rmse:5.98642
[16]	train-rmse:5.86726
[17]	train-rmse:5.75041
[18]	train-rmse:5.63594
[19]	train-rmse:5.52371
[20]	train-rmse:5.41381
[21]	train-rmse:5.30607
[22]	train-rmse:5.20053
[23]	train-rmse:5.09708
[24]	train-rmse:4.99571
[25]	train-rmse:4.89641
[26]	train-rmse:4.79906
[27]	train-rmse:4.70374
[28]	train-rmse:4.61034
[29]	train-rmse:4.51876
[30]	train-rmse:4.42905
[31]	train-rmse:4.34113
[32]	train-rmse:4.25497
[33]	train-rmse:4.17053
[34]	train-rmse:4.08782
[35]	train-rmse:4.00678
[36]	train-rmse:3.92732
[37]	train-rmse:3.84946
[38]	train-rmse:3.77316
[39]	train-rmse:3.69847
[40]	train-rmse:3.

KeyboardInterrupt: 