In [None]:
import pdb
import numpy as np
import gcp.bigquery as bq
import gcp.storage as storage
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.optimizers import SGD
import tensorflow as tf
import h5py
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
import time
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

try:
  import cPickle as pickle
except:
  import pickle
HDF_FILENAME = 'best_model.hdf5'
EST_FILENAME = 'final_estimator.pkl'

seed = 13
np.random.seed(seed)

# 1. Load Data

In [2]:
# Put all categorical data first for easier implementation of One Hot Encoding.
fields_str = """
gap	day_in_week	weather_1_slots_ago	weather_2_slots_ago	weather_3_slots_ago	busy_time	
tj_level1_1_slots_ago	tj_level2_1_slots_ago	tj_level3_1_slots_ago	tj_level4_1_slots_ago	
tj_level1_2_slots_ago	tj_level2_2_slots_ago	tj_level3_2_slots_ago	tj_level4_2_slots_ago	
tj_level1_3_slots_ago	tj_level2_3_slots_ago	tj_level3_3_slots_ago	tj_level4_3_slots_ago	
temperature_1_slots_ago	pm25_1_slots_ago	
temperature_2_slots_ago	pm25_2_slots_ago	
temperature_3_slots_ago	pm25_3_slots_ago	
gap_1_slots_ago	sum_price_1_slots_ago	
gap_2_slots_ago	sum_price_2_slots_ago	
gap_3_slots_ago	sum_price_3_slots_ago	
f1	f11	f11_1	f11_2	f11_3	f11_4	f11_5	f11_6	f11_7	
f11_8	f13_4	f13_8	f14	f14_1	f14_10	f14_2	f14_3	f14_6	f14_8	f15	f15_1	
f15_2	f15_3	f15_4	f15_6	f15_7	f15_8	f16	f16_1	f16_10	f16_11	f16_12	f16_3	
f16_4	f16_6	f17	f17_2	f17_3	f17_4	f17_5	f19	f19_1	f19_2	f19_3	f19_4	f1_1	
f1_10	f1_11	f1_2	f1_3	f1_4	f1_5	f1_6	f1_7	f1_8	f20	f20_1	f20_2	
f20_4	f20_5	f20_6	f20_7	f20_8	f20_9	f21_1	f21_2	f22	f22_1	f22_2	f22_3	
f22_4	f22_5	f23	f23_1	f23_2	f23_3	f23_4	f23_5	f23_6	f24	f24_1	f24_2	f24_3	
f25	f25_1	f25_3	f25_7	f25_8	f25_9	f2_1	f2_10	f2_11	f2_12	f2_13	f2_2	
f2_4	f2_5	f2_6	f2_7	f2_8	f3_1	f3_2	f3_3	f4	f4_1	f4_10	f4_11	
f4_13	f4_14	f4_16	f4_17	f4_18	f4_2	f4_3	f4_5	f4_6	f4_7	f4_8	f4_9	
f5	f5_1	f5_3	f5_4	f6	f6_1	f6_2	f6_4	f7	f8	f8_1	f8_2	f8_3	f8_4	
f8_5
"""
fields = map(lambda x: x.strip(), fields_str.split('\t'))
features = fields[1:]

# Use this instead of len(features) since this variable can change
# e.g. when one hot encoding is used and/or new features are added.
n_features = len(features)

In [3]:
%%sql --module q_all

SELECT *, HASH(CAST(district_id AS STRING) +timeslot) AS hash_value,
  IF(ABS(HASH(CAST(district_id AS STRING) + timeslot)) % 2 == 1, 'True', 'False')
    AS included_in_sample, IF(timeofday_slot >= 50 AND timeofday_slot <= 53, 1, 0) AS busy_time
FROM [datalab-projects-1331:xjk_algo_comp.gaps]
WHERE gap > 0

# The above query randomizes its outputs.

In [4]:
query = bq.Query(q_all)
tableresult = query.results()

all_data = np.zeros((tableresult.length, len(fields)))
print 'there are {} rows'.format(tableresult.length)
for rcounter, row in enumerate(tableresult):
  for fcounter, field in enumerate(fields):
    all_data[rcounter, fcounter] = row[field]
  if rcounter % 5000 == 0:
    print 'processed {} rows'.format(rcounter)
all_data_original = np.copy(all_data)

there are 102592 rows
processed 0 rows
processed 5000 rows
processed 10000 rows
processed 15000 rows
processed 20000 rows
processed 25000 rows
processed 30000 rows
processed 35000 rows
processed 40000 rows
processed 45000 rows
processed 50000 rows
processed 55000 rows
processed 60000 rows
processed 65000 rows
processed 70000 rows
processed 75000 rows
processed 80000 rows
processed 85000 rows
processed 90000 rows
processed 95000 rows
processed 100000 rows


# 2. Preprocess Data

Note that in the end we don't use data adjusted from Preprocessing steps, but rather including the preprocessing instances in Pipeline to be applied to original data.

## 2.1. Impute NaN and inf values

In [5]:
all_data = np.copy(all_data_original)

In [6]:
from sklearn.preprocessing import Imputer
# Useful code to check NaN and Inf values. This is needed since these values would
# cause "Input contains NaN, infinity or a value too large for dtype('float32')
# errors when left unchecked.
print "Checkinf for NaN and Inf"
print "np.nan=", np.where(np.isnan(all_data))
print "is.inf=", np.where(np.isinf(all_data))
print "np.max=", np.max(abs(all_data))

# Impute
imputer = Imputer()
imputer.fit(all_data)
print 'Shape of old data:'
print all_data.shape
print 'Stats (NaN values are replaced with following means for each feature):'
print imputer.statistics_
all_data = imputer.fit_transform(all_data)
print 'Shape of new data:'
print all_data.shape

# See that NaN and Inf values replaced
print "Checkinf for NaN and Inf"
print "np.nan=", np.where(np.isnan(all_data))
print "is.inf=", np.where(np.isinf(all_data))
print "np.max=", np.max(abs(all_data))

Checkinf for NaN and Inf
np.nan= (array([     5,      5,     19, ..., 102591, 102591, 102591]), array([24, 25, 24, ..., 21, 22, 23]))
is.inf= (array([], dtype=int64), array([], dtype=int64))
np.max= nan
Shape of old data:
(102592, 164)
Stats (NaN values are replaced with following means for each feature):
[  1.47531386e+01   3.01818855e+00   3.10143084e+00   3.09452721e+00
   3.09335324e+00   3.55680755e-02   9.25288152e+02   2.11476701e+02
   6.39290514e+01   4.27323248e+01   9.24878845e+02   2.11314586e+02
   6.38820215e+01   4.27167737e+01   9.24081034e+02   2.11042060e+02
   6.38010039e+01   4.26568256e+01   6.73477959e+00   1.22643715e+02
   6.72686319e+00   1.22560244e+02   6.72055706e+00   1.22575663e+02
   1.49658681e+01   1.38741400e+03   1.49710991e+01   1.39304667e+03
   1.49745991e+01   1.39496361e+03   1.63135306e+03   7.13633744e+03
   1.28722165e+03   3.13526206e+03   3.60085879e+03   2.13496506e+04
   1.04434925e+03   7.91680944e+03   1.61694424e+03   1.98460352e+04
   

## 2.2. One Hot Encoding

In [7]:
one_hot = OneHotEncoder(categorical_features=[1, 2, 3, 4], n_values='auto')
one_hot.fit(all_data)
print "n_values_:"
print one_hot.n_values_
print "feature_indices_:"
print one_hot.feature_indices_
all_data = one_hot.fit_transform(all_data).todense()
n_features = all_data.shape[1] - 1
print 'new number of features: {}'.format(n_features)

n_values_:
[ 7 10 10 10]
feature_indices_:
[ 0  7 17 27 37]
new number of features: 191


## 2.3. Feature Scaling

In [8]:
scaler = StandardScaler()
print 'old data:'
print 'means: {}, min: {}, max: {}'.format(np.mean(all_data), np.min(all_data), np.max(all_data))
all_data = scaler.fit_transform(all_data)
print 'new data:'
print 'means: {}, min: {}, max: {}'.format(np.mean(all_data), np.min(all_data), np.max(all_data))

old data:
means: 2519.97764716, min: 0.0, max: 461563.0
new data:
means: 2.71320372218e-15, min: -1.94360466344, max: 62.211464139


# 3. Split into Train and Test Data

In [9]:
# Split the originaldata into train and test sets.
data_size = all_data_original.shape[0]
training_size = data_size * 90/100
indices = np.random.permutation(data_size)
training_idx, test_idx = indices[:training_size], indices[training_size:]
all_data_train, all_data_test = all_data_original[training_idx,:], all_data_original[test_idx,:]

data_train = all_data_train[:,1:]
targets_train = all_data_train[:,0]
data_test = all_data_test[:,1:]
targets_test = all_data_test[:,0]
data_train_original = np.copy(data_train)
data_test_original = np.copy(data_test)

# 4. Model Creation

## 4.1. Loss Function

In [10]:
def mape(y, predictions):
  return K.mean(K.abs(y-predictions/K.clip(K.abs(y), K.epsilon(), np.inf)), axis=-1)

## 4.2. Create Pipeline

In [11]:
def create_model(optimizer='adagrad', init='uniform'):
  model = Sequential()
  model.add(Dense(n_features, input_dim=n_features, init=init, activation='relu'))
  model.add(Dense(1024, init=init, activation='relu'))
  model.add(Dense(512, init=init, activation='relu'))
  model.add(Dense(1))
  model.compile(loss=mape, optimizer=optimizer, metrics=['accuracy'])
  return model

checkpoint = ModelCheckpoint(HDF_FILENAME, monitor='loss', save_best_only=True, mode='min')
callbacks_list = [checkpoint]

model = KerasRegressor(build_fn=create_model, nb_epoch=20, batch_size=10, verbose=0,
                      callbacks=callbacks_list)

steps = [
  ('impute', Imputer()),
  ('one_hot', OneHotEncoder(categorical_features=[1, 2, 3, 4], n_values='auto')),
  ('scale', StandardScaler())
#   ('pca', PCA(n_components=120)),
  ('estimate', model)
]

# kfold = StratifiedKFold(y=targets_train, n_folds=10, shuffle=True, random_state=seed)

## 4.3. Test Suite

In [None]:
# model.fit(data_train, targets_train, validation_split=0.33,  nb_epoch=100, batch_size=10, 
#           callbacks=callbacks_list)
# results = cross_val_score(model, data_train, targets_train, cv=kfold)
epochs = 150
learning_rate = 0.1
decay_rate = learning_rate / epochs
momentum = 0.8
sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)

data_train_s = data_train[:1000]
targets_train_s = targets_train[:1000]
optimizers = [sgd, 'adam', 'adagrad']
init = ['glorot_uniform', 'normal', 'uniform']
epochs_r = np.array([epochs])
batches = np.array([5, 10, 20])
param_grid = dict(optimizer=optimizers, nb_epoch=epochs_r, 
                  batch_size=batches, init=init)

grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(data_train_s, targets_train_s)
time.sleep(0.5)

0s - loss: 0.1587 - acc: 0.0000e+00
Epoch 46/150
0s - loss: 0.1649 - acc: 0.0000e+00
Epoch 47/150
0s - loss: 0.1461 - acc: 0.0000e+00
Epoch 48/150
0s - loss: 0.1470 - acc: 0.0000e+00
Epoch 49/150
0s - loss: 0.1387 - acc: 0.0000e+00
Epoch 50/150
0s - loss: 0.1369 - acc: 0.0000e+00
Epoch 51/150
0s - loss: 0.1256 - acc: 0.0000e+00
Epoch 52/150
0s - loss: 0.1086 - acc: 0.0000e+00
Epoch 53/150
0s - loss: 0.0965 - acc: 0.0000e+00
Epoch 54/150
0s - loss: 0.1113 - acc: 0.0000e+00
Epoch 55/150
0s - loss: 0.0963 - acc: 0.0000e+00
Epoch 56/150
0s - loss: 0.0930 - acc: 0.0000e+00
Epoch 57/150
0s - loss: 0.1001 - acc: 0.0000e+00
Epoch 58/150
0s - loss: 0.0888 - acc: 0.0000e+00
Epoch 59/150
0s - loss: 0.0901 - acc: 0.0000e+00
Epoch 60/150
0s - loss: 0.0830 - acc: 0.0000e+00
Epoch 61/150
0s - loss: 0.0886 - acc: 0.0000e+00
Epoch 62/150
0s - loss: 0.0796 - acc: 0.0000e+00
Epoch 63/150
0s - loss: 0.0823 - acc: 0.0000e+00
Epoch 64/150
0s - loss: 0.0824 - acc: 0.0000e+00
Epoch 65/150
0s - loss: 0.0812 - 

In [None]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))

In [None]:
scores = model.evaluate(data_train, targets_train)
print("\nscore: %.2f%" % (scores))

In [None]:
scores = model.evaluate(data_test, targets_test)
print("\nscore: %.2f%" % (scores))

In [None]:
predictions = model.predict(data_test)

In [None]:
print(predictions[0:100])