In [2]:
import pdb
import numpy as np
import gcp.bigquery as bq
import gcp.storage as storage
import keras
from keras import backend as K
import tensorflow as tf
try:
   import cPickle as pickle
except:
   import pickle
EST_PICKLE_FILENAME = 'baseline_final_estimator.pkl'

# Put all categorical data first for easier implementation of One Hot Encoding.
fields_str = """
gap	day_in_week	weather_1_slots_ago	weather_2_slots_ago	weather_3_slots_ago	busy_time	
tj_level1_1_slots_ago	tj_level2_1_slots_ago	tj_level3_1_slots_ago	tj_level4_1_slots_ago	
tj_level1_2_slots_ago	tj_level2_2_slots_ago	tj_level3_2_slots_ago	tj_level4_2_slots_ago	
tj_level1_3_slots_ago	tj_level2_3_slots_ago	tj_level3_3_slots_ago	tj_level4_3_slots_ago	
temperature_1_slots_ago	pm25_1_slots_ago	
temperature_2_slots_ago	pm25_2_slots_ago	
temperature_3_slots_ago	pm25_3_slots_ago	
gap_1_slots_ago	sum_price_1_slots_ago	
gap_2_slots_ago	sum_price_2_slots_ago	
gap_3_slots_ago	sum_price_3_slots_ago	
f1	f11	f11_1	f11_2	f11_3	f11_4	f11_5	f11_6	f11_7	
f11_8	f13_4	f13_8	f14	f14_1	f14_10	f14_2	f14_3	f14_6	f14_8	f15	f15_1	
f15_2	f15_3	f15_4	f15_6	f15_7	f15_8	f16	f16_1	f16_10	f16_11	f16_12	f16_3	
f16_4	f16_6	f17	f17_2	f17_3	f17_4	f17_5	f19	f19_1	f19_2	f19_3	f19_4	f1_1	
f1_10	f1_11	f1_2	f1_3	f1_4	f1_5	f1_6	f1_7	f1_8	f20	f20_1	f20_2	
f20_4	f20_5	f20_6	f20_7	f20_8	f20_9	f21_1	f21_2	f22	f22_1	f22_2	f22_3	
f22_4	f22_5	f23	f23_1	f23_2	f23_3	f23_4	f23_5	f23_6	f24	f24_1	f24_2	f24_3	
f25	f25_1	f25_3	f25_7	f25_8	f25_9	f2_1	f2_10	f2_11	f2_12	f2_13	f2_2	
f2_4	f2_5	f2_6	f2_7	f2_8	f3_1	f3_2	f3_3	f4	f4_1	f4_10	f4_11	
f4_13	f4_14	f4_16	f4_17	f4_18	f4_2	f4_3	f4_5	f4_6	f4_7	f4_8	f4_9	
f5	f5_1	f5_3	f5_4	f6	f6_1	f6_2	f6_4	f7	f8	f8_1	f8_2	f8_3	f8_4	
f8_5
"""
fields = map(lambda x: x.strip(), fields_str.split('\t'))
features = fields[1:]

# Use this instead of len(features) since this variable can change
# e.g. when one hot encoding is used and/or new features are added.
n_features = len(features)

In [3]:
%%sql --module q_all

SELECT *, HASH(CAST(district_id AS STRING) +timeslot) AS hash_value,
  IF(ABS(HASH(CAST(district_id AS STRING) + timeslot)) % 2 == 1, 'True', 'False')
    AS included_in_sample, IF(timeofday_slot >= 50 AND timeofday_slot <= 53, 1, 0) AS busy_time
FROM [datalab-projects-1331:xjk_algo_comp.gaps]
WHERE gap > 0

# The above query randomizes its outputs.

In [4]:
query = bq.Query(q_all)
tableresult = query.results()

all_data = np.zeros((tableresult.length, len(fields)))
print 'there are {} rows'.format(tableresult.length)
for rcounter, row in enumerate(tableresult):
  for fcounter, field in enumerate(fields):
    all_data[rcounter, fcounter] = row[field]
  if rcounter % 5000 == 0:
    print 'processed {} rows'.format(rcounter)
all_data_original = np.copy(all_data)

there are 102592 rows
processed 0 rows
processed 5000 rows
processed 10000 rows
processed 15000 rows
processed 20000 rows
processed 25000 rows
processed 30000 rows
processed 35000 rows
processed 40000 rows
processed 45000 rows
processed 50000 rows
processed 55000 rows
processed 60000 rows
processed 65000 rows
processed 70000 rows
processed 75000 rows
processed 80000 rows
processed 85000 rows
processed 90000 rows
processed 95000 rows
processed 100000 rows


In [6]:
# This chunk does further wrangling to dataset to produce training and test sets.

# Useful code to check NaN and Inf values. This is needed since these values would
# cause "Input contains NaN, infinity or a value too large for dtype('float32')
# errors when left unchecked.
print "Checkinf for NaN and Inf"
print "np.nan=", np.where(np.isnan(all_data))
print "is.inf=", np.where(np.isinf(all_data))
print "np.max=", np.max(abs(all_data))

# Impute all NaN with numbers (not sure what to replace inf yet)
all_data[np.isnan(all_data)] = 0
# all_data[np.isinf(all_data)] = 0

# See that NaN and Inf values replaced
print "Checkinf for NaN and Inf"
print "np.nan=", np.where(np.isnan(all_data))
print "is.inf=", np.where(np.isinf(all_data))
print "np.max=", np.max(abs(all_data))

# Split the data into train and test sets.
data_size = all_data.shape[0]
training_size = data_size * 90/100
indices = np.random.permutation(data_size)
training_idx, test_idx = indices[:training_size], indices[training_size:]
all_data_train, all_data_test = all_data[training_idx,:], all_data[test_idx,:]

data_train = all_data_train[:,1:]
targets_train = all_data_train[:,0]
data_test = all_data_test[:,1:]
targets_test = all_data_test[:,0]
data_train_original = np.copy(data_train)
data_test_original = np.copy(data_test)

Checkinf for NaN and Inf
np.nan= (array([     5,      5,     19, ..., 102591, 102591, 102591]), array([24, 25, 24, ..., 21, 22, 23]))
is.inf= (array([], dtype=int64), array([], dtype=int64))
np.max= nan
Checkinf for NaN and Inf
np.nan= (array([], dtype=int64), array([], dtype=int64))
is.inf= (array([], dtype=int64), array([], dtype=int64))
np.max= 461563.0


In [19]:
from sklearn.preprocessing import OneHotEncoder
# one_hot = OneHotEncoder(categorical_features=[0, 1, 14, 17, 20], n_values='auto')
one_hot = OneHotEncoder(categorical_features=[0, 1, 2, 3], n_values='auto')
one_hot.fit(data_train_original)
print "n_values_:"
print one_hot.n_values_
print "feature_indices_:"
print one_hot.feature_indices_
data_train = one_hot.transform(data_train_original).todense()
data_test = one_hot.transform(data_test_original).todense()
n_features = data_train.shape[1]
print 'new number of features: {}'.format(n_features)

n_values_:
[ 7 10 10 10]
feature_indices_:
[ 0  7 17 27 37]
new number of features: 193


In [64]:
def mape(y, predictions):
  return K.mean(K.abs(y-predictions/K.clip(K.abs(y), K.epsilon(), np.inf)))

In [68]:
from keras.models import Sequential
from keras.layers import Dense, Activation
import time

model = Sequential()
model.add(Dense(128, input_dim=data_train.shape[1], init='uniform', activation='relu'))
model.add(Activation('tanh'))
model.add(Dense(1, input_dim=data_train.shape[1]))
model.compile(loss=mape, optimizer='rmsprop')

In [None]:
model.fit(data_train, targets_train, nb_epoch=100, batch_size=10)
time.sleep(0.5)

Epoch 1/100
Epoch 2/100
Epoch 3/100

In [67]:
scores = model.evaluate(data_train, targets_train)
print("\nscore: %.2f%" % (scores))


score: 14.24%


In [66]:
scores = model.evaluate(data_test, targets_test)
print("\nscore: %.2f%" % (scores))

score: 14.42%


In [50]:
predictions = model.predict(data_test)

In [61]:
print(predictions[0:100])

[[  4.25993347]
 [  1.723194  ]
 [  3.08974719]
 [  2.06029487]
 [  4.25993347]
 [  3.43693614]
 [  3.08974719]
 [  8.99430656]
 [  1.963346  ]
 [  1.68299305]
 [  2.06029487]
 [  1.17427683]
 [  3.08974719]
 [  3.43693614]
 [  1.963346  ]
 [  1.78467679]
 [  3.08974719]
 [  1.963346  ]
 [  5.65802193]
 [  4.85595322]
 [  4.01513672]
 [  3.43693614]
 [  1.963346  ]
 [  4.85595322]
 [  4.25993347]
 [  3.43693614]
 [  1.91827106]
 [  3.9544456 ]
 [  1.963346  ]
 [  8.94800949]
 [  1.963346  ]
 [  1.963346  ]
 [  3.43693614]
 [  1.963346  ]
 [  2.06029487]
 [  3.43693614]
 [  1.963346  ]
 [  1.963346  ]
 [  1.78467679]
 [  2.11109519]
 [  2.06029487]
 [  1.963346  ]
 [  3.9544456 ]
 [  1.91827106]
 [  1.963346  ]
 [  2.06029487]
 [  1.963346  ]
 [  1.963346  ]
 [  1.963346  ]
 [  4.25993347]
 [  2.11109519]
 [  8.94800949]
 [  3.43693614]
 [  2.06029487]
 [  4.85595322]
 [  3.9544456 ]
 [ 10.35829258]
 [  9.80030155]
 [  1.963346  ]
 [  3.08974719]
 [  1.723194  ]
 [  1.963346  ]
 [  5.99