In [15]:
import pdb
import numpy as np
import gcp.bigquery as bq
import gcp.storage as storage
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.optimizers import SGD
import tensorflow as tf
import h5py
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
import time
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
import matplotlib.pyplot as plt
from keras.layers import Dropout

# Files holding the trained network: weights (HDF5) and architecture (JSON).
HDF_FILENAME = 'final_model3.hdf5'
JSON_MODEL_FILENAME = 'final_model.json'

# Column names, in the order they are laid out in the data matrix.
# The first name (`gap`) is the prediction target; everything after it is a feature.
# Put all categorical data first for easier implementation of One Hot Encoding.
fields_str = """
gap	day_in_week	weather_1_slots_ago	weather_2_slots_ago	weather_3_slots_ago	busy_time	
tj_level1_1_slots_ago	tj_level2_1_slots_ago	tj_level3_1_slots_ago	tj_level4_1_slots_ago	
tj_level1_2_slots_ago	tj_level2_2_slots_ago	tj_level3_2_slots_ago	tj_level4_2_slots_ago	
tj_level1_3_slots_ago	tj_level2_3_slots_ago	tj_level3_3_slots_ago	tj_level4_3_slots_ago	
temperature_1_slots_ago	pm25_1_slots_ago	
temperature_2_slots_ago	pm25_2_slots_ago	
temperature_3_slots_ago	pm25_3_slots_ago	
gap_1_slots_ago	sum_price_1_slots_ago	
gap_2_slots_ago	sum_price_2_slots_ago	
gap_3_slots_ago	sum_price_3_slots_ago	
f1	f11	f11_1	f11_2	f11_3	f11_4	f11_5	f11_6	f11_7	
f11_8	f13_4	f13_8	f14	f14_1	f14_10	f14_2	f14_3	f14_6	f14_8	f15	f15_1	
f15_2	f15_3	f15_4	f15_6	f15_7	f15_8	f16	f16_1	f16_10	f16_11	f16_12	f16_3	
f16_4	f16_6	f17	f17_2	f17_3	f17_4	f17_5	f19	f19_1	f19_2	f19_3	f19_4	f1_1	
f1_10	f1_11	f1_2	f1_3	f1_4	f1_5	f1_6	f1_7	f1_8	f20	f20_1	f20_2	
f20_4	f20_5	f20_6	f20_7	f20_8	f20_9	f21_1	f21_2	f22	f22_1	f22_2	f22_3	
f22_4	f22_5	f23	f23_1	f23_2	f23_3	f23_4	f23_5	f23_6	f24	f24_1	f24_2	f24_3	
f25	f25_1	f25_3	f25_7	f25_8	f25_9	f2_1	f2_10	f2_11	f2_12	f2_13	f2_2	
f2_4	f2_5	f2_6	f2_7	f2_8	f3_1	f3_2	f3_3	f4	f4_1	f4_10	f4_11	
f4_13	f4_14	f4_16	f4_17	f4_18	f4_2	f4_3	f4_5	f4_6	f4_7	f4_8	f4_9	
f5	f5_1	f5_3	f5_4	f6	f6_1	f6_2	f6_4	f7	f8	f8_1	f8_2	f8_3	f8_4	
f8_5
"""
# Split on ANY whitespace run rather than on tabs only: a line of the literal that
# ends without a trailing tab would otherwise fuse two names into one bogus field.
# `split()` also always yields a real list, so the slice below is safe.
fields = fields_str.split()
features = fields[1:]

# Use this instead of len(features) since this variable can change
# e.g. when one hot encoding is used and/or new features are added.
n_features = len(features)

# Prepare Final Dataset

In [2]:
%%sql --module q_all_f

# Registered under the name `q_all_f` (see the --module flag above).
# Returns every column of the `gaps` table, sorted by timeslot and then district_id.
SELECT *
FROM [datalab-projects-1331:xjk_algo_comp_test.gaps]
ORDER BY timeslot, district_id

# Final dataset - Used in final submission.

In [5]:
query_f = bq.Query(q_all_f)
tableresult_f = query_f.results()

all_data_f = np.zeros((tableresult_f.length, len(fields)))
print 'there are {} rows'.format(tableresult_f.length)
for rcounter, row in enumerate(tableresult_f):
  for fcounter, field in enumerate(fields):
    all_data_f[rcounter, fcounter] = row[field]
  if rcounter % 1000 == 0:
    print 'processed {} rows'.format(rcounter)
    
all_data_f[np.isnan(all_data_f)] = 0
data_final_original = all_data_f[:,1:]
targets_final = all_data_f[:,0]

there are 2838 rows
processed 0 rows
processed 1000 rows
processed 2000 rows


In [8]:
# The feature matrix has the leading `gap` column removed, so the categorical
# columns start at index 0: columns 0-3 are one-hot encoded with 7, 10, 10 and
# 10 possible values respectively.
one_hot_encoder = OneHotEncoder(
  categorical_features=[0, 1, 2, 3],
  n_values=[7, 10, 10, 10],
  sparse=False,
)

# Preprocessing only (no estimator step): impute -> one-hot encode -> standardize.
steps = [
  ('impute', Imputer()),
  ('one_hot', one_hot_encoder),
  ('scale', StandardScaler()),
]
transformer = Pipeline(steps)

# NOTE(review): the pipeline is fitted on the final dataset itself here —
# confirm this matches how the training data was preprocessed.
data_final = transformer.fit_transform(data_final_original)

# Run Prediction and Export CSV

In [16]:
def mape(y, predictions):
  """Mean absolute percentage error, usable as a Keras loss.

  Computes mean(|y - predictions| / max(|y|, epsilon)) over the last axis.
  The denominator is clipped at K.epsilon() to avoid division by zero when
  the target is 0. The value is returned as a fraction (it is not multiplied
  by 100).

  Args:
    y: tensor of true values.
    predictions: tensor of predicted values, same shape as `y`.

  Returns:
    Tensor holding the error averaged over the last axis.
  """
  # The difference must be parenthesized: without it `/` binds tighter than `-`
  # and the expression becomes |y - predictions/|y||, which is not an error
  # measure at all (a perfect prediction would not score 0).
  relative_error = (y - predictions) / K.clip(K.abs(y), K.epsilon(), np.inf)
  return K.mean(K.abs(relative_error), axis=-1)

# Epoch count handed to get_optimizer() when the loaded model is compiled.
epochs = 50

def get_optimizer(epochs=50):
  """Return the SGD optimizer used to compile the model.

  The learning rate is 0.2 with momentum 0.8 and Nesterov momentum disabled;
  the decay passed to SGD is the learning rate divided by `epochs`.

  Args:
    epochs: number of epochs used to derive the decay value.

  Returns:
    The configured SGD instance.
  """
  base_lr = 0.2
  return SGD(
    lr=base_lr,
    momentum=0.8,
    decay=base_lr / epochs,
    nesterov=False,
  )

from keras.models import model_from_json

# Rebuild the network architecture from its JSON description. The `with` block
# guarantees the file handle is closed even if reading fails.
with open(JSON_MODEL_FILENAME, 'r') as json_file:
  loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights(HDF_FILENAME)
print("Loaded model from disk")

# Compile the restored model with the same custom loss and optimizer settings
# defined in this notebook (this cell only compiles; it does not evaluate).
loaded_model.compile(loss=mape, optimizer=get_optimizer(epochs))

Loaded model from disk


In [17]:
%%sql --module q_districts

# Registered under the name `q_districts`; lists every district_id in ascending order.
SELECT district_id FROM [datalab-projects-1331:xjk_algo_comp_test.districts] ORDER BY district_id

In [18]:
# Run the `q_districts` query and collect the `district_id` of each returned
# row, preserving the order in which the rows come back.
query_districts = bq.Query(q_districts)
tableresult_districts = query_districts.results()

districts = []
for district_row in tableresult_districts:
  districts.append(district_row['district_id'])

In [19]:
final_predictions = loaded_model.predict(data_final)

item = storage.Item('datalab-projects-1331-datalab','data/timeslots_to_test2.txt')
slots = map(lambda x: x.strip(), item.read_from().strip().split('\n'))

result_dicts = []
print "Preparing results..."
for slot in slots:
  for district in districts:
#     pred = 0.0
#     try:
#       id = (index for (index, item) in enumerate(tableresult_f) if 
#             item['timeslot'] == slot and item['district_id'] == id).next()
#       pred = final_predictions[id]
#     except:
#       pass
    result_dicts.append({'key': '{}:{}'.format(district, slot), 'value': \
      '{},{},{}'.format(district, slot,0.0)})

print 'Replacing predictions...'
for (index, item) in enumerate(tableresult_f):
  id = (r_id for (r_id, r_item) in enumerate(result_dicts) if 
    r_item['key'] == '{}:{}'.format(item['district_id'], item['timeslot'])).next()
#   print "{}. id: {}, pred: {}".format(index, id, final_predictions[index])
  result_dicts[id]['value'] = \
    '{},{},{}'.format(item['district_id'], item['timeslot'],final_predictions[index])
  if index % 200 == 0:
    print 'Done {}/{}...'.format(index, tableresult_f.length)
    
result = '\n'.join(map(lambda d: d['value'], result_dicts))
bucketname = 'datalab-projects-1331-datalab'
itempath = 'result/final_result.csv'
print 'Done, now writing to gs://{}/{}'.format(bucketname, itempath)
item = storage.Item(bucketname, itempath)
item.write_to(result, 'text/plain')

Preparing results...
Replacing predictions...
Done 0/2838...
Done 200/2838...
Done 400/2838...
Done 600/2838...
Done 800/2838...
Done 1000/2838...
Done 1200/2838...
Done 1400/2838...
Done 1600/2838...
Done 1800/2838...
Done 2000/2838...
Done 2200/2838...
Done 2400/2838...
Done 2600/2838...
Done 2800/2838...
Done, now writing to gs://datalab-projects-1331-datalab/result/final_result.csv


In [20]:
# Dump the complete CSV text that was uploaded, for a visual sanity check.
# NOTE(review): this prints every line of the submission; consider showing only
# the first few lines to keep the notebook output short.
print(result)

1,2016-01-23-46,[ 1.92343402]
2,2016-01-23-46,[ 0.99938875]
3,2016-01-23-46,[ 0.99938875]
4,2016-01-23-46,[ 0.99938875]
5,2016-01-23-46,[ 0.99938875]
6,2016-01-23-46,[ 0.99938875]
7,2016-01-23-46,[ 3.70429254]
8,2016-01-23-46,[ 10.38765717]
9,2016-01-23-46,[ 0.99938875]
10,2016-01-23-46,[ 0.99938875]
11,2016-01-23-46,[ 0.99938875]
12,2016-01-23-46,[ 2.79093122]
13,2016-01-23-46,[ 0.99938875]
14,2016-01-23-46,[ 0.99938875]
15,2016-01-23-46,[ 0.99938875]
16,2016-01-23-46,[ 0.99938875]
17,2016-01-23-46,[ 0.99938875]
18,2016-01-23-46,[ 0.99938875]
19,2016-01-23-46,[ 0.99938875]
20,2016-01-23-46,[ 1.50260401]
21,2016-01-23-46,[ 0.99938875]
22,2016-01-23-46,[ 1.03290844]
23,2016-01-23-46,[ 69.69689178]
24,2016-01-23-46,[ 1.25974715]
25,2016-01-23-46,[ 0.99938875]
26,2016-01-23-46,[ 0.99938875]
27,2016-01-23-46,[ 1.05406952]
28,2016-01-23-46,[ 3.23237467]
29,2016-01-23-46,[ 0.99938875]
30,2016-01-23-46,[ 0.99938875]
31,2016-01-23-46,[ 0.99938875]
32,2016-01-23-46,[ 0.99938875]
33,2016-01-23-4