In [31]:
import gcp.bigquery as bq
import gcp.storage as storage
import numpy as np
# Prefer cPickle where it exists; otherwise fall back to the standard pickle
# module. Only ImportError is caught so that unrelated failures are not hidden.
try:
   import cPickle as pickle
except ImportError:
   import pickle
# File name of the pickled estimator that is loaded for the final prediction.
EST_PICKLE_FILENAME = 'baseline_final_estimator.pkl'

# Column names read from every query result row, in matrix column order.
# 'district_id' must stay first: the data-loading code slices column 0 off
# separately and passes the remaining columns (`features`) on as model inputs.
fields_str = """
district_id	timeofday_slot	day_in_week	is_sunday	sum_price	avg_price	poi1	poi2	poi3
	poi4	poi5	traffic_tj_level1	traffic_tj_level2	traffic_tj_level3	traffic_tj_level4
	weather	weather_pm25	weather_temperature	previous_gap	gap
"""
# Split on any whitespace (tabs, spaces or newlines) so the list survives
# editors that convert tabs to spaces, and build a real list so that slicing
# works regardless of what `map` returns.
fields = [name for name in fields_str.split()]
features = fields[1:]

# Prepare Final Dataset

In [32]:
%%sql --module q_all_f

SELECT *
FROM [datalab-projects-1331:xjk_algo_comp_test.future_gaps_final2]
-- Keep only rows with a positive gap whose timeslot is one of the listed slots.
-- The same slot list is repeated in the Python `slots` variable of the export
-- cell; keep the two in sync.
WHERE gap > 0 AND timeslot IN ('2016-01-22-46','2016-01-22-58','2016-01-22-70','2016-01-22-82',
    '2016-01-22-94','2016-01-22-106','2016-01-22-118','2016-01-22-130','2016-01-22-142',
    '2016-01-24-58','2016-01-24-70','2016-01-24-82','2016-01-24-94','2016-01-24-106',
    '2016-01-24-118','2016-01-24-130','2016-01-24-142','2016-01-26-46','2016-01-26-58',
    '2016-01-26-70','2016-01-26-82','2016-01-26-94','2016-01-26-106','2016-01-26-118',
    '2016-01-26-130','2016-01-26-142','2016-01-28-58','2016-01-28-70','2016-01-28-82',
    '2016-01-28-94','2016-01-28-106','2016-01-28-118','2016-01-28-130','2016-01-28-142',
    '2016-01-30-46','2016-01-30-58','2016-01-30-70','2016-01-30-82','2016-01-30-94',
    '2016-01-30-106','2016-01-30-118','2016-01-30-130','2016-01-30-142')
-- Rows are returned sorted by timeslot, then by district_id.
ORDER BY timeslot, district_id

# Final dataset - Used in final submission.

In [33]:
# Run the q_all_f query and copy the columns named in `fields` into a dense
# float matrix with one row per result row.
query_f = bq.Query(q_all_f)
tableresult_f = query_f.results()

n_rows_f = tableresult_f.length
all_data_f = np.zeros((n_rows_f, len(fields)))
print('there are {} rows'.format(n_rows_f))

for row_idx, record in enumerate(tableresult_f):
  for col_idx, column in enumerate(fields):
    all_data_f[row_idx, col_idx] = record[column]
  # Progress message every 1000 rows.
  if not row_idx % 1000:
    print('processed {} rows'.format(row_idx))

# Replace NaN entries with 0.
nan_mask = np.isnan(all_data_f)
all_data_f[nan_mask] = 0

# Column 0 is split off from the rest, which becomes the model input matrix.
# NOTE(review): with the current `fields` order, column 0 is 'district_id'
# (not a prediction target) and data_final's last column is 'gap' -- confirm
# this matches the column layout the estimator was trained on.
targets_final = all_data_f[:, 0]
data_final = all_data_f[:, 1:]

there are 1802 rows
processed 0 rows
processed 1000 rows


# Run Prediction and Export CSV

In [34]:
# Load the previously trained estimator from disk. Pickle data is binary, so
# the file is opened in 'rb' mode, and the `with` block closes the handle once
# loading is done.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(EST_PICKLE_FILENAME, 'rb') as est_file:
  final_est = pickle.load(est_file)

In [35]:
%%sql --module q_districts

SELECT district_id
FROM [datalab-projects-1331:xjk_algo_comp_test.districts]
-- Sorted so the district list always comes back in ascending district_id order.
ORDER BY district_id

In [36]:
# Run the q_districts query and collect the district ids in result order.
query_districts = bq.Query(q_districts)
tableresult_districts = query_districts.results()

districts = []
for district_row in tableresult_districts:
  districts.append(district_row['district_id'])

In [37]:
# Predict with the loaded estimator, build one CSV line per (slot, district)
# pair, and upload the CSV to Cloud Storage.
final_predictions = final_est.predict(data_final)

# Time slots that must appear in the submission.
# NOTE: this list duplicates the timeslot filter of the q_all_f SQL query;
# the two have to be kept in sync by hand.
slots = ['2016-01-22-46','2016-01-22-58','2016-01-22-70','2016-01-22-82',
    '2016-01-22-94','2016-01-22-106','2016-01-22-118','2016-01-22-130','2016-01-22-142',
    '2016-01-24-58','2016-01-24-70','2016-01-24-82','2016-01-24-94','2016-01-24-106',
    '2016-01-24-118','2016-01-24-130','2016-01-24-142','2016-01-26-46','2016-01-26-58',
    '2016-01-26-70','2016-01-26-82','2016-01-26-94','2016-01-26-106','2016-01-26-118',
    '2016-01-26-130','2016-01-26-142','2016-01-28-58','2016-01-28-70','2016-01-28-82',
    '2016-01-28-94','2016-01-28-106','2016-01-28-118','2016-01-28-130','2016-01-28-142',
    '2016-01-30-46','2016-01-30-58','2016-01-30-70','2016-01-30-82','2016-01-30-94',
    '2016-01-30-106','2016-01-30-118','2016-01-30-130','2016-01-30-142']

# One placeholder entry (prediction 0.0) per (slot, district) pair, in the
# order the lines are written out.
result_dicts = []
# Maps each 'district:slot' key to its position in result_dicts, so a
# prediction can be placed with one lookup instead of a scan of the whole list.
result_index = {}
print('Preparing results...')
for slot in slots:
  for district in districts:
    key = '{}:{}'.format(district, slot)
    # setdefault keeps the first position if a key ever occurs twice.
    result_index.setdefault(key, len(result_dicts))
    result_dicts.append({'key': key,
                         'value': '{},{},{}'.format(district, slot, 0.0)})

print('Replacing predictions...')
# final_predictions[index] is matched to the index-th row of tableresult_f, so
# this iteration must yield rows in the same order as when data_final was built.
for (index, row) in enumerate(tableresult_f):
  key = '{}:{}'.format(row['district_id'], row['timeslot'])
  if key not in result_index:
    # A data row whose district or slot is not part of the output grid.
    raise KeyError('no result entry for district:timeslot {!r}'.format(key))
  position = result_index[key]
  result_dicts[position]['value'] = \
    '{},{},{}'.format(row['district_id'], row['timeslot'], final_predictions[index])
  if index % 200 == 0:
    print('Done {}/{}...'.format(index, tableresult_f.length))

# Join the lines and upload them as a single text object.
result = '\n'.join(entry['value'] for entry in result_dicts)
bucketname = 'datalab-projects-1331-datalab'
itempath = 'result/final_result.csv'
print('Done, now writing to gs://{}/{}'.format(bucketname, itempath))
item = storage.Item(bucketname, itempath)
item.write_to(result, 'text/plain')

Preparing results...
Replacing predictions...
Done 0/1802...
Done 200/1802...
Done 400/1802...
Done 600/1802...
Done 800/1802...
Done 1000/1802...
Done 1200/1802...
Done 1400/1802...
Done 1600/1802...
Done 1800/1802...
Done, now writing to gs://datalab-projects-1331-datalab/result/final_result.csv


In [38]:
# Show only the first lines of the submission as a sanity check; the complete
# CSV is written to Cloud Storage by the export cell, so dumping every line
# into the notebook output is unnecessary.
PREVIEW_LINES = 40
print('\n'.join(result.split('\n')[:PREVIEW_LINES]))

1,2016-01-22-46,12.1391057749
2,2016-01-22-46,14.9232563305
3,2016-01-22-46,0.0
4,2016-01-22-46,14.7380603413
5,2016-01-22-46,0.0
6,2016-01-22-46,23.0795541891
7,2016-01-22-46,48.0405759162
8,2016-01-22-46,12.1538461538
9,2016-01-22-46,9.99921660791
10,2016-01-22-46,26.9776536313
11,2016-01-22-46,16.3613324833
12,2016-01-22-46,12.5046052632
13,2016-01-22-46,36.528125
14,2016-01-22-46,13.9236453202
15,2016-01-22-46,0.0
16,2016-01-22-46,36.528125
17,2016-01-22-46,0.0
18,2016-01-22-46,0.0
19,2016-01-22-46,16.0
20,2016-01-22-46,14.875
21,2016-01-22-46,10.0011976048
22,2016-01-22-46,26.1600712122
23,2016-01-22-46,32.8097068121
24,2016-01-22-46,23.9379844961
25,2016-01-22-46,0.0
26,2016-01-22-46,0.0
27,2016-01-22-46,15.6344445772
28,2016-01-22-46,23.9379844961
29,2016-01-22-46,14.7380603413
30,2016-01-22-46,0.0
31,2016-01-22-46,46.4902712816
32,2016-01-22-46,0.0
33,2016-01-22-46,35.4684347543
34,2016-01-22-46,0.0
35,2016-01-22-46,0.0
36,2016-01-22-46,48.6882591093
37,2016-01-22-46,16.3613324