In [17]:
import gcp.bigquery as bq
import gcp.storage as storage
import numpy as np
try:
   import cPickle as pickle
except:
   import pickle
EST_PICKLE_FILENAME = 'baseline_final_estimator.pkl'

# First feature HAS to be 'district_id' for MAPE calculation.
fields = ['gap', 'district_id', 'timeofday_slot', 'day_in_week', 'sum_price', 'traffic_tj_level1',
            'traffic_tj_level2', 'traffic_tj_level3', 'traffic_tj_level4', 'is_sunday']
features = fields[1:]

# Prepare Final Dataset

In [3]:
%%sql --module q_all_f

SELECT *
FROM [datalab-projects-1331:xjk_algo_comp_test.future_gaps_processed]
WHERE gap > 0 AND timeslot IN ('2016-01-22-46','2016-01-22-58','2016-01-22-70','2016-01-22-82',
    '2016-01-22-94','2016-01-22-106','2016-01-22-118','2016-01-22-130','2016-01-22-142',
    '2016-01-24-58','2016-01-24-70','2016-01-24-82','2016-01-24-94','2016-01-24-106',
    '2016-01-24-118','2016-01-24-130','2016-01-24-142','2016-01-26-46','2016-01-26-58',
    '2016-01-26-70','2016-01-26-82','2016-01-26-94','2016-01-26-106','2016-01-26-118',
    '2016-01-26-130','2016-01-26-142','2016-01-28-58','2016-01-28-70','2016-01-28-82',
    '2016-01-28-94','2016-01-28-106','2016-01-28-118','2016-01-28-130','2016-01-28-142',
    '2016-01-30-46','2016-01-30-58','2016-01-30-70','2016-01-30-82','2016-01-30-94',
    '2016-01-30-106','2016-01-30-118','2016-01-30-130','2016-01-30-142')
ORDER BY timeslot, district_id

# Final dataset - Used in final submission.

In [14]:
query_f = bq.Query(q_all_f)
tableresult_f = query_f.results()

all_data_f = np.zeros((tableresult_f.length, len(fields)))
print 'there are {} rows'.format(tableresult_f.length)
for rcounter, row in enumerate(tableresult_f):
  for fcounter, field in enumerate(fields):
    all_data_f[rcounter, fcounter] = row[field]
  if rcounter % 1000 == 0:
    print 'processed {} rows'.format(rcounter)
    
all_data_f[np.isnan(all_data_f)] = 0
data_final = all_data_f[:,1:]
targets_final = all_data_f[:,0]

there are 1802 rows
processed 0 rows
processed 1000 rows


# Run Prediction and Export CSV

In [None]:
final_est = pickle.load(open(EST_PICKLE_FILENAME, "r") )

In [11]:
%%sql --module q_districts

SELECT district_id FROM [datalab-projects-1331:xjk_algo_comp_test.districts] ORDER BY district_id

In [12]:
query_districts = bq.Query(q_districts)
tableresult_districts = query_districts.results()
districts = [d['district_id'] for d in tableresult_districts]

In [22]:
final_predictions = final_est.predict(data_final)

slots = ['2016-01-22-46','2016-01-22-58','2016-01-22-70','2016-01-22-82',
    '2016-01-22-94','2016-01-22-106','2016-01-22-118','2016-01-22-130','2016-01-22-142',
    '2016-01-24-58','2016-01-24-70','2016-01-24-82','2016-01-24-94','2016-01-24-106',
    '2016-01-24-118','2016-01-24-130','2016-01-24-142','2016-01-26-46','2016-01-26-58',
    '2016-01-26-70','2016-01-26-82','2016-01-26-94','2016-01-26-106','2016-01-26-118',
    '2016-01-26-130','2016-01-26-142','2016-01-28-58','2016-01-28-70','2016-01-28-82',
    '2016-01-28-94','2016-01-28-106','2016-01-28-118','2016-01-28-130','2016-01-28-142',
    '2016-01-30-46','2016-01-30-58','2016-01-30-70','2016-01-30-82','2016-01-30-94',
    '2016-01-30-106','2016-01-30-118','2016-01-30-130','2016-01-30-142']

result_dicts = []
print "Preparing results..."
for slot in slots:
  for district in districts:
#     pred = 0.0
#     try:
#       id = (index for (index, item) in enumerate(tableresult_f) if 
#             item['timeslot'] == slot and item['district_id'] == id).next()
#       pred = final_predictions[id]
#     except:
#       pass
    result_dicts.append({'key': '{}:{}'.format(district, slot), 'value': \
      '{},{},{}'.format(district, slot,0.0)})

print 'Replacing predictions...'
for (index, item) in enumerate(tableresult_f):
  id = (index for (r_id, r_item) in enumerate(result_dicts) if 
    r_item['key'] == '{}:{}'.format(item['district_id'], item['timeslot'])).next()
  result_dicts[id]['value'] = \
    '{},{},{}'.format(item['district_id'], item['timeslot'],final_predictions[index])
  if index % 200 == 0:
    print 'Done {}/{}...'.format(index, tableresult_f.length)
    
result = '\n'.join(map(lambda d: d['value'], result_dicts))
bucketname = 'datalab-projects-1331-datalab'
itempath = 'result/final_result.csv'
print 'Done, now writing to gs://{}/{}'.format(bucketname, itempath)
item = storage.Item(bucketname, itempath)
item.write_to(result, 'text/plain')

Preparing results...
Replacing predictions...
Done 0/1802...
Done 200/1802...
Done 400/1802...
Done 600/1802...
Done 800/1802...
Done 1000/1802...
Done 1200/1802...
Done 1400/1802...
Done 1600/1802...
Done 1800/1802...
Done, now writing to gs://datalab-projects-1331-datalab/result/final_result.csv


In [23]:
print(result)

1,2016-01-22-106,14.0
2,2016-01-22-106,1.0
3,2016-01-22-106,3.0
4,2016-01-22-106,11.0
7,2016-01-22-106,316.0
8,2016-01-22-106,676.0
9,2016-01-22-106,1.0
11,2016-01-22-106,1.0
12,2016-01-22-106,14.0
13,2016-01-22-106,2.0
14,2016-01-22-106,5.0
16,2016-01-22-106,2.0
17,2016-01-22-106,1.0
18,2016-01-22-106,3.0
19,2016-01-22-106,7.0
20,2016-01-22-106,5.0
21,2016-01-22-106,13.0
22,2016-01-22-106,27.0
23,2016-01-22-106,434.0
24,2016-01-22-106,9.0
25,2016-01-22-106,1.0
26,2016-01-22-106,92.0
27,2016-01-22-106,8.0
28,2016-01-22-106,149.0
29,2016-01-22-106,19.0
31,2016-01-22-106,4.0
32,2016-01-22-106,3.0
33,2016-01-22-106,2.0
35,2016-01-22-106,18.0
36,2016-01-22-106,3.0
37,2016-01-22-106,434.0
38,2016-01-22-106,3.0
41,2016-01-22-106,1.0
42,2016-01-22-106,4.0
43,2016-01-22-106,1.0
44,2016-01-22-106,1.0
46,2016-01-22-106,149.0
47,2016-01-22-106,2.0
48,2016-01-22-106,434.0
51,2016-01-22-106,910.0
53,2016-01-22-106,9.0
57,2016-01-22-106,1.0
62,2016-01-22-106,1.0
66,2016-01-22-106,1.0
1,2016-01-22-11