In [None]:
import gcp.bigquery as bq
import gcp.storage as storage
import numpy as np
try:
   import cPickle as pickle
except:
   import pickle
# File name of the pickled estimator that a later cell loads to run predictions.
EST_PICKLE_FILENAME = 'baseline_final_estimator.pkl'

# Columns read from every query row, in this exact order. The first entry
# ('gap') becomes the target column; every entry after it is a model input.
# First feature HAS to be 'district_id' for MAPE calculation.
fields_str = """gap, district_id, sum_price, traffic_tj_level3, traffic_tj_level2, traffic_tj_level4, f23,
  f16_10,f17_3, f19, f5_3, f11_3, f24_2, f20_2, f13_4, f4, f4_5, f11_2, f25, f25_3, f7, f11_4,
  f23_3, f17_4, f8_4, f15_7, f4_17, f8_2, f4_10, f15_2, f11_6, f15, f17_5, f1_4, f24_1, f1,
  f16_11, f19_3, f23_6, f2_2, f11, f15_3, f8_3, f15_6, f6_1, f6_4, f8, f11_8, f25_8, f23_4,
  f17_2, f1_5, f4_9, f1_8, f4_11, f20_8, f2_5, f16, f25_9, f6_2, f14_3, f15_4, f4_7, f5_1, f5,
  f4_18, f14_8, f11_7, f3_2, f20_5, f17, f11_1, f4_14, f23_5, f1_7, traffic_tj_level1, f23_2,
  f20, f1_3, f13_8, f1_2, f8_1, f23_1, f11_5, f2_12, f4_1, f2_7, f22_1, f25_7
"""
# Build a real list (not a lazy map object) so it can be sliced and measured
# with len() regardless of the Python version; strip() removes the spaces and
# newlines left over from the multi-line literal.
fields = [column_name.strip() for column_name in fields_str.split(',')]
# Everything except the leading target column.
features = fields[1:]

# Prepare Final Dataset

In [None]:
%%sql --module q_all_f

SELECT *
FROM [datalab-projects-1331:xjk_algo_comp_test.future_gaps_processed]
WHERE gap > 0 AND timeslot IN ('2016-01-22-46','2016-01-22-58','2016-01-22-70','2016-01-22-82',
    '2016-01-22-94','2016-01-22-106','2016-01-22-118','2016-01-22-130','2016-01-22-142',
    '2016-01-24-58','2016-01-24-70','2016-01-24-82','2016-01-24-94','2016-01-24-106',
    '2016-01-24-118','2016-01-24-130','2016-01-24-142','2016-01-26-46','2016-01-26-58',
    '2016-01-26-70','2016-01-26-82','2016-01-26-94','2016-01-26-106','2016-01-26-118',
    '2016-01-26-130','2016-01-26-142','2016-01-28-58','2016-01-28-70','2016-01-28-82',
    '2016-01-28-94','2016-01-28-106','2016-01-28-118','2016-01-28-130','2016-01-28-142',
    '2016-01-30-46','2016-01-30-58','2016-01-30-70','2016-01-30-82','2016-01-30-94',
    '2016-01-30-106','2016-01-30-118','2016-01-30-130','2016-01-30-142')
ORDER BY timeslot, district_id

# Final dataset - Used in final submission.

In [None]:
# Run the final-dataset query and copy its rows into a dense float matrix
# whose columns follow the order of `fields`.
query_f = bq.Query(q_all_f)
tableresult_f = query_f.results()

all_data_f = np.zeros((tableresult_f.length, len(fields)))
print('there are {} rows'.format(tableresult_f.length))
for row_idx, record in enumerate(tableresult_f):
  for col_idx, column in enumerate(fields):
    all_data_f[row_idx, col_idx] = record[column]
  # Progress report on every 1000th row (including row 0).
  if row_idx % 1000 == 0:
    print('processed {} rows'.format(row_idx))

# Replace NaN cells with 0, then split the matrix: column 0 holds the
# targets and the remaining columns are the model inputs.
nan_mask = np.isnan(all_data_f)
all_data_f[nan_mask] = 0
targets_final = all_data_f[:, 0]
data_final = all_data_f[:, 1:]

# Run Prediction and Export CSV

In [None]:
# Load the previously trained estimator from EST_PICKLE_FILENAME.
# Pickle data is binary, so the file is opened in 'rb' mode, and the
# with-block guarantees the handle is closed once loading finishes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a pickle file that comes from a trusted source.
with open(EST_PICKLE_FILENAME, 'rb') as est_file:
  final_est = pickle.load(est_file)

In [None]:
%%sql --module q_districts

SELECT district_id FROM [datalab-projects-1331:xjk_algo_comp_test.districts] ORDER BY district_id

In [None]:
# Run the districts query and collect the district_id of each returned row,
# keeping the order in which the rows are returned.
query_districts = bq.Query(q_districts)
tableresult_districts = query_districts.results()
districts = []
for district_row in tableresult_districts:
  districts.append(district_row['district_id'])

In [None]:
# Predict on the final dataset, build one CSV line per (slot, district) pair,
# and upload the CSV text to Cloud Storage.
final_predictions = final_est.predict(data_final)

# Time slots that must appear in the output, in output order.
slots = ['2016-01-22-46','2016-01-22-58','2016-01-22-70','2016-01-22-82',
    '2016-01-22-94','2016-01-22-106','2016-01-22-118','2016-01-22-130','2016-01-22-142',
    '2016-01-24-58','2016-01-24-70','2016-01-24-82','2016-01-24-94','2016-01-24-106',
    '2016-01-24-118','2016-01-24-130','2016-01-24-142','2016-01-26-46','2016-01-26-58',
    '2016-01-26-70','2016-01-26-82','2016-01-26-94','2016-01-26-106','2016-01-26-118',
    '2016-01-26-130','2016-01-26-142','2016-01-28-58','2016-01-28-70','2016-01-28-82',
    '2016-01-28-94','2016-01-28-106','2016-01-28-118','2016-01-28-130','2016-01-28-142',
    '2016-01-30-46','2016-01-30-58','2016-01-30-70','2016-01-30-82','2016-01-30-94',
    '2016-01-30-106','2016-01-30-118','2016-01-30-130','2016-01-30-142']

# Start with a 0.0 placeholder line for every (slot, district) pair so pairs
# without a prediction still appear in the output. `position_by_key` maps each
# 'district:slot' key to its position in `result_dicts`, so the replacement
# loop below can find a line with one dict lookup instead of a list scan.
result_dicts = []
position_by_key = {}
print('Preparing results...')
for slot in slots:
  for district in districts:
    key = '{}:{}'.format(district, slot)
    # setdefault keeps the first position if the same key ever occurs twice.
    position_by_key.setdefault(key, len(result_dicts))
    result_dicts.append({'key': key,
                         'value': '{},{},{}'.format(district, slot, 0.0)})

# Overwrite the placeholders with real predictions. final_predictions[index]
# is paired with the index-th row yielded by tableresult_f, i.e. this relies
# on the table being iterated in the same row order that was used to fill
# the prediction input matrix.
print('Replacing predictions...')
for index, item in enumerate(tableresult_f):
  key = '{}:{}'.format(item['district_id'], item['timeslot'])
  if key not in position_by_key:
    # The row's district/timeslot pair is not part of the expected output grid.
    raise KeyError('no output line prepared for {}'.format(key))
  result_dicts[position_by_key[key]]['value'] = \
    '{},{},{}'.format(item['district_id'], item['timeslot'], final_predictions[index])
  if index % 200 == 0:
    print('Done {}/{}...'.format(index, tableresult_f.length))

# One CSV line per (slot, district) pair, in grid order.
result = '\n'.join(entry['value'] for entry in result_dicts)
bucketname = 'datalab-projects-1331-datalab'
itempath = 'result/final_result.csv'
print('Done, now writing to gs://{}/{}'.format(bucketname, itempath))
item = storage.Item(bucketname, itempath)
item.write_to(result, 'text/plain')

In [None]:
# Print the complete CSV text held in `result` so it can be inspected in the notebook output.
print(result)