In [1]:
import gcp.bigquery as bq
import gcp.storage as storage
import numpy as np
# Prefer the C-accelerated pickle module when the interpreter ships one and
# fall back to the pure-Python implementation otherwise. Only a missing module
# is tolerated; any other failure during import is allowed to surface.
try:
   import cPickle as pickle
except ImportError:
   import pickle
# File name of the pickled estimator that is loaded for the final prediction.
EST_PICKLE_FILENAME = 'GradientBoostingRegressor_grid_best.pkl'

# Column names read from every BigQuery result row, in matrix-column order.
# First field HAS to be 'gap' (it is split off as the target column).
# NOTE(review): the original comment also required the first feature to be
# 'district_id' for MAPE calculation, but the list below contains no
# 'district_id' and the first feature is 'day_in_week' -- confirm whether that
# requirement is stale before relying on it.
fields_str = """
gap	day_in_week	weather_1_slots_ago	weather_2_slots_ago	weather_3_slots_ago	busy_time	
tj_level1_1_slots_ago	tj_level2_1_slots_ago	tj_level3_1_slots_ago	tj_level4_1_slots_ago	
tj_level1_2_slots_ago	tj_level2_2_slots_ago	tj_level3_2_slots_ago	tj_level4_2_slots_ago	
tj_level1_3_slots_ago	tj_level2_3_slots_ago	tj_level3_3_slots_ago	tj_level4_3_slots_ago	
temperature_1_slots_ago	pm25_1_slots_ago	
temperature_2_slots_ago	pm25_2_slots_ago	
temperature_3_slots_ago	pm25_3_slots_ago	
gap_1_slots_ago	sum_price_1_slots_ago	
gap_2_slots_ago	sum_price_2_slots_ago	
gap_3_slots_ago	sum_price_3_slots_ago	
f1	f11	f11_1	f11_2	f11_3	f11_4	f11_5	f11_6	f11_7	
f11_8	f13_4	f13_8	f14	f14_1	f14_10	f14_2	f14_3	f14_6	f14_8	f15	f15_1	
f15_2	f15_3	f15_4	f15_6	f15_7	f15_8	f16	f16_1	f16_10	f16_11	f16_12	f16_3	
f16_4	f16_6	f17	f17_2	f17_3	f17_4	f17_5	f19	f19_1	f19_2	f19_3	f19_4	f1_1	
f1_10	f1_11	f1_2	f1_3	f1_4	f1_5	f1_6	f1_7	f1_8	f20	f20_1	f20_2	
f20_4	f20_5	f20_6	f20_7	f20_8	f20_9	f21_1	f21_2	f22	f22_1	f22_2	f22_3	
f22_4	f22_5	f23	f23_1	f23_2	f23_3	f23_4	f23_5	f23_6	f24	f24_1	f24_2	f24_3	
f25	f25_1	f25_3	f25_7	f25_8	f25_9	f2_1	f2_10	f2_11	f2_12	f2_13	f2_2	
f2_4	f2_5	f2_6	f2_7	f2_8	f3_1	f3_2	f3_3	f4	f4_1	f4_10	f4_11	
f4_13	f4_14	f4_16	f4_17	f4_18	f4_2	f4_3	f4_5	f4_6	f4_7	f4_8	f4_9	
f5	f5_1	f5_3	f5_4	f6	f6_1	f6_2	f6_4	f7	f8	f8_1	f8_2	f8_3	f8_4	
f8_5
"""
# Split on any whitespace run rather than on '\t' alone: a line of the literal
# that loses its trailing tab would otherwise glue two names together around
# the newline. str.split() also drops the empty leading/trailing pieces, and
# it returns a real list, so the slice below works on every Python version.
fields = fields_str.split()
assert fields[0] == 'gap', "first field must be the 'gap' target column"
features = fields[1:]

# Prepare Final Dataset

In [2]:
%%sql --module q_all_f

SELECT
  *
FROM
  [datalab-projects-1331:xjk_algo_comp_test.gaps]
ORDER BY
  timeslot,
  district_id

# Final dataset - Used in final submission.

In [3]:
# Execute the q_all_f query and copy the selected columns of every result row
# into a dense float matrix: one matrix row per result row, one matrix column
# per entry of `fields` (same order).
query_f = bq.Query(q_all_f)
tableresult_f = query_f.results()

row_total = tableresult_f.length
all_data_f = np.zeros((row_total, len(fields)))
print('there are {} rows'.format(row_total))
for row_idx, record in enumerate(tableresult_f):
  for col_idx, column_name in enumerate(fields):
    all_data_f[row_idx, col_idx] = record[column_name]
  # Progress marker every 1000 rows.
  if row_idx % 1000 == 0:
    print('processed {} rows'.format(row_idx))

# Replace every NaN cell with 0 before the matrix is split.
nan_mask = np.isnan(all_data_f)
all_data_f[nan_mask] = 0

# Column 0 (the first entry of `fields`) is the target; the rest are features.
targets_final = all_data_f[:, 0]
data_final = all_data_f[:, 1:]

there are 2838 rows
processed 0 rows
processed 1000 rows
processed 2000 rows


# Run Prediction and Export CSV

In [4]:
# Load the trained estimator from disk.
# Pickle data is binary, so the file is opened in 'rb' mode, and the `with`
# block closes the handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code -- only load estimator files
# that come from a trusted source.
with open(EST_PICKLE_FILENAME, 'rb') as est_file:
  final_est = pickle.load(est_file)

In [5]:
%%sql --module q_districts

SELECT district_id
FROM [datalab-projects-1331:xjk_algo_comp_test.districts]
ORDER BY district_id

In [6]:
# Run the q_districts query and collect the 'district_id' value of every
# result row, preserving the order in which the rows are returned.
query_districts = bq.Query(q_districts)
tableresult_districts = query_districts.results()
districts = []
for district_row in tableresult_districts:
  districts.append(district_row['district_id'])

In [7]:
# Predict one value per row of `data_final`. Predictions are matched to table
# rows by position below, so `final_predictions[i]` is paired with the i-th
# row yielded when iterating `tableresult_f`.
final_predictions = final_est.predict(data_final)

bucketname = 'datalab-projects-1331-datalab'

# The requested timeslots are stored one per line in a Cloud Storage object.
slots_item = storage.Item(bucketname, 'data/timeslots_to_test2.txt')
slots = [line.strip() for line in slots_item.read_from().strip().split('\n')]

# Build one output line per (slot, district) pair with a default prediction of
# 0.0, and remember where each "district:slot" key sits so the real
# predictions can be written in with a dictionary lookup instead of a linear
# scan over the whole result list for every table row.
print('Preparing results...')
result_dicts = []
position_by_key = {}
for slot in slots:
  for district in districts:
    key = '{}:{}'.format(district, slot)
    # setdefault keeps the first position should a key ever repeat, which is
    # the entry a front-to-back scan would have found.
    position_by_key.setdefault(key, len(result_dicts))
    result_dicts.append({'key': key,
                         'value': '{},{},{}'.format(district, slot, 0.0)})

print('Replacing predictions...')
for index, row in enumerate(tableresult_f):
  key = '{}:{}'.format(row['district_id'], row['timeslot'])
  if key not in position_by_key:
    # A table row with no matching output line means the table and the
    # timeslot/district lists disagree; fail with a message naming the row.
    raise KeyError('no output line for district:timeslot {!r} '
                   '(table row {})'.format(key, index))
  result_dicts[position_by_key[key]]['value'] = '{},{},{}'.format(
    row['district_id'], row['timeslot'], final_predictions[index])
  if index % 200 == 0:
    print('Done {}/{}...'.format(index, tableresult_f.length))

# Join the lines (no trailing newline) and upload them as a text object.
result = '\n'.join(entry['value'] for entry in result_dicts)
itempath = 'result/final_result_baseline.csv'
print('Done, now writing to gs://{}/{}'.format(bucketname, itempath))
item = storage.Item(bucketname, itempath)
item.write_to(result, 'text/plain')

Preparing results...
Replacing predictions...
Done 0/2838...
Done 200/2838...
Done 400/2838...
Done 600/2838...
Done 800/2838...
Done 1000/2838...
Done 1200/2838...
Done 1400/2838...
Done 1600/2838...
Done 1800/2838...
Done 2000/2838...
Done 2200/2838...
Done 2400/2838...
Done 2600/2838...
Done 2800/2838...
Done, now writing to gs://datalab-projects-1331-datalab/result/final_result_baseline.csv


In [8]:
# Echo the full CSV text that was written to Cloud Storage by the previous
# cell so it can be checked by eye in the notebook.
print(result)

1,2016-01-23-46,7.0
2,2016-01-23-46,1.0
3,2016-01-23-46,3.0
4,2016-01-23-46,1.0
5,2016-01-23-46,1.0
6,2016-01-23-46,3.0
7,2016-01-23-46,1.0
8,2016-01-23-46,7.0
9,2016-01-23-46,6.0
10,2016-01-23-46,2.0
11,2016-01-23-46,5.0
12,2016-01-23-46,6.0
13,2016-01-23-46,1.0
14,2016-01-23-46,8.0
15,2016-01-23-46,5.0
16,2016-01-23-46,3.0
17,2016-01-23-46,2.0
18,2016-01-23-46,4.0
19,2016-01-23-46,8.0
20,2016-01-23-46,5.0
21,2016-01-23-46,1.0
22,2016-01-23-46,1.0
23,2016-01-23-46,27.0
24,2016-01-23-46,22.0
25,2016-01-23-46,1.0
26,2016-01-23-46,10.0
27,2016-01-23-46,15.0
28,2016-01-23-46,28.0
29,2016-01-23-46,4.0
30,2016-01-23-46,1.0
31,2016-01-23-46,6.0
32,2016-01-23-46,6.0
33,2016-01-23-46,1.0
34,2016-01-23-46,2.0
35,2016-01-23-46,2.0
36,2016-01-23-46,1.0
37,2016-01-23-46,7.0
38,2016-01-23-46,1.0
39,2016-01-23-46,2.0
40,2016-01-23-46,1.0
41,2016-01-23-46,1.0
42,2016-01-23-46,15.0
43,2016-01-23-46,2.0
44,2016-01-23-46,4.0
45,2016-01-23-46,4.0
46,2016-01-23-46,15.0
47,2016-01-23-46,4.0
48,2016-01-23-4