In [7]:
from sklearn.decomposition import PCA
import pandas as pd
import gcp.bigquery as bq
import numpy as np
from datetime import date
from datetime import timedelta

import pdb

poi_n_range = 10
poi_schema = []
for i in range(poi_n_range):
  poi_n = i+1
  poi_schema.append({'name': 'poi{}'.format(poi_n), 'type': 'FLOAT'})
  
prev_n_range = 5
prev_gap_schema = []
for i in range(prev_n_range):
  prev_n = i+1
  prev_gap_schema.append({'name': 'previous_gap{}'.format(prev_n), 'type': 'INTEGER'})

# Gaps Table
First, make a simple **gaps** table, then fill it with data from other tables.

In [None]:
%%bigquery udf -m orders_create_additional_fields
/**
 * Pad with 0 or given string.
 *
 * @param int n Number to add padding to.
 * @param int width Width of number + padding.
 * @param string z (Optional) Other string to replace '0' as padding.
 */
function pad(n, width, z) {
  z = z || '0';
  n = n + '';
  return n.length >= width ? n : new Array(width - n.length + 1).join(z) + n;
}

/**
 * Create additional fields on orders table for gaps table creation.
 *
 * @param {{order_id: string, driver_id: string, passenger_id: string,
            start_district_hash: string, dest_district_hash: string, price: float,
            time: string}} r
 * @param function({{order_id: string, driver_id: string, passenger_id: string,
                     start_district_hash: string, dest_district_hash: string, price: float,
                     time: string, timeslot: string, timeofday_slot: integer, day_in_week: integer,
                     date: string}}) emitFn
 */
function(r, emitFn) {
  var t = r.time.split(/[ :\-]/);
  var slot = Math.floor((parseInt(t[3]) * 60 + parseInt(t[4])) / 10) + 1;
  r.timeslot = t[0] + '-' + pad(t[1], 2) +
               '-' + pad(t[2], 2) + '-' + slot;
  r.timeofday_slot = slot;
  r.date = t[0] + '-' + pad(t[1], 2) + '-' + pad(t[2], 2);
  r.day_in_week = new Date(parseInt(t[0]), parseInt(t[1])-1, parseInt(t[2])).getDay();
  emitFn(r);
}

In [None]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.gaps -m create
SELECT district_id, FIRST(orders.timeslot) AS timeslot, FIRST(orders.date) AS date,
  FIRST(day_in_week) AS day_in_week, FIRST(timeofday_slot) AS timeofday_slot,
  SUM(price) AS sum_price, AVG(price) AS avg_price,
  SUM(IF(driver_id = 'NULL', 1, 0)) AS gap
FROM orders_create_additional_fields([datalab-projects-1331:xjk_algo_comp.orders]) AS orders
JOIN [datalab-projects-1331:xjk_algo_comp.districts] AS districts 
  ON orders.start_district_hash = districts.district_hash
GROUP BY district_id, orders.timeslot

In [None]:
%% sql -q tester
SELECT SUM(price) FROM datalab-projects-1331:xjk_algo_comp.orders
WHERE start_district_hash = 'd4ec2125aff74eded207d2d915ef682f'
  AND (time LIKE "%2016-01-01 00:5%")
  
# Reference only. Use above query to check if aggregation was correct, by comparing the
# result of that from sum of district_hash = 51 and timeslot "2016-01-01-6", which 
# should be 126498.0.

# +1 timeslot
We are going to use past data to predict the gaps for the next 10 minutes (or in other words, the next timeslot). To do this, we need to update timeslot related information. Specifically we are going to do the following:
1. Add `timeslot` feature by one:
  - '2016-01-45' to '2016-01-46'
  - '2016-01-144' to '2016-02-1'
2. Update `date` and `slot` features accordingly.
3. Recalculate `day_in_week` feature.

In [None]:
%%bigquery udf --module gaps_plus_one_timeslot
  
// Use this when testing
// %%javascript

/**
 * Pad with 0 or given string.
 *
 * @param int n Number to add padding to.
 * @param int width Width of number + padding.
 * @param string z (Optional) Other string to replace '0' as padding.
 */
function pad(n, width, z) {
  z = z || '0';
  n = n + '';
  return n.length >= width ? n : new Array(width - n.length + 1).join(z) + n;
}

/**
 * Add one timeslot and adjust other relevant tables.
 *
 * @param {{district_id: integer, timeslot: string, date: string, day_in_week: integer,
            timeofday_slot: integer, sum_price: float, avg_price: float, gap: integer}} r
 * @param function({{district_id: integer, timeslot: string, date: string, day_in_week: integer,
                timeofday_slot: integer, sum_price: float, avg_price: float, gap: integer,
                timeslot_original: string}}) emitFn
 */
function(r, emitFn) {
  
  var t = r.timeslot.split(/-/);
  var oldslot = parseInt(t[3]);
  var newslot = oldslot + 1;
  var d = new Date(parseInt(t[0]), parseInt(t[1])-1, parseInt(t[2]));
  if (oldslot == 144) {
    newslot = 1;
    d = new Date(d.setDate(d.getDate() + 1));
    r.day_in_week = d.getDay();
    r.date = d.getFullYear() + '-' + pad(d.getMonth()+1, 2) +
             '-' + pad(d.getDate(), 2);
  }
  
  r.timeslot_original = r.timeslot;

  r.timeslot = d.getFullYear() + '-' + pad(d.getMonth()+1, 2) +
               '-' + pad(d.getDate(), 2) + '-' + newslot;

  r.timeofday_slot = newslot;
  emitFn(r);
}

// For testing (without `new Date()` after date added by 1 setDate returns milliseconds)

// var test_row = {
//   district_id: 1,
//   timeslot: '2016-01-22-144',
//   date: '2016-01-22',
//   day_in_week: 5,
//   timeofday_slot: 144,
//   sum_price: 0.0,
//   avg_price: 0.0,
//   gap: 11,
// };

// function emitter(r) {
//   for (var p in r) {
//     element.append(p + '=' + r[p] + '<br>');
//   }
// }

// udf(test_row, emitter);

In [None]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.future_gaps -m create

SELECT district_id, timeslot, date, day_in_week, timeofday_slot, sum_price, avg_price, gap,
  timeslot_original
FROM gaps_plus_one_timeslot([datalab-projects-1331:xjk_algo_comp.gaps])

# Gaps Table + Other Tables

In [None]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.future_gaps_processed -m create

SELECT FIRST(gaps.district_id) AS district_id, FIRST(gaps.timeslot) AS timeslot, FIRST(gaps.date) AS date,
  FIRST(gaps.day_in_week) AS day_in_week, IF(FIRST(gaps.day_in_week) = 0, 1, 0) AS is_sunday,
  FIRST(gaps.timeofday_slot) AS timeofday_slot,
  FIRST(gaps.sum_price) AS sum_price, FIRST(gaps.avg_price) AS avg_price, FIRST(gaps.gap) AS gap,
  FIRST(pois.f1) AS f1, FIRST(pois.f11) AS f11, FIRST(pois.f11_1) AS f11_1, 
  FIRST(pois.f11_2) AS f11_2, FIRST(pois.f11_3) AS f11_3, FIRST(pois.f11_4) AS f11_4, 
  FIRST(pois.f11_5) AS f11_5, FIRST(pois.f11_6) AS f11_6, FIRST(pois.f11_7) AS f11_7, 
  FIRST(pois.f11_8) AS f11_8, FIRST(pois.f13_4) AS f13_4, FIRST(pois.f13_8) AS f13_8, 
  FIRST(pois.f14) AS f14, FIRST(pois.f14_1) AS f14_1, FIRST(pois.f14_10) AS f14_10, 
  FIRST(pois.f14_2) AS f14_2, FIRST(pois.f14_3) AS f14_3, FIRST(pois.f14_6) AS f14_6, 
  FIRST(pois.f14_8) AS f14_8, FIRST(pois.f15) AS f15, FIRST(pois.f15_1) AS f15_1, 
  FIRST(pois.f15_2) AS f15_2, FIRST(pois.f15_3) AS f15_3, FIRST(pois.f15_4) AS f15_4, 
  FIRST(pois.f15_6) AS f15_6, FIRST(pois.f15_7) AS f15_7, FIRST(pois.f15_8) AS f15_8, 
  FIRST(pois.f16) AS f16, FIRST(pois.f16_1) AS f16_1, FIRST(pois.f16_10) AS f16_10, 
  FIRST(pois.f16_11) AS f16_11, FIRST(pois.f16_12) AS f16_12, FIRST(pois.f16_3) AS f16_3, 
  FIRST(pois.f16_4) AS f16_4, FIRST(pois.f16_6) AS f16_6, FIRST(pois.f17) AS f17, 
  FIRST(pois.f17_2) AS f17_2, FIRST(pois.f17_3) AS f17_3, FIRST(pois.f17_4) AS f17_4, 
  FIRST(pois.f17_5) AS f17_5, FIRST(pois.f19) AS f19, FIRST(pois.f19_1) AS f19_1, 
  FIRST(pois.f19_2) AS f19_2, FIRST(pois.f19_3) AS f19_3, FIRST(pois.f19_4) AS f19_4, 
  FIRST(pois.f1_1) AS f1_1, FIRST(pois.f1_10) AS f1_10, FIRST(pois.f1_11) AS f1_11, 
  FIRST(pois.f1_2) AS f1_2, FIRST(pois.f1_3) AS f1_3, FIRST(pois.f1_4) AS f1_4, 
  FIRST(pois.f1_5) AS f1_5, FIRST(pois.f1_6) AS f1_6, FIRST(pois.f1_7) AS f1_7, 
  FIRST(pois.f1_8) AS f1_8, FIRST(pois.f20) AS f20, FIRST(pois.f20_1) AS f20_1, 
  FIRST(pois.f20_2) AS f20_2, FIRST(pois.f20_4) AS f20_4, FIRST(pois.f20_5) AS f20_5, 
  FIRST(pois.f20_6) AS f20_6, FIRST(pois.f20_7) AS f20_7, FIRST(pois.f20_8) AS f20_8, 
  FIRST(pois.f20_9) AS f20_9, FIRST(pois.f21_1) AS f21_1, FIRST(pois.f21_2) AS f21_2, 
  FIRST(pois.f22) AS f22, FIRST(pois.f22_1) AS f22_1, FIRST(pois.f22_2) AS f22_2, 
  FIRST(pois.f22_3) AS f22_3, FIRST(pois.f22_4) AS f22_4, FIRST(pois.f22_5) AS f22_5, 
  FIRST(pois.f23) AS f23, FIRST(pois.f23_1) AS f23_1, FIRST(pois.f23_2) AS f23_2, 
  FIRST(pois.f23_3) AS f23_3, FIRST(pois.f23_4) AS f23_4, FIRST(pois.f23_5) AS f23_5, 
  FIRST(pois.f23_6) AS f23_6, FIRST(pois.f24) AS f24, FIRST(pois.f24_1) AS f24_1, 
  FIRST(pois.f24_2) AS f24_2, FIRST(pois.f24_3) AS f24_3, FIRST(pois.f25) AS f25, 
  FIRST(pois.f25_1) AS f25_1, FIRST(pois.f25_3) AS f25_3, FIRST(pois.f25_7) AS f25_7, 
  FIRST(pois.f25_8) AS f25_8, FIRST(pois.f25_9) AS f25_9, FIRST(pois.f2_1) AS f2_1, 
  FIRST(pois.f2_10) AS f2_10, FIRST(pois.f2_11) AS f2_11, FIRST(pois.f2_12) AS f2_12, 
  FIRST(pois.f2_13) AS f2_13, FIRST(pois.f2_2) AS f2_2, FIRST(pois.f2_4) AS f2_4, 
  FIRST(pois.f2_5) AS f2_5, FIRST(pois.f2_6) AS f2_6, FIRST(pois.f2_7) AS f2_7, 
  FIRST(pois.f2_8) AS f2_8, FIRST(pois.f3_1) AS f3_1, FIRST(pois.f3_2) AS f3_2, 
  FIRST(pois.f3_3) AS f3_3, FIRST(pois.f4) AS f4, FIRST(pois.f4_1) AS f4_1, 
  FIRST(pois.f4_10) AS f4_10, FIRST(pois.f4_11) AS f4_11, FIRST(pois.f4_13) AS f4_13, 
  FIRST(pois.f4_14) AS f4_14, FIRST(pois.f4_16) AS f4_16, FIRST(pois.f4_17) AS f4_17, 
  FIRST(pois.f4_18) AS f4_18, FIRST(pois.f4_2) AS f4_2, FIRST(pois.f4_3) AS f4_3, 
  FIRST(pois.f4_5) AS f4_5, FIRST(pois.f4_6) AS f4_6, FIRST(pois.f4_7) AS f4_7, 
  FIRST(pois.f4_8) AS f4_8, FIRST(pois.f4_9) AS f4_9, FIRST(pois.f5) AS f5, 
  FIRST(pois.f5_1) AS f5_1, FIRST(pois.f5_3) AS f5_3, FIRST(pois.f5_4) AS f5_4, 
  FIRST(pois.f6) AS f6, FIRST(pois.f6_1) AS f6_1, FIRST(pois.f6_2) AS f6_2, 
  FIRST(pois.f6_4) AS f6_4, FIRST(pois.f7) AS f7, FIRST(pois.f8) AS f8, 
  FIRST(pois.f8_1) AS f8_1, FIRST(pois.f8_2) AS f8_2, FIRST(pois.f8_3) AS f8_3, 
  FIRST(pois.f8_4) AS f8_4, FIRST(pois.f8_5) AS f8_5,
  FIRST(weather.weather) AS weather,
  FIRST(weather.temperature) AS weather_temperature, FIRST(weather.pm25) AS weather_pm25,
  FIRST(traffic.tj_level1) AS traffic_tj_level1, FIRST(traffic.tj_level2) AS traffic_tj_level2,
  FIRST(traffic.tj_level3) AS traffic_tj_level3, FIRST(traffic.tj_level4) AS traffic_tj_level4
FROM [datalab-projects-1331:xjk_algo_comp.future_gaps] as gaps
LEFT JOIN [datalab-projects-1331:xjk_algo_comp.districts] as districts
  ON districts.district_id = gaps.district_id
LEFT JOIN [datalab-projects-1331:xjk_algo_comp.pois] as pois
  ON pois.district_hash = districts.district_hash
LEFT JOIN [datalab-projects-1331:xjk_algo_comp.weather] as weather
  ON weather.timeslot = gaps.timeslot
LEFT JOIN [datalab-projects-1331:xjk_algo_comp.traffic] as traffic
  ON traffic.timeslot = gaps.timeslot
  AND traffic.district_hash = districts.district_hash
GROUP BY gaps.district_id, gaps.timeslot

# PCA
Combine POIs (features starting with 'f...') into n number of features.

In [None]:
pois = """
f1	f11	f11_1	f11_2	f11_3	f11_4	f11_5	f11_6	f11_7	f11_8	f13_4	f13_8	
f14	f14_1	f14_10	f14_2	f14_3	f14_6	f14_8	f15	f15_1	f15_2	f15_3	f15_4	
f15_6	f15_7	f15_8	f16	f16_1	f16_10	f16_11	f16_12	f16_3	f16_4	f16_6	f17	
f17_2	f17_3	f17_4	f17_5	f19	f19_1	f19_2	f19_3	f19_4	f1_1	f1_10	f1_11	
f1_2	f1_3	f1_4	f1_5	f1_6	f1_7	f1_8	f20	f20_1	f20_2	f20_4	f20_5	
f20_6	f20_7	f20_8	f20_9	f21_1	f21_2	f22	f22_1	f22_2	f22_3	f22_4	f22_5	
f23	f23_1	f23_2	f23_3	f23_4	f23_5	f23_6	f24	f24_1	f24_2	f24_3	f25	f25_1	
f25_3	f25_7	f25_8	f25_9	f2_1	f2_10	f2_11	f2_12	f2_13	f2_2	f2_4	f2_5	
f2_6	f2_7	f2_8	f3_1	f3_2	f3_3	f4	f4_1	f4_10	f4_11	f4_13	f4_14	
f4_16	f4_17	f4_18	f4_2	f4_3	f4_5	f4_6	f4_7	f4_8	f4_9	f5	f5_1	
f5_3	f5_4	f6	f6_1	f6_2	f6_4	f7	f8	f8_1	f8_2	f8_3	f8_4	f8_5
"""
pois = map(lambda x: "{}".format(x.strip()), pois.split('\t'))
print(pois)

In [20]:
%%sql --module q
SELECT * FROM [datalab-projects-1331:xjk_algo_comp.future_gaps_processed]

In [21]:
query = bq.Query(q)
tableresult = query.results()

poi_fields = ['f1', 'f11', 'f11_1', 'f11_2', 'f11_3', 'f11_4', 'f11_5', 'f11_6', 'f11_7', 'f11_8',
              'f13_4', 'f13_8', 'f14', 'f14_1', 'f14_10', 'f14_2', 'f14_3', 'f14_6', 'f14_8', 'f15',
              'f15_1', 'f15_2', 'f15_3', 'f15_4', 'f15_6', 'f15_7', 'f15_8', 'f16', 'f16_1', 'f16_10',
              'f16_11', 'f16_12', 'f16_3', 'f16_4', 'f16_6', 'f17', 'f17_2', 'f17_3', 'f17_4', 'f17_5',
              'f19', 'f19_1', 'f19_2', 'f19_3', 'f19_4', 'f1_1', 'f1_10', 'f1_11', 'f1_2', 'f1_3', 'f1_4',
              'f1_5', 'f1_6', 'f1_7', 'f1_8', 'f20', 'f20_1', 'f20_2', 'f20_4', 'f20_5', 'f20_6', 'f20_7',
              'f20_8', 'f20_9', 'f21_1', 'f21_2', 'f22', 'f22_1', 'f22_2', 'f22_3', 'f22_4', 'f22_5',
              'f23', 'f23_1', 'f23_2', 'f23_3', 'f23_4', 'f23_5', 'f23_6', 'f24', 'f24_1', 'f24_2',
              'f24_3', 'f25', 'f25_1', 'f25_3', 'f25_7', 'f25_8', 'f25_9', 'f2_1', 'f2_10', 'f2_11',
              'f2_12', 'f2_13', 'f2_2', 'f2_4', 'f2_5', 'f2_6', 'f2_7', 'f2_8', 'f3_1', 'f3_2', 'f3_3',
              'f4', 'f4_1', 'f4_10', 'f4_11', 'f4_13', 'f4_14', 'f4_16', 'f4_17', 'f4_18', 'f4_2',
              'f4_3', 'f4_5', 'f4_6', 'f4_7', 'f4_8', 'f4_9', 'f5', 'f5_1', 'f5_3', 'f5_4', 'f6', 'f6_1',
              'f6_2', 'f6_4', 'f7', 'f8', 'f8_1', 'f8_2', 'f8_3', 'f8_4', 'f8_5']
all_fields = tableresult[0].keys()
all_data = []
pois_data = np.zeros((tableresult.length, len(poi_fields)))
print 'there are {} rows'.format(tableresult.length)
for rcounter, row in enumerate(tableresult):
  for fcounter, field in enumerate(poi_fields):
    pois_data[rcounter, fcounter] = row.pop(field, None)
  all_data.append(row)
  if rcounter % 5000 == 0:
    print 'processed {} rows'.format(rcounter)

there are 163491 rows
processed 0 rows
processed 5000 rows
processed 10000 rows
processed 15000 rows
processed 20000 rows
processed 25000 rows
processed 30000 rows
processed 35000 rows
processed 40000 rows
processed 45000 rows
processed 50000 rows
processed 55000 rows
processed 60000 rows
processed 65000 rows
processed 70000 rows
processed 75000 rows
processed 80000 rows
processed 85000 rows
processed 90000 rows
processed 95000 rows
processed 100000 rows
processed 105000 rows
processed 110000 rows
processed 115000 rows
processed 120000 rows
processed 125000 rows
processed 130000 rows
processed 135000 rows
processed 140000 rows
processed 145000 rows
processed 150000 rows
processed 155000 rows
processed 160000 rows


In [22]:
pca = PCA(n_components=poi_n_range)
pois_data_s = pca.fit_transform(pois_data)

In [25]:
for rcounter, row in enumerate(all_data):
  for index in range(pois_data_s.shape[1]):
    all_data[rcounter]['poi{}'.format(index+1)] = pois_data_s[rcounter, index]

In [27]:
schema = bq.Schema([
    {'name': 'district_id', 'type': 'INTEGER'},
    {'name': 'timeslot', 'type': 'STRING'},
    {'name': 'date', 'type': 'STRING'},
    {'name': 'timeofday_slot', 'type': 'INTEGER'},
    {'name': 'day_in_week', 'type': 'INTEGER'},
    {'name': 'is_sunday', 'type': 'INTEGER'},
    {'name': 'sum_price', 'type': 'FLOAT'},
    {'name': 'avg_price', 'type': 'FLOAT'},
    {'name': 'traffic_tj_level1', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level2', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level3', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level4', 'type': 'INTEGER'},
    {'name': 'weather', 'type': 'INTEGER'},
    {'name': 'weather_pm25', 'type': 'FLOAT'},
    {'name': 'weather_temperature', 'type': 'FLOAT'},
    {'name': 'gap', 'type': 'INTEGER'}
] + poi_schema)
table = bq.Table('datalab-projects-1331:xjk_algo_comp.future_gaps_final1')
if table.exists():
  table.delete()
table.create(schema)
  
table.insert_data(all_data)

district_id,timeslot,date,timeofday_slot,day_in_week,is_sunday,sum_price,avg_price,traffic_tj_level1,traffic_tj_level2,traffic_tj_level3,traffic_tj_level4,weather,weather_pm25,weather_temperature,gap,poi1,poi2,poi3,poi4,poi5,poi6,poi7,poi8,poi9,poi10
31,2016-01-01-132,2016-01-01,132,5,0,131.0,14.5555555556,630.0,60.0,14.0,14.0,2.0,128.0,8.0,4,-55528.6688413,-3665.78773112,121.680839099,-5696.16021106,-1436.15863563,1383.4872069,-4241.30111214,707.76087898,2489.72004366,-1447.51592168
12,2016-01-16-60,2016-01-16,60,6,0,1790.3,17.903,1654.0,313.0,73.0,81.0,8.0,117.0,3.0,2,128675.369122,31288.8410916,-14001.17762,1242.64062714,39493.874894,23091.7194032,7526.82480871,-12527.0442751,-2534.16757512,295.208702939
56,2016-01-20-71,2016-01-20,71,3,0,140.0,20.0,191.0,23.0,20.0,5.0,,,,4,-62654.031676,6028.48480374,-4445.54885145,-10283.3290636,5597.91594742,-709.945447157,730.248044767,1275.14169685,298.169958159,2850.16479881
38,2016-01-14-83,2016-01-14,83,4,0,245.0,24.5,361.0,68.0,20.0,16.0,2.0,125.0,8.0,2,-55675.9716833,197.557492556,1409.85979012,-5526.46183733,-1318.21198254,441.984514419,-1170.48133519,972.374122858,1210.75318309,964.328514639
38,2016-01-19-91,2016-01-19,91,2,0,375.1,34.1,318.0,86.0,11.0,7.0,2.0,103.0,5.0,3,-55675.9716833,197.557492556,1409.85979012,-5526.46183733,-1318.21198254,441.984514419,-1170.48133519,972.374122858,1210.75318309,964.328514639
45,2016-01-12-94,2016-01-12,94,2,0,105.8,35.2666666667,110.0,16.0,0.0,0.0,3.0,64.0,4.0,2,-66735.1264469,-3337.61570672,-5161.29421015,-10325.8106081,2348.68012892,-2748.10563634,-1872.7739758,2975.82511613,586.35026649,1090.32875473
1,2016-01-18-46,2016-01-18,46,1,0,1993.3,14.1368794326,1445.0,327.0,88.0,80.0,2.0,81.0,3.0,1,48245.8900598,-341.806380342,8231.71262987,29193.6731936,4288.37478157,-28909.2613655,-4401.50679455,-5676.78441475,-8417.26375319,-435.32474746
42,2016-01-20-1,2016-01-20,1,3,0,645.0,19.5454545455,,,,,,,,0,5159.04699367,25063.5464052,26556.7762355,14427.9257913,-21064.5834914,1391.51913019,1272.36295249,-10635.0527994,4077.15079438,1679.00450733
10,2016-01-13-58,2016-01-13,58,3,0,150.7,21.5285714286,160.0,14.0,0.0,2.0,2.0,84.0,3.0,4,-67611.826108,-898.594182802,-7635.17717761,-8032.08938774,5210.17826651,-2477.07503048,1930.20698948,929.994145541,-88.2044184257,738.211767452
32,2016-01-02-108,2016-01-02,108,6,0,130.0,18.5714285714,148.0,13.0,0.0,2.0,2.0,85.0,14.0,3,-61194.5683891,-1660.197622,-240.659554095,-7812.03838761,-314.309459331,951.826244726,-2217.22041198,271.075852592,1307.60256577,-446.363405432


# Replace NULL Values

In [28]:
all_data_df = pd.DataFrame(all_data)
all_data_df
all_data_df['traffic_tj_level1'] = all_data_df['traffic_tj_level1'].fillna(0).astype('int64')
all_data_df['traffic_tj_level2'] = all_data_df['traffic_tj_level2'].fillna(0).astype('int64')
all_data_df['traffic_tj_level3'] = all_data_df['traffic_tj_level3'].fillna(0).astype('int64')
all_data_df['traffic_tj_level4'] = all_data_df['traffic_tj_level4'].fillna(0).astype('int64')
all_data_df['weather'] = all_data_df['weather'].fillna(0).astype('int64')

schema = bq.Schema([
    {'name': 'district_id', 'type': 'INTEGER'},
    {'name': 'timeslot', 'type': 'STRING'},
    {'name': 'date', 'type': 'STRING'},
    {'name': 'timeofday_slot', 'type': 'INTEGER'},
    {'name': 'day_in_week', 'type': 'INTEGER'},
    {'name': 'is_sunday', 'type': 'INTEGER'},
    {'name': 'sum_price', 'type': 'FLOAT'},
    {'name': 'avg_price', 'type': 'FLOAT'},
    {'name': 'traffic_tj_level1', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level2', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level3', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level4', 'type': 'INTEGER'},
    {'name': 'weather', 'type': 'INTEGER'},
    {'name': 'weather_pm25', 'type': 'FLOAT'},
    {'name': 'weather_temperature', 'type': 'FLOAT'},
    {'name': 'gap', 'type': 'INTEGER'}
] + poi_schema)
table = bq.Table('datalab-projects-1331:xjk_algo_comp.future_gaps_final2')
if table.exists():
  table.delete()

table.create(schema)
table.insert_data(all_data_df)

district_id,timeslot,date,timeofday_slot,day_in_week,is_sunday,sum_price,avg_price,traffic_tj_level1,traffic_tj_level2,traffic_tj_level3,traffic_tj_level4,weather,weather_pm25,weather_temperature,gap,poi1,poi2,poi3,poi4,poi5,poi6,poi7,poi8,poi9,poi10
41,2016-01-13-73,2016-01-13,73,3,0,307.1,19.19375,263,45,15,1,2,88.0,3.0,4,-63794.1683046,-2381.66430634,-3549.84942555,-7280.20851261,1028.16747064,486.377506507,-634.336137762,799.451079387,409.697477256,-477.19839667
54,2016-01-04-92,2016-01-04,92,1,0,61.0,6.1,0,0,0,0,4,228.0,12.0,0,-48692.7906052,5023.94317324,4011.23332883,-1168.11560868,-7489.8740527,1615.01701483,501.629376494,-527.163601424,1293.42276079,1256.37766564
41,2016-01-04-71,2016-01-04,71,1,0,145.8,16.2,234,44,15,10,4,219.0,13.0,2,-63794.1683046,-2381.66430634,-3549.84942555,-7280.20851261,1028.16747064,486.377506507,-634.336137762,799.451079387,409.697477256,-477.19839667
15,2016-01-04-118,2016-01-04,118,1,0,25.0,25.0,14,0,0,0,8,226.0,12.0,1,-69547.2383967,-1652.83998074,-9158.31516363,-9795.09858722,5035.20467057,-2277.45799791,924.671330688,2018.60640191,579.425384609,1606.08439076
38,2016-01-19-60,2016-01-19,60,2,0,624.1,24.964,319,77,16,7,2,160.0,3.0,18,-55675.9716833,197.557492556,1409.85979012,-5526.46183733,-1318.21198254,441.984514419,-1170.48133519,972.374122858,1210.75318309,964.328514639
52,2016-01-08-122,2016-01-08,122,5,0,20.0,20.0,137,11,4,2,2,113.0,5.0,0,-68969.4479633,-1671.83235164,-7565.07036634,-8988.465006,3964.26299704,-1216.1386887,206.870223155,1574.80176883,1049.50729951,1388.86977149
29,2016-01-03-117,2016-01-03,117,0,1,769.8,19.7384615385,634,290,56,48,2,153.0,15.0,5,-47097.0663665,3307.05605661,7418.35302358,3174.13059945,-3171.53771642,1888.56676221,-3952.83862092,-3341.65383476,3770.59023866,1319.48775781
25,2016-01-19-136,2016-01-19,136,2,0,328.0,19.2941176471,842,131,38,24,2,112.0,2.0,0,-31927.3566485,17027.5306468,8299.89607968,3632.54114853,-1024.61841571,4767.02660797,6389.64024105,6162.75220713,-2138.41574365,166.993277365
28,2016-01-20-44,2016-01-20,44,3,0,1930.4,17.2357142857,1150,291,69,48,1,106.0,1.0,3,16820.5263554,-4039.83206677,-11239.117798,16800.5024686,14677.9847326,-3356.76961407,-2318.76486382,3049.11527692,-338.458103656,-6226.67627464
24,2016-01-06-82,2016-01-06,82,3,0,1200.9,20.015,1001,251,85,43,4,68.0,7.0,3,19523.2410099,4962.96782651,-1374.24953385,16082.7083821,3468.89739271,8432.75509471,1453.92654116,-3655.43865866,1804.03000189,2994.38127998


# Previous Gap
Add `previous_gap` field to data.

In [29]:
%%sql --module q

SELECT *
FROM [datalab-projects-1331:xjk_algo_comp.future_gaps_final2]
ORDER BY timeslot, district_id

In [None]:
query = bq.Query(q)
tableresult = query.results()

previous_gaps = {}
print 'there are {} rows'.format(tableresult.length)
for rcounter, row in enumerate(tableresult):
  previous_gaps['{}:{}'.format(row['timeslot'], row['district_id'])] = row['gap']
  if rcounter % 5000 == 0:
    print 'processed {} rows'.format(rcounter)

there are 163491 rows
processed 0 rows
processed 5000 rows
processed 10000 rows
processed 15000 rows
processed 20000 rows
processed 25000 rows
processed 30000 rows
processed 35000 rows
processed 40000 rows
processed 45000 rows
processed 50000 rows
processed 55000 rows
processed 60000 rows
processed 65000 rows
processed 70000 rows
processed 75000 rows


In [None]:
all_data = []
print 'there are {} rows'.format(tableresult.length)
for rcounter, row in enumerate(tableresult):
  timeslot_r = map(lambda x: int(x), row['timeslot'].split('-'))
  for i in range(prev_n_range):
    prev_n = i+1
    slot = timeslot_r[3] - 1
    date_obj = date(timeslot_r[0], timeslot_r[1], timeslot_r[2])
    if slot <= 0:
      slot = 144
      date_obj = date_obj - timedelta(days=1)
    prev_timestamp = '{}-{}-{}-{}'.format(
      date_obj.year, str(date_obj.month).zfill(2), str(date_obj.day).zfill(2), slot)
    prev_index = '{}:{}'.format(prev_timestamp, row['district_id'])
    row['previous_gap{}'.format(prev_n)] = 0
    if prev_index in previous_gaps:
      row['previous_gap{}'.format(prev_n)] = previous_gaps[prev_index]
  all_data.append(row)
  if rcounter % 5000 == 0:
    print 'processed {} rows'.format(rcounter)

In [13]:
schema = bq.Schema([
    {'name': 'district_id', 'type': 'INTEGER'},
    {'name': 'timeslot', 'type': 'STRING'},
    {'name': 'date', 'type': 'STRING'},
    {'name': 'timeofday_slot', 'type': 'INTEGER'},
    {'name': 'day_in_week', 'type': 'INTEGER'},
    {'name': 'is_sunday', 'type': 'INTEGER'},
    {'name': 'sum_price', 'type': 'FLOAT'},
    {'name': 'avg_price', 'type': 'FLOAT'},
    {'name': 'traffic_tj_level1', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level2', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level3', 'type': 'INTEGER'},
    {'name': 'traffic_tj_level4', 'type': 'INTEGER'},
    {'name': 'weather', 'type': 'INTEGER'},
    {'name': 'weather_pm25', 'type': 'FLOAT'},
    {'name': 'weather_temperature', 'type': 'FLOAT'},
    {'name': 'gap', 'type': 'INTEGER'}
] + poi_schema + prev_gap_schema)
table = bq.Table('datalab-projects-1331:xjk_algo_comp.future_gaps_final3')
if table.exists():
  table.delete()
table.create(schema)
table.insert_data(all_data)

district_id,timeslot,date,timeofday_slot,day_in_week,is_sunday,sum_price,avg_price,traffic_tj_level1,traffic_tj_level2,traffic_tj_level3,traffic_tj_level4,weather,weather_pm25,weather_temperature,gap,poi1,poi2,poi3,poi4,poi5,poi6,poi7,poi8,poi9,poi10,previous_gap1,previous_gap2,previous_gap3,previous_gap4,previous_gap5
42,2016-01-04-16,2016-01-04,16,1,0,414.0,29.5714285714,1006,87,26,25,2,202.0,13.0,1,5159.04699367,25063.5464052,26556.7762355,14427.9257913,-21064.5834914,1391.51913019,1272.36295249,-10635.0527994,4077.15079438,1679.00450733,0,0,0,0,0
11,2016-01-04-93,2016-01-04,93,1,0,196.5,16.375,721,209,67,38,4,228.0,12.0,0,-33769.1898693,-1151.34828854,18533.2250738,6789.14501789,-23522.0351577,7287.98304105,-1428.28651003,-1332.11359029,3062.1052634,822.533561102,0,0,0,0,0
48,2016-01-08-117,2016-01-08,117,5,0,3686.3,16.0273913043,2305,779,196,142,2,111.0,5.0,12,303528.02255,14673.1668546,-108784.562531,-13890.9635498,-32547.7300861,-2243.11996941,2666.3170299,-5703.47445931,-4761.85534916,-1191.15386499,0,0,0,0,0
42,2016-01-17-139,2016-01-17,139,0,1,1034.1,18.4660714286,1299,292,45,60,3,96.0,3.0,1,5159.04699367,25063.5464052,26556.7762355,14427.9257913,-21064.5834914,1391.51913019,1272.36295249,-10635.0527994,4077.15079438,1679.00450733,0,0,0,0,0
51,2016-01-06-36,2016-01-06,36,3,0,900.0,29.0322580645,3349,400,69,83,4,46.0,7.0,7,708222.866131,56252.6432525,45318.9443189,-31516.3520766,6643.82498022,-6255.77998574,-5874.62782275,5230.07088296,960.212337132,925.726201223,4,4,4,4,4
33,2016-01-01-63,2016-01-01,63,5,0,298.0,18.625,213,61,16,18,2,186.0,5.0,7,-57374.1656061,3331.11578506,-2333.90366109,-3774.5385603,761.532298242,640.505156553,3157.74923423,2164.46583818,929.502327117,-422.925590902,0,0,0,0,0
11,2016-01-12-58,2016-01-12,58,2,0,540.0,20.7692307692,767,159,53,27,2,47.0,4.0,0,-33769.1898693,-1151.34828854,18533.2250738,6789.14501789,-23522.0351577,7287.98304105,-1428.28651003,-1332.11359029,3062.1052634,822.533561102,0,0,0,0,0
32,2016-01-15-49,2016-01-15,49,5,0,56.0,18.6666666667,165,19,8,1,2,151.0,4.0,0,-61194.5683891,-1660.197622,-240.659554095,-7812.03838761,-314.309459331,951.826244726,-2217.22041198,271.075852592,1307.60256577,-446.363405432,0,0,0,0,0
50,2016-01-15-91,2016-01-15,91,5,0,55.0,55.0,184,17,15,0,2,84.0,11.0,0,-61876.0995581,-1357.49437049,-2077.24967252,-7257.02419144,794.360329292,1241.74877553,-535.387910289,2506.10742456,448.09618372,280.719551554,0,0,0,0,0
59,2016-01-04-75,2016-01-04,75,1,0,29.0,9.66666666667,148,15,4,2,4,219.0,12.0,1,-64364.6353441,-1598.17517753,-6410.65516565,-7173.08461937,2429.5624518,-1025.4069752,1553.93575592,1311.01084431,1011.70315328,-51.4614331857,0,0,0,0,0
