In [1]:
import gcp.storage as storage
import gcp.bigquery as bq
from datetime import date
import pandas as pd
import numpy as np
from datetime import date
from datetime import timedelta
import os
import pdb
prev_n_range = 3

def calc_past_timeslot(timeslot, slots, slots_per_day=144):
  """Return the timeslot lying `slots` slots before `timeslot`.

  Args:
    timeslot: string of the form 'YYYY-MM-DD-S' where S is the 1-based slot
      number within the day (1..slots_per_day).
    slots: how many slots to step back; 0 returns the same timeslot.
    slots_per_day: number of slots in one day (144 in the orders table).

  Returns:
    The earlier timeslot in the same 'YYYY-MM-DD-S' format (month and day
    zero-padded, slot not padded). Stepping back past slot 1 continues at the
    end of the previous day, e.g. 3 slots before '2016-01-02-1' is
    '2016-01-01-142'.
  """
  # A list (not a lazy `map` object) so the parts can be indexed on Python 3 too.
  parts = [int(part) for part in timeslot.split('-')]
  year, month, day, slot = parts[:4]
  # Work on a zero-based slot index: floor division then yields the (possibly
  # negative) day offset and the slot inside that day in a single step, instead
  # of clamping every roll-over to the last slot of the previous day.
  day_offset, slot_index = divmod(slot - 1 - slots, slots_per_day)
  date_obj = date(year, month, day) + timedelta(days=day_offset)
  return '{}-{}-{}-{}'.format(
    date_obj.year, str(date_obj.month).zfill(2), str(date_obj.day).zfill(2), slot_index + 1)

# Gaps Table Creation for Training
First, here is an overview of the `timeslot` field in the **orders** table:

In [2]:
%%sql --module q_timeslots

SELECT FIRST(timeslot) AS timeslot 
FROM [datalab-projects-1331:xjk_algo_comp.orders] AS orders
GROUP BY date, timeofday_slot
ORDER BY date, timeofday_slot

# Should be the same with GROUP BY timeslot

In [3]:
# Run the q_timeslots query defined in the previous cell and load its result
# into a DataFrame, then report how many timeslots it returned and preview
# the first five.
timeslots_df = bq.Query(q_timeslots).to_dataframe()
print('number of data points: {}'.format(len(timeslots_df)))
print('first 5:')
timeslots_df.head(5)

number of data points: 3024
first 5:


Unnamed: 0,timeslot
0,2016-01-01-1
1,2016-01-01-2
2,2016-01-01-3
3,2016-01-01-4
4,2016-01-01-5


In [4]:
# Preview the last five timeslots returned by the query.
print ('last 5:')
timeslots_df.tail(5)

last 5:


Unnamed: 0,timeslot
3019,2016-01-21-140
3020,2016-01-21-141
3021,2016-01-21-142
3022,2016-01-21-143
3023,2016-01-21-144


In the **orders** table, there are 3024 data points when grouped by `timeslot`. Since the data starts on 2016-01-01 and 3024 = 21 × 144, this means that for 21 consecutive days (through 2016-01-21) every one of the 144 timeslots in each day has orders.

Therefore, we can use 3024 - `prev_n_range` (the number of previous slots used in prediction) data points as our training data. For example, if we use 3 previous slots for prediction, there are 3021 training data points, starting from slot 2016-01-01-4, for which data from slots 2016-01-01-1, 2016-01-01-2, and 2016-01-01-3 are used as features.

In [5]:
# Prepare columns for gaps table creation

pois_table = bq.Table('datalab-projects-1331:xjk_algo_comp.pois')
# Every field of the pois schema except the first one becomes a feature column.
pois_fields = [field['name'] for field in pois_table.schema][1:]

# Name templates for the lagged features; `{}` receives the lag (1..prev_n_range).
traffic_templates = ('tj_level1_{}_slots_ago', 'tj_level2_{}_slots_ago',
                     'tj_level3_{}_slots_ago', 'tj_level4_{}_slots_ago')
weather_templates = ('weather_{}_slots_ago', 'temperature_{}_slots_ago',
                     'pm25_{}_slots_ago')
orders_templates = ('gap_{}_slots_ago', 'sum_price_{}_slots_ago')

# Within each group the names are ordered lag by lag (all names for lag 1,
# then all names for lag 2, ...).
lags = range(1, prev_n_range + 1)
traffic_fields = [template.format(lag) for lag in lags for template in traffic_templates]
weather_fields = [template.format(lag) for lag in lags for template in weather_templates]
orders_fields = [template.format(lag) for lag in lags for template in orders_templates]

# Final column order of the gaps table: identifiers, lagged features, POI
# features, and the target `gap` last.
columns = (
  ['district_id', 'timeslot', 'day_in_week', 'timeofday_slot']
  + traffic_fields
  + weather_fields
  + orders_fields
  + pois_fields
  + ['gap']
)

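First, get all timeslots for which a row will be built: every timeslot except the first `prev_n_range` ones, which do not have enough preceding slots.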

In [6]:
# Skip the first `prev_n_range` timeslots of the query result.
all_timeslots = timeslots_df['timeslot'].tolist()[prev_n_range:]
# Comma-separated list of the single-quoted (whitespace-stripped) timeslots.
tquery = ','.join("'{}'".format(slot_name.strip()) for slot_name in all_timeslots)
print(tquery)

'2016-01-01-4','2016-01-01-5','2016-01-01-6','2016-01-01-7','2016-01-01-8','2016-01-01-9','2016-01-01-10','2016-01-01-11','2016-01-01-12','2016-01-01-13','2016-01-01-14','2016-01-01-15','2016-01-01-16','2016-01-01-17','2016-01-01-18','2016-01-01-19','2016-01-01-20','2016-01-01-21','2016-01-01-22','2016-01-01-23','2016-01-01-24','2016-01-01-25','2016-01-01-26','2016-01-01-27','2016-01-01-28','2016-01-01-29','2016-01-01-30','2016-01-01-31','2016-01-01-32','2016-01-01-33','2016-01-01-34','2016-01-01-35','2016-01-01-36','2016-01-01-37','2016-01-01-38','2016-01-01-39','2016-01-01-40','2016-01-01-41','2016-01-01-42','2016-01-01-43','2016-01-01-44','2016-01-01-45','2016-01-01-46','2016-01-01-47','2016-01-01-48','2016-01-01-49','2016-01-01-50','2016-01-01-51','2016-01-01-52','2016-01-01-53','2016-01-01-54','2016-01-01-55','2016-01-01-56','2016-01-01-57','2016-01-01-58','2016-01-01-59','2016-01-01-60','2016-01-01-61','2016-01-01-62','2016-01-01-63','2016-01-01-64','2016-01-01-65','2016-01-01-66

For each timeslot, compute the timeslot itself together with its three (`prev_n_range`) previous timeslots.

In [7]:
# prev_timeslots_dict: timeslot -> [timeslot itself, 1 slot ago, ..., prev_n_range slots ago]
# prev_timeslots_flat: every timeslot appearing in any of those lists, without
# duplicates, in order of first appearance.
prev_timeslots_flat = []
prev_timeslots_dict = {}
seen_timeslots = set()  # set membership keeps the de-duplication linear
for timeslot in all_timeslots:
  window = [calc_past_timeslot(timeslot, slots_back)
            for slots_back in range(prev_n_range + 1)]
  prev_timeslots_dict[timeslot] = window
  for past_timeslot in window:
    if past_timeslot not in seen_timeslots:
      seen_timeslots.add(past_timeslot)
      prev_timeslots_flat.append(past_timeslot)

print(prev_timeslots_flat)

['2016-01-01-4', '2016-01-01-3', '2016-01-01-2', '2016-01-01-1', '2016-01-01-5', '2016-01-01-6', '2016-01-01-7', '2016-01-01-8', '2016-01-01-9', '2016-01-01-10', '2016-01-01-11', '2016-01-01-12', '2016-01-01-13', '2016-01-01-14', '2016-01-01-15', '2016-01-01-16', '2016-01-01-17', '2016-01-01-18', '2016-01-01-19', '2016-01-01-20', '2016-01-01-21', '2016-01-01-22', '2016-01-01-23', '2016-01-01-24', '2016-01-01-25', '2016-01-01-26', '2016-01-01-27', '2016-01-01-28', '2016-01-01-29', '2016-01-01-30', '2016-01-01-31', '2016-01-01-32', '2016-01-01-33', '2016-01-01-34', '2016-01-01-35', '2016-01-01-36', '2016-01-01-37', '2016-01-01-38', '2016-01-01-39', '2016-01-01-40', '2016-01-01-41', '2016-01-01-42', '2016-01-01-43', '2016-01-01-44', '2016-01-01-45', '2016-01-01-46', '2016-01-01-47', '2016-01-01-48', '2016-01-01-49', '2016-01-01-50', '2016-01-01-51', '2016-01-01-52', '2016-01-01-53', '2016-01-01-54', '2016-01-01-55', '2016-01-01-56', '2016-01-01-57', '2016-01-01-58', '2016-01-01-59', '2016

Load all required data from previous timeslots from database.

## Traffic

In [8]:
%%sql --module q_traffic
SELECT district_id, timeslot, tj_level1, tj_level2, tj_level3, tj_level4
FROM xjk_algo_comp.traffic AS traffic
JOIN xjk_algo_comp.districts AS districts
  ON traffic.district_hash = districts.district_hash
WHERE timeslot IN $timeslots

In [9]:
# Run q_traffic with `prev_timeslots_flat` bound to its $timeslots parameter
# and load the result into a DataFrame.
traffic_df = bq.Query(q_traffic, timeslots=prev_timeslots_flat).to_dataframe()

## Weather

In [10]:
%%sql --module q_weather
SELECT * FROM xjk_algo_comp.weather
WHERE timeslot IN $timeslots

In [11]:
# Run q_weather with `prev_timeslots_flat` bound to its $timeslots parameter
# and load the result into a DataFrame.
weather_df = bq.Query(q_weather, timeslots=prev_timeslots_flat).to_dataframe()

## Gaps from Orders

In [12]:
%%sql --module q_order_gaps

SELECT district_id, FIRST(orders.timeslot) AS timeslot, FIRST(orders.date) AS date,
  FIRST(day_in_week) AS day_in_week, FIRST(timeofday_slot) AS timeofday_slot,
  SUM(price) AS sum_price, AVG(price) AS avg_price,
  IF(FIRST(timeofday_slot) >= 50 AND FIRST(timeofday_slot) <= 53, 1, 0) AS busy_time,
  SUM(IF(driver_id = 'NULL', 1, 0)) AS gap
FROM [datalab-projects-1331:xjk_algo_comp.orders] AS orders
JOIN [datalab-projects-1331:xjk_algo_comp.districts] AS districts 
  ON orders.start_district_hash = districts.district_hash
WHERE timeslot IN $timeslots
GROUP BY district_id, orders.timeslot

In [13]:
# Run q_order_gaps with `prev_timeslots_flat` bound to its $timeslots parameter
# and load the per-district, per-timeslot aggregates into a DataFrame.
order_gaps_df = bq.Query(q_order_gaps, timeslots=prev_timeslots_flat).to_dataframe()

## POIs

In [14]:
# Print the POI field names as one comma-separated listing, with a line break
# after positions 13, 26, 39, ... (presumably for pasting into the q_pois
# SELECT list in the next cell -- confirm).
pieces = []
for position, poi_name in enumerate(pois_fields):
  pieces.append('{}, '.format(poi_name))
  if position > 0 and position % 13 == 0:
    pieces.append('\n')
text = ''.join(pieces)
trimmed = text.strip()
# Dropping the last character removes the comma left after the final name.
print(trimmed[:-1])

f1, f11, f11_1, f11_2, f11_3, f11_4, f11_5, f11_6, f11_7, f11_8, f13_4, f13_8, f14, f14_1, 
f14_10, f14_2, f14_3, f14_6, f14_8, f15, f15_1, f15_2, f15_3, f15_4, f15_6, f15_7, f15_8, 
f16, f16_1, f16_10, f16_11, f16_12, f16_3, f16_4, f16_6, f17, f17_2, f17_3, f17_4, f17_5, 
f19, f19_1, f19_2, f19_3, f19_4, f1_1, f1_10, f1_11, f1_2, f1_3, f1_4, f1_5, f1_6, 
f1_7, f1_8, f20, f20_1, f20_2, f20_4, f20_5, f20_6, f20_7, f20_8, f20_9, f21_1, f21_2, 
f22, f22_1, f22_2, f22_3, f22_4, f22_5, f23, f23_1, f23_2, f23_3, f23_4, f23_5, f23_6, 
f24, f24_1, f24_2, f24_3, f25, f25_1, f25_3, f25_7, f25_8, f25_9, f2_1, f2_10, f2_11, 
f2_12, f2_13, f2_2, f2_4, f2_5, f2_6, f2_7, f2_8, f3_1, f3_2, f3_3, f4, f4_1, 
f4_10, f4_11, f4_13, f4_14, f4_16, f4_17, f4_18, f4_2, f4_3, f4_5, f4_6, f4_7, f4_8, 
f4_9, f5, f5_1, f5_3, f5_4, f6, f6_1, f6_2, f6_4, f7, f8, f8_1, f8_2, 
f8_3, f8_4, f8_5


In [15]:
%%sql --module q_pois
SELECT district_id,
f1, f11, f11_1, f11_2, f11_3, f11_4, f11_5, f11_6, f11_7, f11_8, f13_4, f13_8, f14, f14_1, 
f14_10, f14_2, f14_3, f14_6, f14_8, f15, f15_1, f15_2, f15_3, f15_4, f15_6, f15_7, f15_8, 
f16, f16_1, f16_10, f16_11, f16_12, f16_3, f16_4, f16_6, f17, f17_2, f17_3, f17_4, f17_5, 
f19, f19_1, f19_2, f19_3, f19_4, f1_1, f1_10, f1_11, f1_2, f1_3, f1_4, f1_5, f1_6, 
f1_7, f1_8, f20, f20_1, f20_2, f20_4, f20_5, f20_6, f20_7, f20_8, f20_9, f21_1, f21_2, 
f22, f22_1, f22_2, f22_3, f22_4, f22_5, f23, f23_1, f23_2, f23_3, f23_4, f23_5, f23_6, 
f24, f24_1, f24_2, f24_3, f25, f25_1, f25_3, f25_7, f25_8, f25_9, f2_1, f2_10, f2_11, 
f2_12, f2_13, f2_2, f2_4, f2_5, f2_6, f2_7, f2_8, f3_1, f3_2, f3_3, f4, f4_1, 
f4_10, f4_11, f4_13, f4_14, f4_16, f4_17, f4_18, f4_2, f4_3, f4_5, f4_6, f4_7, f4_8, 
f4_9, f5, f5_1, f5_3, f5_4, f6, f6_1, f6_2, f6_4, f7, f8, f8_1, f8_2, 
f8_3, f8_4, f8_5
FROM xjk_algo_comp.pois AS pois
JOIN xjk_algo_comp.districts AS districts ON districts.district_hash = pois.district_hash

In [16]:
# Run q_pois (it takes no parameters) and load the result into a DataFrame.
pois_df = bq.Query(q_pois).to_dataframe()

Create the **gaps** table. Each row of this table should contain all the features from the three previous timeslots.

In [17]:
class Batch:
  """Builds rows of the gaps table for a list of timeslots.

  All inputs are plain attributes that the caller assigns after construction:

    district_ids        -- districts to emit a row for (one row per
                           timeslot/district pair).
    order_gaps_df       -- order aggregates; columns read here: timeslot,
                           district_id, gap, sum_price.
    pois_df             -- POI features; columns read here: district_id and
                           every name listed in `pois_fields`.
    pois_fields         -- POI column names copied into each row.
    prev_n_range        -- number of previous slots used as features.
    traffic_df          -- columns read here: timeslot, district_id,
                           tj_level1 .. tj_level4.
    weather_df          -- columns read here: timeslot, weather, temperature,
                           pm25.
    prev_timeslots_dict -- timeslot -> list of timeslots used to pre-filter
                           orders and as the averaging window when a traffic
                           or weather row is missing.

  `process` additionally relies on the notebook-level names `columns` and
  `calc_past_timeslot`.
  """

  # Source columns copied into '<name>_<n>_slots_ago' features.
  TRAFFIC_LEVELS = ('tj_level1', 'tj_level2', 'tj_level3', 'tj_level4')
  WEATHER_MEASURES = ('weather', 'temperature', 'pm25')

  def __init__(self):
    self.district_ids = []
    self.order_gaps_df = None
    self.pois_df = None
    self.pois_fields = []
    self.prev_n_range = 0
    self.traffic_df = None
    self.weather_df = None
    self.prev_timeslots_dict = {}

  def _order_features(self, prev_gaps, timeslot, past_timeslot, slot_diff):
    """Return the order-derived entries contributed by `past_timeslot`.

    The row of the current timeslot supplies the target `gap`; rows of earlier
    timeslots supply `gap_<n>_slots_ago` / `sum_price_<n>_slots_ago`. Returns
    an empty dict when `prev_gaps` holds no row for `past_timeslot`.
    """
    rows = prev_gaps.loc[prev_gaps['timeslot'] == past_timeslot]
    if rows.empty:
      return {}
    first = rows.iloc[0]
    if first['timeslot'] == timeslot:
      return {'gap': int(first['gap'])}
    return {
      'gap_{}_slots_ago'.format(slot_diff): int(first['gap']),
      'sum_price_{}_slots_ago'.format(slot_diff): first['sum_price'],
    }

  def _traffic_features(self, timeslot, past_timeslot, district_id, slot_diff):
    """Return the `tj_level*_<n>_slots_ago` entries for one district and lag."""
    traffic = self.traffic_df
    in_district = traffic['district_id'] == district_id
    rows = traffic.loc[(traffic['timeslot'] == past_timeslot) & in_district]
    if not rows.empty:
      first = rows.iloc[0]
      values = [first[level] for level in self.TRAFFIC_LEVELS]
    else:
      # No traffic recorded for that slot: use the district's mean over the
      # timeslots listed in prev_timeslots_dict[timeslot].
      # NOTE(review): as built by the notebook, that list also contains the
      # current timeslot itself, so the mean includes current-slot traffic --
      # confirm this is intended.
      window = traffic.loc[
        traffic['timeslot'].isin(self.prev_timeslots_dict[timeslot]) & in_district
      ]
      values = [window[level].mean() for level in self.TRAFFIC_LEVELS]
    return {
      '{}_{}_slots_ago'.format(level, slot_diff): value
      for level, value in zip(self.TRAFFIC_LEVELS, values)
    }

  def _weather_features(self, timeslot, past_timeslot, slot_diff):
    """Return the weather/temperature/pm25 `_<n>_slots_ago` entries for one lag."""
    weather = self.weather_df
    rows = weather.loc[weather['timeslot'] == past_timeslot]
    if not rows.empty:
      first = rows.iloc[0]
      values = [first[measure] for measure in self.WEATHER_MEASURES]
    else:
      # No weather recorded for that slot: use the mean over the timeslots
      # listed in prev_timeslots_dict[timeslot].
      # NOTE(review): that list also contains the current timeslot itself --
      # confirm this is intended.
      window = weather.loc[weather['timeslot'].isin(self.prev_timeslots_dict[timeslot])]
      values = [window[measure].mean() for measure in self.WEATHER_MEASURES]
    return {
      '{}_{}_slots_ago'.format(measure, slot_diff): value
      for measure, value in zip(self.WEATHER_MEASURES, values)
    }

  def process(self, all_timeslots):
    """Build one gaps row per (timeslot, district) pair.

    Args:
      all_timeslots: iterable of 'YYYY-MM-DD-S' timeslot strings; each must be
        a key of `prev_timeslots_dict`.

    Returns:
      A DataFrame whose columns are the notebook-level `columns`, with
      `district_id` cast to int64. Features for which no source row exists are
      left missing; `gap` stays 0 when the current timeslot has no order row.
    """
    records = []
    for timeslot in all_timeslots:
      print('Now processing timeslot {}'.format(timeslot))
      # Everything that depends on the timeslot alone is computed once here
      # rather than once per district.
      timeslot_r = [int(part) for part in timeslot.split('-')]
      day_in_week = date(timeslot_r[0], timeslot_r[1], timeslot_r[2]).weekday()
      # Index 0 is the timeslot itself, index n the slot n steps earlier.
      past_timeslots = [calc_past_timeslot(timeslot, slot_diff)
                        for slot_diff in range(self.prev_n_range + 1)]

      for district_id in self.district_ids:
        gap_data = {'timeslot': timeslot,
                    'day_in_week': day_in_week,
                    'timeofday_slot': timeslot_r[3],
                    'district_id': int(district_id),
                    'gap': int(0)
                   }
        poi_row = self.pois_df.loc[
          (self.pois_df['district_id'] == district_id)
        ]
        if self.pois_fields:
          first_poi_row = poi_row.iloc[0]
          for poi_field in self.pois_fields:
            gap_data[poi_field] = first_poi_row[poi_field]

        prev_gaps = self.order_gaps_df.loc[
          (self.order_gaps_df['timeslot'].isin(self.prev_timeslots_dict[timeslot])) &
          (self.order_gaps_df['district_id'] == district_id)
        ]

        for slot_diff, past_timeslot in enumerate(past_timeslots):
          gap_data.update(
            self._order_features(prev_gaps, timeslot, past_timeslot, slot_diff))
          if slot_diff > 0:
            # Traffic and weather are only used as lagged features, never for
            # the current slot.
            gap_data.update(
              self._traffic_features(timeslot, past_timeslot, district_id, slot_diff))
            gap_data.update(
              self._weather_features(timeslot, past_timeslot, slot_diff))

        records.append(gap_data)
        if len(records) % 100 == 0:
          print('created {} rows'.format(len(records)))

    # Build the frame once from all records; appending row by row copies the
    # whole frame on every append.
    gaps_df = pd.DataFrame(records, columns=columns)
    gaps_df['district_id'] = gaps_df['district_id'].astype('int64')
    return gaps_df

In [18]:
%%sql --module q_districts

SELECT district_id
FROM xjk_algo_comp.districts
ORDER BY district_id

In [None]:
# Initialize batches
batch_size = 10                 # number of timeslots processed per batch
config_file = 'batch_pos.txt'   # checkpoint: index of the next batch to run

district_ids = [item['district_id'] for item in bq.Query(q_districts).results()]

# On the first run there is no checkpoint yet: start from batch 0.
# The checkpoint is plain text, so it is written in text mode (writing a str
# through a binary-mode handle fails on Python 3).
if not os.path.isfile(config_file):
  with open(config_file, 'w') as f:
    f.write('0')

In [None]:
# Resume from the batch index stored in the checkpoint file.
with open(config_file, 'r') as f:
  batch_pos = int(f.read())

# `//` keeps this an integer on Python 3 as well; the loop below runs one
# extra index (total_batches) to pick up a final partial batch.
total_batches = len(all_timeslots) // batch_size
leftover = (len(all_timeslots) - batch_pos) % batch_size

# Create the destination table on first use. By position in `columns`:
# the first column is INTEGER, the second STRING, all others FLOAT.
gaps_table = bq.Table('datalab-projects-1331:xjk_algo_comp.gaps')
if not gaps_table.exists():
  schema = []
  for index, column in enumerate(columns):
    if index == 0:
      column_type = 'INTEGER'
    elif index == 1:
      column_type = 'STRING'
    else:
      column_type = 'FLOAT'
    schema.append({'name': column, 'type': column_type})
  gaps_table.create(bq.Schema(schema))

batch = Batch()
batch.district_ids = district_ids
batch.order_gaps_df = order_gaps_df
batch.pois_df = pois_df
batch.pois_fields = pois_fields
batch.prev_n_range = prev_n_range
batch.traffic_df = traffic_df
batch.weather_df = weather_df
batch.prev_timeslots_dict = prev_timeslots_dict

for i in range(batch_pos, total_batches+1):
  from_pos = i*batch_size
  to_pos = min((i+1)*batch_size, len(all_timeslots))
  timeslots_to_process = all_timeslots[from_pos:to_pos]
  if not timeslots_to_process:
    # Happens when len(all_timeslots) is an exact multiple of batch_size:
    # there is no partial batch left, and indexing the empty slice below
    # would raise IndexError.
    break
  print("Starting batch pos: {}".format(i))
  print('Writing timeslots from {}({}) to {}({}) ({} rows)'.format(
    timeslots_to_process[0],
    from_pos,
    timeslots_to_process[-1],
    to_pos,
    (to_pos-from_pos)*len(district_ids)
  ))
  gaps_df = batch.process(timeslots_to_process)
  gaps_table.insert_data(gaps_df)
  print('{} rows committed into database'.format(len(gaps_df)))
  # Record the next batch index only after the insert call has returned, so a
  # restarted run resumes at the first batch that was not inserted.
  with open(config_file, 'w') as f:
    f.write(str(i+1))

Starting batch pos: 131
Writing timeslots from 2016-01-10-18(1310) to 2016-01-10-27(1320) (660 rows)
Now processing timeslot 2016-01-10-18
Now processing timeslot 2016-01-10-19
created 100 rows
Now processing timeslot 2016-01-10-20
Now processing timeslot 2016-01-10-21
created 200 rows
Now processing timeslot 2016-01-10-22
created 300 rows
Now processing timeslot 2016-01-10-23
Now processing timeslot 2016-01-10-24
created 400 rows
Now processing timeslot 2016-01-10-25
created 500 rows
Now processing timeslot 2016-01-10-26
Now processing timeslot 2016-01-10-27
created 600 rows
660 rows committed into database
Starting batch pos: 132
Writing timeslots from 2016-01-10-28(1320) to 2016-01-10-37(1330) (660 rows)
Now processing timeslot 2016-01-10-28
Now processing timeslot 2016-01-10-29
created 100 rows
Now processing timeslot 2016-01-10-30
Now processing timeslot 2016-01-10-31
created 200 rows
Now processing timeslot 2016-01-10-32
created 300 rows
Now processing timeslot 2016-01-10-33
Now 