In [10]:
import gcp.storage as storage
import gcp.bigquery as bq
from datetime import date
import pandas as pd
import numpy as np
from datetime import date
from datetime import timedelta
import pdb
# Number of preceding timeslots whose values are attached to each row as
# '<feature>_<n>_slots_ago' columns (n = 1 .. prev_n_range).
prev_n_range = 3

def calc_past_timeslot(timeslot, slots):
  """Return the timeslot that lies `slots` slots before `timeslot`.

  Args:
    timeslot: string of the form 'YYYY-MM-DD-S', where S is the slot number
      within the day. Slots are treated as running from 1 to 144.
    slots: number of slots to step back; 0 returns the same slot.

  Returns:
    A 'YYYY-MM-DD-S' string with zero-padded month and day and an unpadded
    slot number. Stepping back past slot 1 continues from slot 144 of the
    previous day(s).
  """
  slots_per_day = 144
  # A list (not a bare map object) so the parts can be indexed on Python 3 too.
  timeslot_r = [int(part) for part in timeslot.split('-')]
  slot = timeslot_r[3] - slots
  date_obj = date(timeslot_r[0], timeslot_r[1], timeslot_r[2])
  # Borrow whole days until the slot is back in range. Adding the day length
  # (instead of forcing the slot to 144) keeps the remainder, e.g. slot 2
  # minus 3 is slot 143 of the previous day.
  while slot <= 0:
    slot += slots_per_day
    date_obj = date_obj - timedelta(days=1)
  return '{}-{}-{}-{}'.format(
    date_obj.year, str(date_obj.month).zfill(2), str(date_obj.day).zfill(2), slot)

# Gaps Table Creation
I learned the hard way that in Machine Learning competitions it is better to start by defining the requirements from the test data. Aside from being much smaller than the training data, the test data usually also contains hints about what the final algorithm should be.

In this project, for example, looking at, say, the `traffic` data, it appears that we are expected to predict gaps from the 3 previous timeslots, as these rows are the only ones that are available. Let's wrangle the data accordingly.

In [11]:
# POI feature names come from the pois table schema; the first schema entry
# is skipped (presumably the district key -- confirm against the table).
pois_table = bq.Table('datalab-projects-1331:xjk_algo_comp_test.pois')
pois_fields = [field['name'] for field in pois_table.schema][1:]

# Name templates for the lagged features; '{}' receives the lag (1-based).
traffic_templates = (
  'tj_level1_{}_slots_ago',
  'tj_level2_{}_slots_ago',
  'tj_level3_{}_slots_ago',
  'tj_level4_{}_slots_ago',
)
weather_templates = (
  'weather_{}_slots_ago',
  'temperature_{}_slots_ago',
  'pm25_{}_slots_ago',
)
orders_templates = (
  'gap_{}_slots_ago',
  'sum_price_{}_slots_ago',
)

traffic_fields, weather_fields, orders_fields = [], [], []
for prev_n in range(1, prev_n_range + 1):
  traffic_fields.extend(template.format(prev_n) for template in traffic_templates)
  weather_fields.extend(template.format(prev_n) for template in weather_templates)
  orders_fields.extend(template.format(prev_n) for template in orders_templates)

# Final column order of the gaps table: identifiers, lagged features,
# POI features, and the target last.
columns = (
  ['district_id', 'timeslot', 'day_in_week', 'timeofday_slot']
  + traffic_fields
  + weather_fields
  + orders_fields
  + pois_fields
  + ['gap']
)

First, get all timeslots to test.

In [12]:
# Read the timeslots to predict from Cloud Storage, one timeslot per line.
item = storage.Item('datalab-projects-1331-datalab','data/timeslots_to_test2.txt')
raw_lines = item.read_from().strip().split('\n')
all_timeslots = [line.strip() for line in raw_lines]
# Quoted, comma-separated rendering of the timeslots, printed for inspection.
quoted_timeslots = ["'{}'".format(slot.strip()) for slot in all_timeslots]
tquery = ','.join(quoted_timeslots)
print(tquery)

'2016-01-23-46','2016-01-23-58','2016-01-23-70','2016-01-23-82','2016-01-23-94','2016-01-23-106','2016-01-23-118','2016-01-23-130','2016-01-23-142','2016-01-25-58','2016-01-25-70','2016-01-25-82','2016-01-25-94','2016-01-25-106','2016-01-25-118','2016-01-25-130','2016-01-25-142','2016-01-27-46','2016-01-27-58','2016-01-27-70','2016-01-27-82','2016-01-27-94','2016-01-27-106','2016-01-27-118','2016-01-27-130','2016-01-27-142','2016-01-29-58','2016-01-29-70','2016-01-29-82','2016-01-29-94','2016-01-29-106','2016-01-29-118','2016-01-29-130','2016-01-29-142','2016-01-31-46','2016-01-31-58','2016-01-31-70','2016-01-31-82','2016-01-31-94','2016-01-31-106','2016-01-31-118','2016-01-31-130','2016-01-31-142'


For each timeslot, collect the timeslot itself together with its three previous timeslots.

In [13]:
# prev_timeslots_dict maps each timeslot to its window: the timeslot itself
# (offset 0) followed by the prev_n_range timeslots before it.
# prev_timeslots_flat lists every timeslot that occurs in any window exactly
# once, in order of first appearance.
prev_timeslots_flat = []
prev_timeslots_dict = {}
seen_timeslots = set()
for timeslot in all_timeslots:
  window = [calc_past_timeslot(timeslot, prev_n)
            for prev_n in range(prev_n_range + 1)]
  prev_timeslots_dict[timeslot] = window
  for prev_timeslot in window:
    # Set membership keeps the de-duplication linear while preserving order.
    if prev_timeslot not in seen_timeslots:
      seen_timeslots.add(prev_timeslot)
      prev_timeslots_flat.append(prev_timeslot)

print(prev_timeslots_flat)

['2016-01-23-46', '2016-01-23-45', '2016-01-23-44', '2016-01-23-43', '2016-01-23-58', '2016-01-23-57', '2016-01-23-56', '2016-01-23-55', '2016-01-23-70', '2016-01-23-69', '2016-01-23-68', '2016-01-23-67', '2016-01-23-82', '2016-01-23-81', '2016-01-23-80', '2016-01-23-79', '2016-01-23-94', '2016-01-23-93', '2016-01-23-92', '2016-01-23-91', '2016-01-23-106', '2016-01-23-105', '2016-01-23-104', '2016-01-23-103', '2016-01-23-118', '2016-01-23-117', '2016-01-23-116', '2016-01-23-115', '2016-01-23-130', '2016-01-23-129', '2016-01-23-128', '2016-01-23-127', '2016-01-23-142', '2016-01-23-141', '2016-01-23-140', '2016-01-23-139', '2016-01-25-58', '2016-01-25-57', '2016-01-25-56', '2016-01-25-55', '2016-01-25-70', '2016-01-25-69', '2016-01-25-68', '2016-01-25-67', '2016-01-25-82', '2016-01-25-81', '2016-01-25-80', '2016-01-25-79', '2016-01-25-94', '2016-01-25-93', '2016-01-25-92', '2016-01-25-91', '2016-01-25-106', '2016-01-25-105', '2016-01-25-104', '2016-01-25-103', '2016-01-25-118', '2016-01-

Load all required data from previous timeslots from database.

## Traffic

In [14]:
%%sql --module q_traffic
SELECT district_id, timeslot, tj_level1, tj_level2, tj_level3, tj_level4
FROM xjk_algo_comp_test.traffic AS traffic
JOIN xjk_algo_comp_test.districts AS districts
  ON traffic.district_hash = districts.district_hash
WHERE timeslot IN $timeslots

In [15]:
# Run q_traffic with prev_timeslots_flat bound to $timeslots and load the
# result into a dataframe.
traffic_df = bq.Query(q_traffic, timeslots=prev_timeslots_flat).to_dataframe()

## Weather

In [16]:
%%sql --module q_weather
SELECT * FROM xjk_algo_comp_test.weather
WHERE timeslot IN $timeslots

In [17]:
# Run q_weather with prev_timeslots_flat bound to $timeslots and load the
# result into a dataframe.
weather_df = bq.Query(q_weather, timeslots=prev_timeslots_flat).to_dataframe()

## Gaps from Orders

In [18]:
%%sql --module q_order_gaps

SELECT district_id, FIRST(orders.timeslot) AS timeslot, FIRST(orders.date) AS date,
  FIRST(day_in_week) AS day_in_week, FIRST(timeofday_slot) AS timeofday_slot,
  SUM(price) AS sum_price, AVG(price) AS avg_price,
  IF(FIRST(timeofday_slot) >= 50 AND FIRST(timeofday_slot) <= 53, 1, 0) AS busy_time,
  SUM(IF(driver_id = 'NULL', 1, 0)) AS gap
FROM [datalab-projects-1331:xjk_algo_comp_test.orders] AS orders
JOIN [datalab-projects-1331:xjk_algo_comp_test.districts] AS districts 
  ON orders.start_district_hash = districts.district_hash
WHERE timeslot IN $timeslots
GROUP BY district_id, orders.timeslot

In [19]:
# Run q_order_gaps with prev_timeslots_flat bound to $timeslots; the result
# has one row per (district_id, timeslot) group.
order_gaps_df = bq.Query(q_order_gaps, timeslots=prev_timeslots_flat).to_dataframe()

## POIs

In [20]:
# Print the POI field names as a comma-separated list, breaking the line
# after the items at index 13, 26, 39, ... and dropping the final comma.
pieces = []
for position, poi_name in enumerate(pois_fields):
  pieces.append('{}, '.format(poi_name))
  if position > 0 and position % 13 == 0:
    pieces.append('\n')
text = ''.join(pieces)
trimmed = text.strip()
print(trimmed[:-1])

f1, f11, f11_1, f11_2, f11_3, f11_4, f11_5, f11_6, f11_7, f11_8, f13_4, f13_8, f14, f14_1, 
f14_10, f14_2, f14_3, f14_6, f14_8, f15, f15_1, f15_2, f15_3, f15_4, f15_6, f15_7, f15_8, 
f16, f16_1, f16_10, f16_11, f16_12, f16_3, f16_4, f16_6, f17, f17_2, f17_3, f17_4, f17_5, 
f19, f19_1, f19_2, f19_3, f19_4, f1_1, f1_10, f1_11, f1_2, f1_3, f1_4, f1_5, f1_6, 
f1_7, f1_8, f20, f20_1, f20_2, f20_4, f20_5, f20_6, f20_7, f20_8, f20_9, f21_1, f21_2, 
f22, f22_1, f22_2, f22_3, f22_4, f22_5, f23, f23_1, f23_2, f23_3, f23_4, f23_5, f23_6, 
f24, f24_1, f24_2, f24_3, f25, f25_1, f25_3, f25_7, f25_8, f25_9, f2_1, f2_10, f2_11, 
f2_12, f2_13, f2_2, f2_4, f2_5, f2_6, f2_7, f2_8, f3_1, f3_2, f3_3, f4, f4_1, 
f4_10, f4_11, f4_13, f4_14, f4_16, f4_17, f4_18, f4_2, f4_3, f4_5, f4_6, f4_7, f4_8, 
f4_9, f5, f5_1, f5_3, f5_4, f6, f6_1, f6_2, f6_4, f7, f8, f8_1, f8_2, 
f8_3, f8_4, f8_5


In [21]:
%%sql --module q_pois
SELECT district_id,
f1, f11, f11_1, f11_2, f11_3, f11_4, f11_5, f11_6, f11_7, f11_8, f13_4, f13_8, f14, f14_1, 
f14_10, f14_2, f14_3, f14_6, f14_8, f15, f15_1, f15_2, f15_3, f15_4, f15_6, f15_7, f15_8, 
f16, f16_1, f16_10, f16_11, f16_12, f16_3, f16_4, f16_6, f17, f17_2, f17_3, f17_4, f17_5, 
f19, f19_1, f19_2, f19_3, f19_4, f1_1, f1_10, f1_11, f1_2, f1_3, f1_4, f1_5, f1_6, 
f1_7, f1_8, f20, f20_1, f20_2, f20_4, f20_5, f20_6, f20_7, f20_8, f20_9, f21_1, f21_2, 
f22, f22_1, f22_2, f22_3, f22_4, f22_5, f23, f23_1, f23_2, f23_3, f23_4, f23_5, f23_6, 
f24, f24_1, f24_2, f24_3, f25, f25_1, f25_3, f25_7, f25_8, f25_9, f2_1, f2_10, f2_11, 
f2_12, f2_13, f2_2, f2_4, f2_5, f2_6, f2_7, f2_8, f3_1, f3_2, f3_3, f4, f4_1, 
f4_10, f4_11, f4_13, f4_14, f4_16, f4_17, f4_18, f4_2, f4_3, f4_5, f4_6, f4_7, f4_8, 
f4_9, f5, f5_1, f5_3, f5_4, f6, f6_1, f6_2, f6_4, f7, f8, f8_1, f8_2, 
f8_3, f8_4, f8_5
FROM xjk_algo_comp_test.pois AS pois
JOIN xjk_algo_comp_test.districts AS districts ON districts.district_hash = pois.district_hash

In [22]:
# Run q_pois (no parameters) and load the per-district POI features into a
# dataframe.
pois_df = bq.Query(q_pois).to_dataframe()

Create the **gaps** table. Each row of this table should contain all features from the three previous timeslots.

In [None]:
%%sql --module q_districts

SELECT district_id
FROM xjk_algo_comp_test.districts
ORDER BY district_id

In [None]:
# NOTE: `batch` is not read anywhere in this cell. All rows are accumulated
# in memory and converted into a single dataframe after the loop.
batch = 5000
counter = 0

district_ids = [item['district_id'] for item in bq.Query(q_districts).results()]

# Rows are collected as plain dicts and converted once at the end. Growing a
# dataframe with DataFrame.append copies the whole frame on every call
# (quadratic overall) and the method no longer exists in pandas 2.x.
gap_records = []
# Keys set on a row that are not listed in `columns` (e.g. 'busy_time') are
# kept as trailing columns, in order of first appearance.
known_columns = set(columns)
extra_columns = []

for timeslot in all_timeslots:
  # Everything that depends only on the timeslot is computed once here
  # rather than once per district.
  timeslot_r = [int(part) for part in timeslot.split('-')]
  day_in_week = date(timeslot_r[0], timeslot_r[1], timeslot_r[2]).weekday()
  # Window of this timeslot: the slot itself plus its previous slots.
  window_timeslots = prev_timeslots_dict[timeslot]
  # past_timeslots[n] is the timeslot n slots before `timeslot` (n = 0 is the
  # timeslot itself).
  past_timeslots = [calc_past_timeslot(timeslot, slot_diff)
                    for slot_diff in range(prev_n_range + 1)]

  for district_id in district_ids:
    gap_data = {'timeslot': timeslot,
                'day_in_week': day_in_week,
                'timeofday_slot': timeslot_r[3],
                'district_id': int(district_id),
                # Default target; overwritten below when an order row exists
                # for this exact timeslot.
                'gap': int(0)
               }

    # Static POI features of the district (raises IndexError when the
    # district has no row in pois_df).
    poi_row = pois_df.loc[
      (pois_df['district_id'] == district_id)
    ]
    poi_values = poi_row.iloc[0]
    for poi_field in pois_fields:
      gap_data[poi_field] = poi_values[poi_field]

    # Order aggregates of this district restricted to the timeslot's window.
    prev_gaps = order_gaps_df.loc[
      (order_gaps_df['timeslot'].isin(window_timeslots)) &
      (order_gaps_df['district_id'] == district_id)
    ]

    for slot_diff, past_timeslot in enumerate(past_timeslots):
      prev_gap_loc = prev_gaps.loc[
        prev_gaps['timeslot'] == past_timeslot
      ]
      if not prev_gap_loc.empty:
        prev_gap = prev_gap_loc.iloc[0]
        if prev_gap['timeslot'] == timeslot:
          # Row for the timeslot itself: this is the value to predict.
          gap_data['gap'] = int(prev_gap['gap'])
          gap_data['busy_time'] = prev_gap['busy_time']
        else:
          gap_data['gap_{}_slots_ago'.format(slot_diff)] = int(prev_gap['gap'])
          gap_data['sum_price_{}_slots_ago'.format(slot_diff)] = prev_gap['sum_price']
      # When no order row exists for a slot, its gap/sum_price keys stay
      # unset and end up as missing values in the dataframe.

      if slot_diff > 0:
        # Traffic or weather may not have been recorded for a previous slot.
        # In that case the mean over whatever rows exist in this timeslot's
        # window is used instead.

        traffic_row = traffic_df.loc[
          (traffic_df['timeslot'] == past_timeslot) &
          (traffic_df['district_id'] == district_id)
        ]
        if not traffic_row.empty:
          traffic_values = traffic_row.iloc[0]
          gap_data['tj_level1_{}_slots_ago'.format(slot_diff)] = traffic_values['tj_level1']
          gap_data['tj_level2_{}_slots_ago'.format(slot_diff)] = traffic_values['tj_level2']
          gap_data['tj_level3_{}_slots_ago'.format(slot_diff)] = traffic_values['tj_level3']
          gap_data['tj_level4_{}_slots_ago'.format(slot_diff)] = traffic_values['tj_level4']
        else:
          # No traffic row for this slot: average the district's traffic rows
          # available in the window (NaN when there are none).
          traffic_rows = traffic_df.loc[
            (traffic_df['timeslot'].isin(window_timeslots)) &
            (traffic_df['district_id'] == district_id)
          ]
          gap_data['tj_level1_{}_slots_ago'.format(slot_diff)] = traffic_rows['tj_level1'].mean()
          gap_data['tj_level2_{}_slots_ago'.format(slot_diff)] = traffic_rows['tj_level2'].mean()
          gap_data['tj_level3_{}_slots_ago'.format(slot_diff)] = traffic_rows['tj_level3'].mean()
          gap_data['tj_level4_{}_slots_ago'.format(slot_diff)] = traffic_rows['tj_level4'].mean()

        # Weather is looked up by timeslot only (no district filter).
        weather_row = weather_df.loc[
          (weather_df['timeslot']==past_timeslot)
        ]
        if not weather_row.empty:
          weather_values = weather_row.iloc[0]
          gap_data['weather_{}_slots_ago'.format(slot_diff)] = weather_values['weather']
          gap_data['temperature_{}_slots_ago'.format(slot_diff)] = weather_values['temperature']
          gap_data['pm25_{}_slots_ago'.format(slot_diff)] = weather_values['pm25']
        else:
          # No weather row for this slot: average the weather rows available
          # in the window (NaN when there are none).
          weather_rows = weather_df.loc[
            (weather_df['timeslot'].isin(window_timeslots))
          ]
          gap_data['weather_{}_slots_ago'.format(slot_diff)] = weather_rows['weather'].mean()
          gap_data['temperature_{}_slots_ago'.format(slot_diff)] = weather_rows['temperature'].mean()
          gap_data['pm25_{}_slots_ago'.format(slot_diff)] = weather_rows['pm25'].mean()

    # Remember any key that is not part of `columns` so it is not dropped
    # when the dataframe is built.
    for key in gap_data:
      if key not in known_columns:
        known_columns.add(key)
        extra_columns.append(key)

    gap_records.append(gap_data)
    counter += 1
    if counter % 500 == 0:
      print('entered {} rows'.format(counter))

gaps_df = pd.DataFrame(gap_records, columns=columns + extra_columns)
gaps_df

In [None]:
# Display the first row of gaps_df.
gaps_df.iloc[0]

In [None]:
# Store district_id as int64 before loading.
gaps_df['district_id'] = gaps_df['district_id'].astype('int64')

# Schema by column position: the first column is INTEGER, the second is
# STRING, and every remaining column is FLOAT.
type_by_position = {0: 'INTEGER', 1: 'STRING'}
schema = [
  {'name': column, 'type': type_by_position.get(index, 'FLOAT')}
  for index, column in enumerate(columns)
]

# Create the destination table only when it does not exist yet, then load
# the assembled rows into it.
table = bq.Table('datalab-projects-1331:xjk_algo_comp_test.gaps')
table_missing = not table.exists()
if table_missing:
  table.create(bq.Schema(schema))

table.insert_data(gaps_df)