In [1]:
from cStringIO import StringIO
import gzip
import os
import glob
import re
import pdb

import gcp
from gcp import storage
from gcp import bigquery as bq
import pandas as pd

# Import zip file from Google Cloud Storage
project = gcp.Context.default().project_id
bucket_name = project + '-datalab'
bucket_path = 'gs://' + bucket_name + '/data'
print 'Bucket: ' + bucket_path
compressed_filename = 'citydata'
season_name = 'season_2'
subdirectory_name = 'training_data'

Bucket: gs://datalab-projects-1331-datalab/data


In [3]:
gzip_filename = '{}.tar.gz'.format(compressed_filename)
tar_filename = '{}.tar'.format(compressed_filename)
compressed_file = None
datadir = season_name
# If data has not been extracted, extract it.
if not os.path.isdir(datadir):
  # If citydata.tar has not been downloaded, download it
  if not os.path.isfile(tar_filename):
    bucket_object = '{}/{}'.format(bucket_path, gzip_filename)

In [3]:
%%storage read --object $bucket_object --variable compressed_file

Undefined variable referenced in command line: $bucket_object


In [4]:
if compressed_file:
  gzip_file = gzip.GzipFile(fileobj=StringIO(compressed_file))
  del compressed_file

  import shutil
  with open(tar_filename, 'wb') as f_out:
    shutil.copyfileobj(gzip_file, f_out)
    
  import tarfile
  tar = tarfile.open(tar_filename, "r")
  tar.extractall()
  tar.close()
  os.remove(tar_filename)

In [5]:
# Downloading and extracting the files from cloud storage.

def process_datafile(localpath, storagepath, table, mode='create', overwrite=False):
  # Upload extracted file into GCS
  storagepath_r = storagepath.split('/')
  bucketname = storagepath_r[2]
  itempath = '/'.join(storagepath_r[3:])
  item = storage.Item(bucketname, itempath)
  if not item.exists() or overwrite:    
    with open(localpath, 'rb') as f:
      item.write_to(f.read(), 'text/plain')
  # Load data into Google BigQuery
  table.load(storagepath, mode=mode, csv_options=bq.CSVOptions(delimiter='\t'))

# Process Training Data

## Districts

In [10]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.districts_preprocessed')
if not table.exists():
  schema = bq.Schema([
      {'name': 'district_hash', 'type': 'STRING'},
      {'name': 'district_id', 'type': 'INTEGER'}
    ])
  table.create(schema)
  
localpath = '{}/{}/cluster_map/cluster_map'.format(season_name, subdirectory_name)
storagepath = os.path.join(bucket_path,localpath)
print storagepath
process_datafile(localpath, storagepath, table, mode='append')

gs://datalab-projects-1331-datalab/data/season_2/training_data/cluster_map/cluster_map


In [11]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.districts -m overwrite
SELECT LAST(district_hash) AS district_hash, district_id
FROM [datalab-projects-1331:xjk_algo_comp.districts_preprocessed]
GROUP BY district_id

district_hash,district_id
90c5a34f06ac86aee0fd70e2adce7d8a,1
f2c8c4bb99e6377d21de71275afd6cd2,2
58c7a4888306d8ff3a641d1c0feccbe3,3
b26a240205c852804ff8758628c0a86a,4
4b9e4cf2fbdc8281b8a1f9f12b80ce4d,5
1cbfbdd079ef93e74405c53fcfff8567,6
929ec6c160e6f52c20a4217c7978f681,7
82cc4851f9e4faa4e54309f8bb73fd7c,8
b702e920dcd2765e624dc1ce3a770512,9
4f4041f7db0c7f69892d9b74c1a7efa1,10


## Weather

In [12]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.weather_preprocessed')
if not table.exists():
  schema = bq.Schema([
      {'name': 'time', 'type': 'STRING'},
      {'name': 'weather', 'type': 'INTEGER'},
      {'name': 'temperature', 'type': 'FLOAT'},
      {'name': 'pm25', 'type': 'FLOAT'}
    ])
  table.create(schema)
  
wildpath = '{}/{}/weather_data/*'.format(season_name, subdirectory_name)
for localpath in glob.glob(wildpath):
  print 'process {}'.format(storagepath)
  storagepath = os.path.join(bucket_path,localpath)
  process_datafile(localpath, storagepath, table, mode='append')

process gs://datalab-projects-1331-datalab/data/season_2/training_data/cluster_map/cluster_map
process gs://datalab-projects-1331-datalab/data/season_2/training_data/weather_data/weather_data_2016-01-08
process gs://datalab-projects-1331-datalab/data/season_2/training_data/weather_data/weather_data_2016-01-16
process gs://datalab-projects-1331-datalab/data/season_2/training_data/weather_data/weather_data_2016-01-13
process gs://datalab-projects-1331-datalab/data/season_2/training_data/weather_data/weather_data_2016-01-04
process gs://datalab-projects-1331-datalab/data/season_2/training_data/weather_data/weather_data_2016-01-01
process gs://datalab-projects-1331-datalab/data/season_2/training_data/weather_data/weather_data_2016-01-19
process gs://datalab-projects-1331-datalab/data/season_2/training_data/weather_data/weather_data_2016-01-21
process gs://datalab-projects-1331-datalab/data/season_2/training_data/weather_data/weather_data_2016-01-17
process gs://datalab-projects-1331-datala

In [13]:
%%bigquery udf --module transform_weather_time

/**
 * Pad with 0 or given string.
 *
 * @param int n Number to add padding to.
 * @param int width Width of number + padding.
 * @param string z (Optional) Other string to replace '0' as padding.
 */
function pad(n, width, z) {
  z = z || '0';
  n = n + '';
  return n.length >= width ? n : new Array(width - n.length + 1).join(z) + n;
}

/**
 * Transform timestamps of weather table into timeslots in weather table.
 *
 * @param {{time: string, weather: integer, temperature: float, pm25: float}} r
 * @param function({{time: string, weather: integer, temperature: float, pm25: float,
                     timeslot: string, timeofday_slot: integer, day_in_week: integer,
                     date: string}}) emitFn
 */
function(r, emitFn) {
  var t = r.time.split(/[ :\-]/);
  var slot = Math.floor((parseInt(t[3]) * 60 + parseInt(t[4])) / 10) + 1;
  r.timeslot = t[0] + '-' + pad(t[1], 2) +
               '-' + pad(t[2], 2) + '-' + slot;
  r.timeofday_slot = slot;
  r.date = t[0] + '-' + pad(t[1], 2) + '-' + pad(t[2], 2);
  r.day_in_week = new Date(parseInt(t[0]), parseInt(t[1])-1, parseInt(t[2])).getDay();
  emitFn(r);
}

In [14]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.weather -m overwrite
SELECT LAST(time) AS time, LAST(weather) AS weather,
  LAST(temperature) AS temperature, LAST(pm25) AS pm25, timeslot,
  LAST(timeofday_slot) AS timeofday_slot, LAST(day_in_week) AS day_in_week,
  LAST(date) AS date
FROM transform_weather_time([datalab-projects-1331:xjk_algo_comp.weather_preprocessed])
GROUP BY timeslot

Query exceeded resource limits for tier 1. Tier 3 or higher required.


## Traffic

In [15]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.traffic_preprocessed')
if not table.exists():
  schema = bq.Schema([
      {'name': 'district_hash', 'type': 'STRING'},
      {'name': 'tj_level1', 'type': 'INTEGER'},
      {'name': 'tj_level2', 'type': 'INTEGER'},
      {'name': 'tj_level3', 'type': 'INTEGER'},
      {'name': 'tj_level4', 'type': 'INTEGER'},
      {'name': 'tj_time', 'type': 'STRING'}
    ])
  table.create(schema)
  
wildpath = '{}/{}/traffic_data/*'.format(season_name, subdirectory_name)
for localpath in glob.glob(wildpath):
  with open(localpath, 'rb') as f:
    text = f.read()
  with open(localpath, 'wb') as f:
    f.write(re.sub(r'\b\t[0-9]:\b', '\t', text))
  storagepath = os.path.join(bucket_path,localpath)
  print 'process {}'.format(storagepath)
  process_datafile(localpath, storagepath, table, mode='append', overwrite=True)

process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-19
process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-18
process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-09
process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-05
process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-10
process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-15
process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-08
process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-12
process gs://datalab-projects-1331-datalab/data/season_2/training_data/traffic_data/traffic_data_2016-01-20
process gs://datalab-project

In [16]:
%%bigquery udf --module transform_traffic_time
  
/**
 * Pad with 0 or given string.
 *
 * @param int n Number to add padding to.
 * @param int width Width of number + padding.
 * @param string z (Optional) Other string to replace '0' as padding.
 */
function pad(n, width, z) {
  z = z || '0';
  n = n + '';
  return n.length >= width ? n : new Array(width - n.length + 1).join(z) + n;
}

/**
 * Transform timestamps of weather table into timeslots in traffic table.
 *
 * @param {{district_hash: string, tj_level1: integer, tj_level2: integer, tj_level3: integer, 
            tj_level4: integer, tj_time: string}} r
 * @param function({{district_hash: string, tj_level1: integer, tj_level2: integer, tj_level3: integer, 
                     tj_level4: integer, tj_time: string, timeslot: string, timeofday_slot: integer,
                     day_in_week: integer, date: string}}) emitFn
 */
function(r, emitFn) {
  var t = r.tj_time.split(/[ :\-]/);
  var slot = Math.floor((parseInt(t[3]) * 60 + parseInt(t[4])) / 10) + 1;
  r.timeslot = t[0] + '-' + pad(t[1], 2) +
               '-' + pad(t[2], 2) + '-' + slot;
  r.timeofday_slot = slot;
  r.date = t[0] + '-' + pad(t[1], 2) + '-' + pad(t[2], 2);
  r.day_in_week = new Date(parseInt(t[0]), parseInt(t[1])-1, parseInt(t[2])).getDay();
  emitFn(r);
}

In [17]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.traffic -m overwrite
  
SELECT district_hash, AVG(tj_level1) AS tj_level1,
  AVG(tj_level2) AS tj_level2, AVG(tj_level3) AS tj_level3, AVG(tj_level4) AS tj_level4,
  LAST(tj_time) AS tj_time, timeslot,
  LAST(timeofday_slot) AS timeofday_slot, LAST(day_in_week) AS day_in_week,
  LAST(date) AS date
FROM transform_traffic_time([datalab-projects-1331:xjk_algo_comp.traffic_preprocessed])
GROUP BY district_hash, timeslot

Query exceeded resource limits for tier 1. Tier 2 or higher required.


## POIs

In [18]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.pois_preprocessed')
  
localpath = '{}/{}/poi_data/poi_data'.format(season_name, subdirectory_name)

pois = []
pois_schema = [{'name': 'district_hash', 'type': 'STRING'}]
with open(localpath, 'rb') as f:
  for line in f:
    line_pois = map(lambda x: ['f{}'.format(x.split(':')[0].replace('#', '_')), x.split(':')[1]],
                    line.split('\t')[1:])
  for poi in line_pois:
    if poi[0] not in pois:
      pois.append(poi[0])
      pois_schema.append({'name': poi[0], 'type': 'INTEGER'})
pois.sort()
pois_schema = sorted(pois_schema, key=lambda k: k['name']) 

if not table.exists():
  schema = bq.Schema.from_data(pois_schema)
  table.create(schema)
  
  pois_data = pd.DataFrame(columns=['district_hash'] + pois)
  with open(localpath, 'rb') as f:
    for line in f:
      hash_pois = {}
      for poi_line in line.split('\t')[1:]:
        hash_pois['f{}'.format(poi_line.split(':')[0].replace('#', '_'))] = poi_line.split(':')[1]
      poi_data = [line.split('\t')[0]]
      # hash_pois = {f1_1: 15, ...}
      # pois = ['f1_1', ...]
      for poi in pois:
        value = '0'
        if poi in hash_pois:
          value = hash_pois[poi].strip()
        poi_data.append(value)
      pois_data.loc[len(pois_data)] = poi_data
  for poi in pois:
    pois_data[poi] = pd.to_numeric(pois_data[poi])
  table.insert_data(pois_data)

Following code can be used to print out feature fields to be used when selecting from tables.

In [19]:
final_text = ''
for counter, poi_text in enumerate(map(lambda x: 'LAST(pois.{}) AS {}'.format(x,x), pois)):
  if counter%3 == 0:
    final_text = '{}\n'.format(final_text)
  final_text = '{}{}, '.format(final_text, poi_text)
print final_text[1:(len(final_text)-2)]

LAST(pois.f1) AS f1, LAST(pois.f11) AS f11, LAST(pois.f11_1) AS f11_1, 
LAST(pois.f11_2) AS f11_2, LAST(pois.f11_3) AS f11_3, LAST(pois.f11_4) AS f11_4, 
LAST(pois.f11_5) AS f11_5, LAST(pois.f11_6) AS f11_6, LAST(pois.f11_7) AS f11_7, 
LAST(pois.f11_8) AS f11_8, LAST(pois.f13_4) AS f13_4, LAST(pois.f13_8) AS f13_8, 
LAST(pois.f14) AS f14, LAST(pois.f14_1) AS f14_1, LAST(pois.f14_10) AS f14_10, 
LAST(pois.f14_2) AS f14_2, LAST(pois.f14_3) AS f14_3, LAST(pois.f14_6) AS f14_6, 
LAST(pois.f14_8) AS f14_8, LAST(pois.f15) AS f15, LAST(pois.f15_1) AS f15_1, 
LAST(pois.f15_2) AS f15_2, LAST(pois.f15_3) AS f15_3, LAST(pois.f15_4) AS f15_4, 
LAST(pois.f15_6) AS f15_6, LAST(pois.f15_7) AS f15_7, LAST(pois.f15_8) AS f15_8, 
LAST(pois.f16) AS f16, LAST(pois.f16_1) AS f16_1, LAST(pois.f16_10) AS f16_10, 
LAST(pois.f16_11) AS f16_11, LAST(pois.f16_12) AS f16_12, LAST(pois.f16_3) AS f16_3, 
LAST(pois.f16_4) AS f16_4, LAST(pois.f16_6) AS f16_6, LAST(pois.f17) AS f17, 
LAST(pois.f17_2) AS f17_2, LAST(po

In [20]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.pois -m overwrite
  
SELECT district_hash,
LAST(pois.f1) AS f1, LAST(pois.f11) AS f11, LAST(pois.f11_1) AS f11_1, 
LAST(pois.f11_2) AS f11_2, LAST(pois.f11_3) AS f11_3, LAST(pois.f11_4) AS f11_4, 
LAST(pois.f11_5) AS f11_5, LAST(pois.f11_6) AS f11_6, LAST(pois.f11_7) AS f11_7, 
LAST(pois.f11_8) AS f11_8, LAST(pois.f13_4) AS f13_4, LAST(pois.f13_8) AS f13_8, 
LAST(pois.f14) AS f14, LAST(pois.f14_1) AS f14_1, LAST(pois.f14_10) AS f14_10, 
LAST(pois.f14_2) AS f14_2, LAST(pois.f14_3) AS f14_3, LAST(pois.f14_6) AS f14_6, 
LAST(pois.f14_8) AS f14_8, LAST(pois.f15) AS f15, LAST(pois.f15_1) AS f15_1, 
LAST(pois.f15_2) AS f15_2, LAST(pois.f15_3) AS f15_3, LAST(pois.f15_4) AS f15_4, 
LAST(pois.f15_6) AS f15_6, LAST(pois.f15_7) AS f15_7, LAST(pois.f15_8) AS f15_8, 
LAST(pois.f16) AS f16, LAST(pois.f16_1) AS f16_1, LAST(pois.f16_10) AS f16_10, 
LAST(pois.f16_11) AS f16_11, LAST(pois.f16_12) AS f16_12, LAST(pois.f16_3) AS f16_3, 
LAST(pois.f16_4) AS f16_4, LAST(pois.f16_6) AS f16_6, LAST(pois.f17) AS f17, 
LAST(pois.f17_2) AS f17_2, LAST(pois.f17_3) AS f17_3, LAST(pois.f17_4) AS f17_4, 
LAST(pois.f17_5) AS f17_5, LAST(pois.f19) AS f19, LAST(pois.f19_1) AS f19_1, 
LAST(pois.f19_2) AS f19_2, LAST(pois.f19_3) AS f19_3, LAST(pois.f19_4) AS f19_4, 
LAST(pois.f1_1) AS f1_1, LAST(pois.f1_10) AS f1_10, LAST(pois.f1_11) AS f1_11, 
LAST(pois.f1_2) AS f1_2, LAST(pois.f1_3) AS f1_3, LAST(pois.f1_4) AS f1_4, 
LAST(pois.f1_5) AS f1_5, LAST(pois.f1_6) AS f1_6, LAST(pois.f1_7) AS f1_7, 
LAST(pois.f1_8) AS f1_8, LAST(pois.f20) AS f20, LAST(pois.f20_1) AS f20_1, 
LAST(pois.f20_2) AS f20_2, LAST(pois.f20_4) AS f20_4, LAST(pois.f20_5) AS f20_5, 
LAST(pois.f20_6) AS f20_6, LAST(pois.f20_7) AS f20_7, LAST(pois.f20_8) AS f20_8, 
LAST(pois.f20_9) AS f20_9, LAST(pois.f21_1) AS f21_1, LAST(pois.f21_2) AS f21_2, 
LAST(pois.f22) AS f22, LAST(pois.f22_1) AS f22_1, LAST(pois.f22_2) AS f22_2, 
LAST(pois.f22_3) AS f22_3, LAST(pois.f22_4) AS f22_4, LAST(pois.f22_5) AS f22_5, 
LAST(pois.f23) AS f23, LAST(pois.f23_1) AS f23_1, LAST(pois.f23_2) AS f23_2, 
LAST(pois.f23_3) AS f23_3, LAST(pois.f23_4) AS f23_4, LAST(pois.f23_5) AS f23_5, 
LAST(pois.f23_6) AS f23_6, LAST(pois.f24) AS f24, LAST(pois.f24_1) AS f24_1, 
LAST(pois.f24_2) AS f24_2, LAST(pois.f24_3) AS f24_3, LAST(pois.f25) AS f25, 
LAST(pois.f25_1) AS f25_1, LAST(pois.f25_3) AS f25_3, LAST(pois.f25_7) AS f25_7, 
LAST(pois.f25_8) AS f25_8, LAST(pois.f25_9) AS f25_9, LAST(pois.f2_1) AS f2_1, 
LAST(pois.f2_10) AS f2_10, LAST(pois.f2_11) AS f2_11, LAST(pois.f2_12) AS f2_12, 
LAST(pois.f2_13) AS f2_13, LAST(pois.f2_2) AS f2_2, LAST(pois.f2_4) AS f2_4, 
LAST(pois.f2_5) AS f2_5, LAST(pois.f2_6) AS f2_6, LAST(pois.f2_7) AS f2_7, 
LAST(pois.f2_8) AS f2_8, LAST(pois.f3_1) AS f3_1, LAST(pois.f3_2) AS f3_2, 
LAST(pois.f3_3) AS f3_3, LAST(pois.f4) AS f4, LAST(pois.f4_1) AS f4_1, 
LAST(pois.f4_10) AS f4_10, LAST(pois.f4_11) AS f4_11, LAST(pois.f4_13) AS f4_13, 
LAST(pois.f4_14) AS f4_14, LAST(pois.f4_16) AS f4_16, LAST(pois.f4_17) AS f4_17, 
LAST(pois.f4_18) AS f4_18, LAST(pois.f4_2) AS f4_2, LAST(pois.f4_3) AS f4_3, 
LAST(pois.f4_5) AS f4_5, LAST(pois.f4_6) AS f4_6, LAST(pois.f4_7) AS f4_7, 
LAST(pois.f4_8) AS f4_8, LAST(pois.f4_9) AS f4_9, LAST(pois.f5) AS f5, 
LAST(pois.f5_1) AS f5_1, LAST(pois.f5_3) AS f5_3, LAST(pois.f5_4) AS f5_4, 
LAST(pois.f6) AS f6, LAST(pois.f6_1) AS f6_1, LAST(pois.f6_2) AS f6_2, 
LAST(pois.f6_4) AS f6_4, LAST(pois.f7) AS f7, LAST(pois.f8) AS f8, 
LAST(pois.f8_1) AS f8_1, LAST(pois.f8_2) AS f8_2, LAST(pois.f8_3) AS f8_3, 
LAST(pois.f8_4) AS f8_4, LAST(pois.f8_5) AS f8_5
FROM [datalab-projects-1331:xjk_algo_comp.pois_preprocessed] AS pois
GROUP BY district_hash

district_hash,f1,f11,f11_1,f11_2,f11_3,f11_4,f11_5,f11_6,f11_7,f11_8,f13_4,f13_8,f14,f14_1,f14_10,f14_2,f14_3,f14_6,f14_8,f15,f15_1,f15_2,f15_3,f15_4,f15_6,f15_7,f15_8,f16,f16_1,f16_10,f16_11,f16_12,f16_3,f16_4,f16_6,f17,f17_2,f17_3,f17_4,f17_5,f19,f19_1,f19_2,f19_3,f19_4,f1_1,f1_10,f1_11,f1_2,f1_3,f1_4,f1_5,f1_6,f1_7,f1_8,f20,f20_1,f20_2,f20_4,f20_5,f20_6,f20_7,f20_8,f20_9,f21_1,f21_2,f22,f22_1,f22_2,f22_3,f22_4,f22_5,f23,f23_1,f23_2,f23_3,f23_4,f23_5,f23_6,f24,f24_1,f24_2,f24_3,f25,f25_1,f25_3,f25_7,f25_8,f25_9,f2_1,f2_10,f2_11,f2_12,f2_13,f2_2,f2_4,f2_5,f2_6,f2_7,f2_8,f3_1,f3_2,f3_3,f4,f4_1,f4_10,f4_11,f4_13,f4_14,f4_16,f4_17,f4_18,f4_2,f4_3,f4_5,f4_6,f4_7,f4_8,f4_9,f5,f5_1,f5_3,f5_4,f6,f6_1,f6_2,f6_4,f7,f8,f8_1,f8_2,f8_3,f8_4,f8_5
3e12208dd0be281c92a6ab57d9a6fb32,415,1245,83,166,249,3486,83,830,332,6142,1660,332,0,0,0,0,0,83,83,747,0,415,830,166,830,415,166,498,0,2241,249,1079,415,1245,0,83,415,0,0,1079,10458,747,498,1826,166,0,0,332,249,249,0,2407,249,0,249,1743,3652,913,1079,2075,0,1079,5395,0,0,0,498,0,332,0,332,332,0,0,0,83,0,332,0,83,1660,498,0,332,0,0,332,83,1826,0,830,0,0,0,166,0,415,83,83,0,3984,0,0,747,249,0,332,166,498,83,166,498,830,83,166,83,0,0,0,0,166,332,83,4233,83,166,0,3735,83,0,1162,0,1079,0
82cc4851f9e4faa4e54309f8bb73fd7c,6142,22161,3735,14193,13031,70052,2324,29880,6142,60258,65321,5976,1909,0,830,83,1245,996,581,14193,332,8715,14193,996,6889,4316,249,12616,166,24983,15936,2822,747,8300,0,2324,19747,747,6557,24900,145665,2407,1162,80676,2490,664,830,4980,1660,249,415,21082,83,166,11703,56440,15853,12782,10043,7055,249,27556,103335,83,0,0,2739,1411,3818,1494,2490,415,4980,664,166,1079,830,2656,1660,6308,240036,11122,2490,4897,83,913,4648,2324,9130,830,4399,996,15687,4233,16434,664,2241,6640,1992,0,2905,249,415,22576,1494,7636,7055,6889,4150,498,3652,4980,5727,4233,8881,996,1411,249,5478,747,8383,5727,0,16434,3486,2407,747,13197,996,498,13446,5893,9628,0
cb6041cc08444746caf6039d8b9e43cb,249,747,0,0,0,913,0,166,83,2407,83,0,249,0,83,0,0,0,0,332,0,498,664,0,166,0,0,415,0,166,0,664,249,664,0,83,664,0,83,249,10541,1328,415,1577,249,0,83,0,83,0,0,498,83,83,249,1328,3735,415,747,249,0,2075,1245,0,83,83,83,0,415,0,415,83,0,0,0,0,0,249,0,0,166,83,0,249,0,0,166,0,913,0,498,0,166,0,83,0,166,83,0,0,3901,0,0,498,249,0,249,0,83,83,249,249,415,0,0,83,0,0,0,0,83,332,249,498,0,0,0,1577,0,0,166,83,83,0
a735449c5c09df639c35a7d61fad3ee5,0,0,0,0,0,0,0,0,0,0,83,0,0,0,0,0,0,83,0,83,0,0,0,0,0,0,0,0,0,0,0,0,83,83,0,0,0,0,0,0,581,166,0,0,0,0,0,0,0,0,0,0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,83,0,0,83,0,83,0,83,0,0,0,0,0,0,0,0,0,498,0,0,0,0,0,0,0,0,0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,166,0,0,0,0,83,0
b05379ac3f9b7d99370d443cfd5dcc28,2656,13363,1826,4482,6557,42828,1494,12782,3486,33781,18343,498,249,0,249,83,415,249,83,3735,83,3320,4814,249,3154,1494,166,5312,0,11371,10209,3403,581,4980,0,1411,10375,498,2573,11122,72044,1411,498,21331,2324,166,332,2822,1079,83,249,10458,0,166,2739,37433,9130,6308,7802,3403,415,45982,61005,0,83,0,2324,830,1909,1162,1577,83,2241,166,0,332,83,664,0,1411,77024,6723,332,1826,0,498,2490,1909,5561,415,2656,332,7968,1577,7387,166,913,3237,996,0,5644,249,166,9628,747,3237,3486,2905,1909,166,2075,2158,1743,1577,3403,830,498,83,1992,498,3984,2739,83,7221,1743,747,166,6640,0,249,6474,3237,3818,0
44c097b7bd219d104050abbafe51bd49,83,1494,0,166,415,2988,83,1494,332,2075,498,0,0,0,0,83,83,83,0,747,166,664,1577,249,664,332,0,830,0,415,415,830,166,1909,83,249,747,0,166,498,10707,664,166,1577,747,83,83,83,83,0,0,498,83,0,249,2075,1743,498,1162,166,0,3154,2739,0,0,0,83,0,166,0,166,166,0,83,0,0,0,415,0,166,747,332,83,249,0,0,415,0,581,166,83,0,415,415,83,83,249,332,332,0,3486,0,83,830,249,249,83,0,83,83,249,249,332,0,83,0,166,0,249,0,498,83,0,1328,83,83,83,2490,0,0,83,83,332,0
73ff8ef735e1d68f0cdcbb84d788f2b6,166,249,83,415,166,1162,0,498,332,1743,747,0,0,0,0,0,332,332,0,249,0,166,415,0,0,83,0,249,0,498,332,581,332,996,0,0,332,0,83,166,8134,664,332,996,415,0,83,83,0,0,0,249,0,0,83,581,1992,415,166,332,83,581,498,0,0,0,83,0,332,0,332,0,83,0,0,0,0,83,0,83,1494,0,0,83,0,0,332,0,664,0,498,166,0,0,83,83,415,83,0,0,4482,0,0,747,0,0,166,249,166,0,332,498,166,166,83,0,0,0,0,0,249,0,166,166,0,0,0,1992,0,0,249,0,166,0
38d5ad2d22b61109fd8e7b43cd0e8901,1494,7304,249,2075,1909,24485,581,6723,2241,25149,9960,332,83,0,664,249,415,249,0,2739,0,1245,2656,83,1079,747,0,4150,0,5478,3569,4233,581,3984,83,1577,8300,332,1079,6308,40338,3320,249,12284,1245,249,83,2075,498,166,83,10043,249,0,2905,17845,10458,3818,5893,4565,332,15770,41832,0,0,0,4814,332,2324,830,3652,1162,332,0,0,249,0,332,83,1079,51792,2324,166,830,0,0,2905,747,5893,83,2656,249,4067,664,5063,332,664,1079,581,0,6225,166,83,6225,415,1162,1992,2822,1577,332,1494,1826,3320,1743,1992,166,166,498,664,249,913,1743,0,6474,1162,83,0,7387,166,83,3735,830,1826,0
1c60154546102e6525f68cb4f31e0657,0,83,0,83,83,996,0,166,0,1079,747,166,0,0,0,0,0,0,0,0,0,0,83,0,0,0,0,166,0,249,166,498,83,581,0,0,166,0,0,0,4233,0,0,913,0,0,0,166,0,83,0,249,0,0,0,2324,664,83,415,0,166,249,10956,0,0,0,166,0,0,0,0,166,0,0,0,0,0,0,0,83,664,0,0,0,0,0,0,0,166,0,83,0,0,0,415,0,83,83,0,0,166,0,0,0,0,0,0,0,0,0,166,0,1245,0,0,0,0,0,0,0,0,0,0,166,83,0,83,2241,0,0,83,0,0,0
91690261186ae5bee8f83808ea1e4a01,498,3901,415,913,332,8881,166,3071,415,8300,5395,664,1743,0,83,83,166,913,83,1577,0,830,2407,166,1079,498,0,2822,0,4897,5810,1826,332,3901,0,166,2324,83,83,2407,45401,2075,249,8217,3071,0,83,664,83,83,83,2241,0,0,1245,5893,4067,1494,3486,1577,249,5976,15604,0,166,83,2158,498,996,1245,1411,498,415,0,0,415,0,332,83,332,11039,1909,166,498,83,83,1245,498,2158,166,913,83,1577,0,1328,0,498,415,249,0,3237,0,249,3320,415,1162,996,747,1162,166,830,1162,1162,0,1743,332,415,166,249,166,913,83,166,3237,332,166,332,5976,0,83,2324,1079,1328,83


## Orders

In [21]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.orders_preprocessed')
if not table.exists():
  schema = bq.Schema([{'name': 'order_id', 'type': 'STRING'},
                      {'name': 'driver_id', 'type': 'STRING'},
                      {'name': 'passenger_id', 'type': 'STRING'},
                      {'name': 'start_district_hash', 'type': 'STRING'},
                      {'name': 'dest_district_hash', 'type': 'STRING'},
                      {'name': 'price', 'type': 'FLOAT'},
                      {'name': 'time', 'type': 'STRING'}])
  table.create(schema)
  
wildpath = '{}/{}/order_data/*'.format(season_name, subdirectory_name)
for localpath in glob.glob(wildpath):
  print 'loading {}'.format(localpath)
  storagepath = os.path.join(bucket_path,localpath)
  process_datafile(localpath, storagepath, table, mode='append')

loading season_2/training_data/order_data/order_data_2016-01-16
loading season_2/training_data/order_data/order_data_2016-01-06
loading season_2/training_data/order_data/order_data_2016-01-21
loading season_2/training_data/order_data/order_data_2016-01-14
loading season_2/training_data/order_data/order_data_2016-01-02
loading season_2/training_data/order_data/order_data_2016-01-04
loading season_2/training_data/order_data/order_data_2016-01-05
loading season_2/training_data/order_data/order_data_2016-01-17
loading season_2/training_data/order_data/order_data_2016-01-09
loading season_2/training_data/order_data/order_data_2016-01-15
loading season_2/training_data/order_data/order_data_2016-01-12
loading season_2/training_data/order_data/order_data_2016-01-08
loading season_2/training_data/order_data/order_data_2016-01-18
loading season_2/training_data/order_data/order_data_2016-01-20
loading season_2/training_data/order_data/order_data_2016-01-11
loading season_2/training_data/order_dat

In [25]:
%%bigquery udf --module orders_create_additional_fields

/**
 * Pad with 0 or given string.
 *
 * @param int n Number to add padding to.
 * @param int width Width of number + padding.
 * @param string z (Optional) Other string to replace '0' as padding.
 */
function pad(n, width, z) {
  z = z || '0';
  n = n + '';
  return n.length >= width ? n : new Array(width - n.length + 1).join(z) + n;
}

/**
 * Create additional fields on orders table for gaps table creation.
 *
 * @param {{order_id: string, driver_id: string, passenger_id: string,
            start_district_hash: string, dest_district_hash: string, price: float,
            time: string}} r
 * @param function({{order_id: string, driver_id: string, passenger_id: string,
                     start_district_hash: string, dest_district_hash: string, price: float,
                     time: string, timeslot: string, timeofday_slot: integer, day_in_week: integer,
                     date: string}}) emitFn
 */
function(r, emitFn) {
  var t = r.time.split(/[ :\-]/);
  var slot = Math.floor((parseInt(t[3]) * 60 + parseInt(t[4])) / 10) + 1;
  r.timeslot = t[0] + '-' + pad(t[1], 2) +
               '-' + pad(t[2], 2) + '-' + slot;
  r.timeofday_slot = slot;
  r.date = t[0] + '-' + pad(t[1], 2) + '-' + pad(t[2], 2);
  r.day_in_week = new Date(parseInt(t[0]), parseInt(t[1])-1, parseInt(t[2])).getDay();
  emitFn(r);
}

In [26]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.orders -m overwrite
SELECT order_id, driver_id, passenger_id,
  start_district_hash, dest_district_hash,
  price, time, timeslot, timeofday_slot, day_in_week, date
FROM orders_create_additional_fields([datalab-projects-1331:xjk_algo_comp.orders_preprocessed])

Response too large to return. Consider setting allowLargeResults to true in your job configuration. For more information, see https://cloud.google.com/bigquery/troubleshooting-errors
