In [25]:
from cStringIO import StringIO
import gzip
import os
import glob
import re
import pdb

import gcp
from gcp import storage
from gcp import bigquery as bq
import pandas as pd

# Import zip file from Google Cloud Storage
project = gcp.Context.default().project_id
bucket_name = project + '-datalab'
bucket_path = 'gs://' + bucket_name + '/data'
print 'Bucket: ' + bucket_path
compressed_filename = 'citydata2'
season_name = 'season_2'
subdirectory_name = 'training_data'

Bucket: gs://datalab-projects-1331-datalab/data


In [26]:
# Downloading and extracting the files from cloud storage.

def process_datafile(localpath, storagepath, table, mode='create', overwrite=False, ):
  global compressed_filename
  gzip_filename = '{}.tar.gz'.format(compressed_filename)
  tar_filename = '{}.tar'.format(compressed_filename)
  datadir = 'season_2'
  storagepath_r = storagepath.split('/')
  # If data has not been extracted, extract it.
  if not os.path.isdir(datadir):
    # If citydata.tar has not been downloaded, download it
    if not os.path.isfile(tar_filename):
      
      bucket_object = '{}/{}.tar.gz'.format('/'.join(storagepath_r[:3]), compressed_filename)
      %storage read --object $bucket_object --variable compressed_file
      
      gzip_file = gzip.GzipFile(fileobj=StringIO(compressed_file))
      del compressed_file

      import shutil
      with open(tar_filename, 'wb') as f_out:
        shutil.copyfileobj(gzip_file, f_out)

    import tarfile
    tar = tarfile.open(tar_filename, "r")
    tar.extractall()
    tar.close()
    os.remove(tar_filename)

  # Upload extracted file into GCS
  bucketname = storagepath_r[2]
  itempath = '/'.join(storagepath_r[3:])
  item = storage.Item(bucketname, itempath)
  if not item.exists() or overwrite:    
    with open(localpath, 'rb') as f:
      item.write_to(f.read(), 'text/plain')
  # Load data into Google BigQuery
  table.load(storagepath, mode=mode, csv_options=bq.CSVOptions(delimiter='\t'))

# Process Training Data

## Districts

In [27]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.districts')
if not table.exists():
  schema = bq.Schema([
      {'name': 'district_hash', 'type': 'STRING'},
      {'name': 'district_id', 'type': 'INTEGER'}
    ])
  table.create(schema)
  
localpath = '{}/{}/cluster_map/cluster_map'.format(season_name, subdirectory_name)
storagepath = os.path.join(bucket_path,localpath)
process_datafile(localpath, storagepath, table, mode='append')

Undefined variable referenced in command line: $bucket_object


UnboundLocalError: local variable 'compressed_file' referenced before assignment

In [23]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.districts -m overwrite
SELECT LAST(district_hash) AS district_hash, district_id
FROM [datalab-projects-1331:xjk_algo_comp.districts]
GROUP BY district_id

district_hash,district_id
90c5a34f06ac86aee0fd70e2adce7d8a,1
f2c8c4bb99e6377d21de71275afd6cd2,2
58c7a4888306d8ff3a641d1c0feccbe3,3
b26a240205c852804ff8758628c0a86a,4
4b9e4cf2fbdc8281b8a1f9f12b80ce4d,5
1cbfbdd079ef93e74405c53fcfff8567,6
929ec6c160e6f52c20a4217c7978f681,7
82cc4851f9e4faa4e54309f8bb73fd7c,8
b702e920dcd2765e624dc1ce3a770512,9
4f4041f7db0c7f69892d9b74c1a7efa1,10


## Weather

In [14]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.weather')
if not table.exists():
  schema = bq.Schema([
      {'name': 'time', 'type': 'STRING'},
      {'name': 'weather', 'type': 'INTEGER'},
      {'name': 'temperature', 'type': 'FLOAT'},
      {'name': 'pm25', 'type': 'FLOAT'}
    ])
  table.create(schema)
  
wildpath = '{}/{}/weather_data/*'.format(season_name, subdirectory_name)
for localpath in glob.glob(wildpath):
  storagepath = os.path.join(bucket_path,localpath)
  process_datafile(localpath, storagepath, table, mode='append')

In [15]:
%%bigquery udf --module transform_weather_time

/**
 * Pad with 0 or given string.
 *
 * @param int n Number to add padding to.
 * @param int width Width of number + padding.
 * @param string z (Optional) Other string to replace '0' as padding.
 */
function pad(n, width, z) {
  z = z || '0';
  n = n + '';
  return n.length >= width ? n : new Array(width - n.length + 1).join(z) + n;
}

/**
 * Transform timestamps of weather table into timeslots in weather table.
 *
 * @param {{time: string, weather: integer, temperature: float, pm25: float}} r
 * @param function({{time: string, weather: integer, temperature: float, pm25: float,
                     timeslot: string}}) emitFn
 */
function(r, emitFn) {
  var t = r.time.split(/[ :\-]/);
  var slot = Math.floor((parseInt(t[3]) * 60 + parseInt(t[4])) / 10) + 1;
  r.timeslot = t[0] + '-' + pad(t[1], 2) +
               '-' + pad(t[2], 2) + '-' + slot;
  emitFn(r);
}

In [20]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.weather -m overwrite
SELECT LAST(time) AS time, LAST(weather) AS weather,
  LAST(temperature) AS temperature, LAST(pm25) AS pm25, timeslot
FROM transform_weather_time([datalab-projects-1331:xjk_algo_comp.weather])
GROUP BY timeslot

time,weather,temperature,pm25,timeslot
2016-01-18 00:06:04,3,3.0,88.0,2016-01-18-1
2016-01-18 00:16:13,3,3.0,88.0,2016-01-18-2
2016-01-18 00:25:22,3,3.0,88.0,2016-01-18-3
2016-01-18 00:35:40,3,3.0,88.0,2016-01-18-4
2016-01-18 00:45:34,3,3.0,88.0,2016-01-18-5
2016-01-18 00:55:33,3,3.0,82.0,2016-01-18-6
2016-01-18 01:05:25,3,3.0,82.0,2016-01-18-7
2016-01-18 01:15:22,3,3.0,82.0,2016-01-18-8
2016-01-18 01:25:34,3,3.0,82.0,2016-01-18-9
2016-01-18 01:35:38,3,3.0,82.0,2016-01-18-10


## Traffic

In [7]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.traffic')
if not table.exists():
  schema = bq.Schema([
      {'name': 'district_hash', 'type': 'STRING'},
      {'name': 'tj_level1', 'type': 'INTEGER'},
      {'name': 'tj_level2', 'type': 'INTEGER'},
      {'name': 'tj_level3', 'type': 'INTEGER'},
      {'name': 'tj_level4', 'type': 'INTEGER'},
      {'name': 'tj_time', 'type': 'STRING'}
    ])
  table.create(schema)
  
wildpath = '{}/{}/traffic_data/*'.format(season_name, subdirectory_name)
for localpath in glob.glob(wildpath):
  with open(localpath, 'rb') as f:
    text = f.read()
  with open(localpath, 'wb') as f:
    f.write(re.sub(r'\b\t[0-9]:\b', '\t', text))
  storagepath = os.path.join(bucket_path,localpath)
  process_datafile(localpath, storagepath, table, mode='append', overwrite=True)

In [8]:
%%bigquery udf --module transform_traffic_time
  
/**
 * Pad with 0 or given string.
 *
 * @param int n Number to add padding to.
 * @param int width Width of number + padding.
 * @param string z (Optional) Other string to replace '0' as padding.
 */
function pad(n, width, z) {
  z = z || '0';
  n = n + '';
  return n.length >= width ? n : new Array(width - n.length + 1).join(z) + n;
}

/**
 * Transform timestamps of weather table into timeslots in traffic table.
 *
 * @param {{district_hash: string, tj_level1: integer, tj_level2: integer, tj_level3: integer, 
            tj_level4: integer, tj_time: string}} r
 * @param function({{district_hash: string, tj_level1: integer, tj_level2: integer, tj_level3: integer, 
                     tj_level4: integer, tj_time: string, timeslot: string}}) emitFn
 */
function(r, emitFn) {
  var t = r.tj_time.split(/[ :\-]/);
  var slot = Math.floor((parseInt(t[3]) * 60 + parseInt(t[4])) / 10) + 1;
  r.timeslot = t[0] + '-' + pad(t[1], 2) +
               '-' + pad(t[2], 2) + '-' + slot;
  emitFn(r);
}

In [21]:
%%bigquery execute -t datalab-projects-1331:xjk_algo_comp.traffic -m overwrite
  
SELECT LAST(district_hash) AS district_hash, LAST(tj_level1) AS tj_level1,
  LAST(tj_level2) AS tj_level2, LAST(tj_level3) AS tj_level3, LAST(tj_level4) AS tj_level4,
  LAST(tj_time) AS tj_time, timeslot
FROM transform_traffic_time([datalab-projects-1331:xjk_algo_comp.traffic])
GROUP BY timeslot

district_hash,tj_level1,tj_level2,tj_level3,tj_level4,tj_time,timeslot
4725c39a5e5f4c188d382da3910b3f3f,3063,471,156,141,2016-01-16 00:10:27,2016-01-16-2
d4ec2125aff74eded207d2d915ef682f,3979,675,199,143,2016-01-16 00:20:26,2016-01-16-3
d4ec2125aff74eded207d2d915ef682f,3930,683,172,124,2016-01-16 00:30:26,2016-01-16-4
d4ec2125aff74eded207d2d915ef682f,4009,649,178,128,2016-01-16 00:40:27,2016-01-16-5
82cc4851f9e4faa4e54309f8bb73fd7c,2527,287,87,116,2016-01-16 00:50:26,2016-01-16-6
4725c39a5e5f4c188d382da3910b3f3f,2928,447,137,127,2016-01-16 01:00:24,2016-01-16-7
4725c39a5e5f4c188d382da3910b3f3f,2893,409,150,142,2016-01-16 01:10:24,2016-01-16-8
d4ec2125aff74eded207d2d915ef682f,3734,531,130,106,2016-01-16 01:20:24,2016-01-16-9
4725c39a5e5f4c188d382da3910b3f3f,2746,434,154,107,2016-01-16 01:30:25,2016-01-16-10
4725c39a5e5f4c188d382da3910b3f3f,2837,408,112,119,2016-01-16 01:40:23,2016-01-16-11


## POIs

In [10]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.pois')
  
localpath = '{}/{}/poi_data/poi_data'.format(season_name, subdirectory_name)

pois = []
pois_schema = [{'name': 'district_hash', 'type': 'STRING'}]
with open(localpath, 'rb') as f:
  for line in f:
    line_pois = map(lambda x: ['f{}'.format(x.split(':')[0].replace('#', '_')), x.split(':')[1]],
                    line.split('\t')[1:])
  for poi in line_pois:
    if poi[0] not in pois:
      pois.append(poi[0])
      pois_schema.append({'name': poi[0], 'type': 'INTEGER'})
pois.sort()
pois_schema = sorted(pois_schema, key=lambda k: k['name']) 

if not table.exists():
  schema = bq.Schema.from_data(pois_schema)
  table.create(schema)
  
  pois_data = pd.DataFrame(columns=['district_hash'] + pois)
  with open(localpath, 'rb') as f:
    for line in f:
      hash_pois = {}
      for poi_line in line.split('\t')[1:]:
        hash_pois['f{}'.format(poi_line.split(':')[0].replace('#', '_'))] = poi_line.split(':')[1]
      poi_data = [line.split('\t')[0]]
      # hash_pois = {f1_1: 15, ...}
      # pois = ['f1_1', ...]
      for poi in pois:
        value = '0'
        if poi in hash_pois:
          value = hash_pois[poi].strip()
        poi_data.append(value)
      pois_data.loc[len(pois_data)] = poi_data
  for poi in pois:
    pois_data[poi] = pd.to_numeric(pois_data[poi])
  table.insert_data(pois_data)

IOError: [Errno 2] No such file or directory: 'season_2/training_data/poi_data/poi_data'

Following code can be used to print out feature fields to be used when selecting from tables.

In [None]:
final_text = ''
for counter, poi_text in enumerate(map(lambda x: 'FIRST(pois.{}) AS {}'.format(x,x), pois)):
  if counter%3 == 0:
    final_text = '{}\n'.format(final_text)
  final_text = '{}{}, '.format(final_text, poi_text)
print final_text[1:(len(final_text)-2)]

## Orders

In [None]:
table = bq.Table('datalab-projects-1331:xjk_algo_comp.orders')
if not table.exists():
  schema = bq.Schema([{'name': 'order_id', 'type': 'STRING'},
                      {'name': 'driver_id', 'type': 'STRING'},
                      {'name': 'passenger_id', 'type': 'STRING'},
                      {'name': 'start_district_hash', 'type': 'STRING'},
                      {'name': 'dest_district_hash', 'type': 'STRING'},
                      {'name': 'price', 'type': 'FLOAT'},
                      {'name': 'time', 'type': 'STRING'}])
  table.create(schema)
  
  wildpath = '{}/{}/order_data/*'.format(season_name, subdirectory_name)
  for localpath in glob.glob(wildpath):
    print 'loading {}'.format(localpath)
    storagepath = os.path.join(bucket_path,localpath)
    process_datafile(localpath, storagepath, table, mode='append')