In [1]:
import pdb
import gcp
import gcp.bigquery as bq
import gcp.storage as storage
from cStringIO import StringIO
import gzip
import os
import glob
# BigQuery dataset id in "<project>:<dataset>" form; the cells below append
# ".<table>" to it to address individual tables.
BQ_DATASET_NAME = 'datalab-projects-1331:xjk_algo_comp'
dataset = bq.DataSet(BQ_DATASET_NAME)

# Cloud Storage location derived from the default context's project id:
# gs://<project_id>-datalab
project = gcp.Context.default().project_id
bucket_name = ''.join([project, '-datalab'])
bucket_path = ''.join(['gs://', bucket_name])

In [None]:
%%sql --module districts
SELECT *
FROM [datalab-projects-1331:xjk_algo_comp.districts]

In [None]:
# Execute the `districts` SQL module and materialise the result as a dataframe.
df_districts = bq.Query(districts).to_dataframe()

# Preview the first ten rows only.
df_districts.head(10)

# Download and extract data files from cloud storage
The data files are stored as a compressed archive in Google Cloud Storage (GCS). We need to download the archive, extract the data files to the local drive, and then upload them back to GCS before they can be used in BigQuery.

In [None]:
tar_filename = 'citydata.tar'
datadir = 'season_1'

# If data has not been extracted, extract it.
if not os.path.isdir(datadir):
  # If citydata.tar has not been downloaded, download it.
  if not os.path.isfile(tar_filename):
    
    # Import zip file from Google Cloud Storage
    bucket_object = bucket_path + '/data/citydata.tar.gz'
    print 'Bucket: ' + bucket_path
    print 'Object: ' + bucket_object

    %storage read --object $bucket_object --variable compressed_file
    
    gzip_file = gzip.GzipFile(fileobj=StringIO(compressed_file))
    
    del compressed_file

    import shutil
    with open(tar_filename, 'wb') as f_out:
      shutil.copyfileobj(gzip_file, f_out)

  import tarfile
  tar = tarfile.open(tar_filename, "r")
  tar.extractall()
  tar.close()
  os.remove(tar_filename)

# Copy all files to GCS and load them into BigQuery

Here we copy the locally stored files to GCS and then tell BigQuery to load them. It seems BigQuery cannot load data directly from Datalab, so this workaround is needed.

In [2]:
def load(localpath, storagepath, table):
  '''Copy the data file at a local path to a Google Cloud Storage object.

  The file is read fully into memory and written once through the
  ``gcp.storage`` API.  Loading the uploaded object into BigQuery is NOT done
  here yet (see the TODO at the end of the function); ``table`` is accepted so
  that existing callers keep working.

  Args:
    localpath(string): Local data file path.
    storagepath(string): Google Cloud Storage's data file path
                         e.g. "gs://project_name/data/datafile.csv"
    table(gcp.bigquery.Table): BigQuery table's `instance
      <http://googlecloudplatform.github.io/datalab/gcp.bigquery.html#gcp.bigquery.Table>`.
      Currently unused.

  Raises:
    ValueError: If ``storagepath`` is not of the form "gs://<bucket>/<key>".
  '''
  # Validate and split the URL first so a malformed destination fails before
  # any file or network I/O happens.
  scheme = 'gs://'
  if not storagepath.startswith(scheme):
    raise ValueError('storagepath must start with "gs://": %r' % (storagepath,))
  # The storage item is addressed by bare bucket name plus object key, not by
  # the "gs://bucket" prefix.
  bucket_name, _, keypath = storagepath[len(scheme):].partition('/')
  if not bucket_name or not keypath:
    raise ValueError(
        'storagepath must look like "gs://<bucket>/<key>": %r' % (storagepath,))

  # `with` guarantees the handle is closed even if the read fails.
  with open(localpath, 'rb') as fo:
    file_str = fo.read()

  # Single upload.  The previous additional `%storage write -v file_str` magic
  # was redundant, and it looks the variable up in the notebook namespace
  # rather than in this function's locals.
  item = gcp.storage.Item(bucket_name, keypath, context=gcp.Context.default())
  item.write_to(file_str, 'text/plain')

  # TODO: load the uploaded object into `table`.  An earlier, disabled attempt:
  #   tablename = '{}:{}.{}'.format(table.name.project_id, table.name.dataset_id, table.name.table_id)
  #   %bigquery load -m append -f csv -S $storagepath -D $tablename -i

## Load Districts

In [None]:
# Handle to the `districts` table inside the configured dataset.
table = bq.Table(BQ_DATASET_NAME + '.districts')

# Create the table, replacing any existing one, with a two-column schema.
table_schema = bq.Schema([
    dict(name='district_hash', type='STRING'),
    dict(name='district_id', type='INTEGER'),
])
table.create(schema=table_schema, overwrite=True)

# Local path of the extracted cluster map; the same relative path is reused
# under "data/" in the bucket.
path = 'season_1/training_data/cluster_map/cluster_map'

load(path, os.path.join(bucket_path, 'data', path), table)

## Load Orders
Load the orders data. Orders are stored as one CSV file per day, and we need to combine them all into one big **orders** table.

In [None]:
%%sql --module orders_count
SELECT count(*)
FROM [datalab-projects-1331:xjk_algo_comp.orders]

In [None]:
# Scratch cell: show the usage/help text of the `%bigquery load` magic.
%bigquery load -h

In [None]:
path = r'data/season_1/training_data/order_data/order_data_2016-01-16'
orders_path = os.path.join(bucket_path, path)
print(orders_path)
dest = 'datalab-projects-1331:xjk_algo_comp.orders'
%bigquery load -m append -f csv -S $orders_path -D $dest -i

In [None]:
# Scratch cell: run `gsutil` with no arguments through the shell escape
# (presumably to see its usage text / confirm it is available).
!gsutil

In [None]:
table = bq.Table(BQ_DATASET_NAME + '.orders')

# Create or overwrite the existing table if it exists
table_schema = bq.Schema([{'name': 'order_id', 'type': 'STRING'},
                          {'name': 'driver_id', 'type': 'STRING'},
                          {'name': 'passenger_id', 'type': 'STRING'},
                          {'name': 'start_district_hash', 'type': 'STRING'},
                          {'name': 'dest_district_hash', 'type': 'STRING'},
                          {'name': 'price', 'type': 'FLOAT'},
                          {'name': 'time', 'type': 'TIMESTAMP'}])
table.create(schema = table_schema, overwrite = True)

path = r'season_1/training_data/order_data'
all_files = glob.glob(os.path.join(path, '*'))

list_html = %storage list -o $bucket_path
list_str = list_html.data

for file_ in all_files:
  fo = open(file_, 'rb')
  file_str = fo.read()
  fo.close()
  datafile_path = os.path.join(bucket_path, file_)
  
  # If object does not exist...
  filename = os.path.basename(file_)
  print('reading', file_)
  if filename not in list_str: 
    # Upload to storage
    %storage write -v file_str -o $datafile_path

  %bigquery load -m append -S source -D orders --dataset xjk_algo_comp
#     df_ = pd.read_csv(file_,index_col=0, sep='\t',
#                       parse_dates=[6], date_parser=dateparse,
#   #                     converters={6:dateparse},
#                       header=None, names=['order_id', 'driver_id', 'passenger_id',
#                                           'start_district_hash', 'dest_district_hash',
#                                           'price', 'time'])



In [None]:
# Scratch cell: write a small string to a test object in the bucket.
test = 'hello'
# Note: this rebinds the notebook-level name `path` that other cells also assign.
path = os.path.join(bucket_path, 'zing', 'test.txt')
print(path)
# `-v test` refers to the variable defined above by name; `$path` is expanded
# to the value of `path` before the magic runs.
%storage write -v test -o $path

In [None]:
# Scratch cell: show the usage/help text of the `%bigquery load` magic.
%bigquery load -h