In [1]:
import pdb
import numpy as np
import gcp.bigquery as bq
import gcp.storage as storage
from sklearn.pipeline import Pipeline
# Prefer the C-accelerated cPickle (Python 2); fall back to the standard
# pickle module when it is unavailable (e.g. on Python 3).
try:
  import cPickle as pickle
except ImportError:
  # Only a missing module should trigger the fallback; any other error
  # (KeyboardInterrupt, a broken install, ...) must surface.
  import pickle
# NOTE(review): presumably the file the fitted estimator is pickled to; its
# use is not visible in this section -- confirm against the later cells.
EST_PICKLE_FILENAME = 'baseline_final_estimator.pkl'

# Columns copied from every query row into the data matrices, in column order.
# Index 0 ('gap') is left out of `features` below (presumably the target).
# First feature HAS to be 'district_id' for MAPE calculation: mape() reads the
# district id from column 0 of the feature matrix.
fields = ['gap', 'district_id', 'timeofday_slot', 'day_in_week', 'sum_price', 'traffic_tj_level1',
            'traffic_tj_level2', 'traffic_tj_level3', 'traffic_tj_level4']
# Every column except the leading 'gap' column.
features = fields[1:]

# Scorer Creation (MAPE)

In [None]:
def mape(X, predictions, y, num_timeslots=43, num_districts=66):
  """Mean absolute percentage error, averaged per district and per timeslot.

  For each distinct district id found in column 0 of ``X`` the relative
  errors ``|y - prediction| / y`` of that district's rows are summed and
  divided by ``num_timeslots``; the per-district values are then summed and
  divided by ``num_districts``.

  Args:
    X: 2-D array-like of features; column 0 must hold the district id.
    predictions: predicted values, one per row of ``X`` (1-D, or a 2-D
      row/column vector).
    y: true values, one per row of ``X`` (same accepted shapes). Entries
      must be non-zero, since they are used as the denominator.
    num_timeslots: divisor applied to each district's summed error.
      Defaults to 43, the value previously hard-coded.
    num_districts: divisor applied to the sum over districts. Defaults to
      66, the value previously hard-coded.

  Returns:
    The MAPE as a float (0.0 when every prediction equals its target).
  """
  X = np.asarray(X)
  # Flatten to 1-D floats: accepts row or column vectors alike and avoids
  # integer (floor) division on Python 2 when integer arrays are passed.
  y = np.asarray(y, dtype=float).ravel()
  predictions = np.asarray(predictions, dtype=float).ravel()

  district_ids = X[:, 0]
  relative_errors = np.absolute((y - predictions) / y)

  # One score per distinct district id, in sorted id order.
  district_scores = [
      np.sum(relative_errors[district_ids == key]) / num_timeslots
      for key in np.unique(district_ids)
  ]
  return np.sum(district_scores) / num_districts

def mape_scorer(estimator, X, y):
  """Score ``estimator`` on ``(X, y)`` as the negated MAPE.

  The estimator's predictions for ``X`` are passed to ``mape`` and the sign
  of the result is flipped, so a higher score means a lower error.
  """
  error = mape(X, estimator.predict(X), y)
  return -error

Testing MAPE

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): `est` is not used in this cell; kept in case later cells rely on it.
est = LogisticRegression()

# Tiny fixture: column 0 is the district id (districts 1 and 2, two rows each).
X = np.array([[1, 1], [1, 2], [2, 3], [2, 4]])
predictions = np.array([1, 2, 3, 4])
y = np.array([1, 2, 3, 4])

# Perfect predictions must give an error of exactly zero; fail loudly otherwise.
assert mape(X, predictions, y) == 0.0, 'mape() should be 0.0 when predictions equal y'

# Should return 0.0
mape(X, predictions, y)

# Feature Selection

In [None]:
%%sql --module q_all

-- Training rows: every column of future_gaps_processed where gap > 0, plus
--   hash_value         : HASH of district_id (cast to string) combined with timeslot
--   included_in_sample : 'True' when ABS(hash) is odd, otherwise 'False'
-- The flag is only computed here; this query does not filter on it.
-- NOTE(review): `+` on two strings is presumably concatenation in legacy SQL -- confirm.
SELECT *, HASH(CAST(district_id AS STRING) +timeslot) AS hash_value,
  IF(ABS(HASH(CAST(district_id AS STRING) + timeslot)) % 2 == 1, 'True', 'False')
    AS included_in_sample
FROM [datalab-projects-1331:xjk_algo_comp.future_gaps_processed]
WHERE gap > 0

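# The above query adds a hash-based `included_in_sample` flag for pseudo-random sampling; it has no ORDER BY, so rows come back in no guaranteed order.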

In [None]:
# Wrap the `q_all` SQL module in a BigQuery query object and fetch its results.
# NOTE(review): results() presumably executes the query and blocks until it
# finishes -- confirm against the gcp.bigquery documentation.
query = bq.Query(q_all)
# Later cells read `tableresult.length` and iterate over it row by row.
tableresult = query.results()

In [None]:
# Copy the query results into a dense float matrix: one row per result row,
# one column per entry of `fields` (same order as `fields`).
all_data = np.zeros((tableresult.length, len(fields)))
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the call style used elsewhere in this notebook.
print('there are {} rows'.format(tableresult.length))
for rcounter, row in enumerate(tableresult):
  for fcounter, field in enumerate(fields):
    all_data[rcounter, fcounter] = row[field]
  # Progress indicator, emitted every 5000 rows (including row 0).
  if rcounter % 5000 == 0:
    print('processed {} rows'.format(rcounter))

In [None]:
# Read the newline-separated list of timeslots to test from a GCS object,
# then render it as a comma-separated list of single-quoted literals.
item = storage.Item('datalab-projects-1331-datalab','data/timeslots_to_test.txt')
timeslots_to_test = item.read_from().strip().split('\n')
tquery = ','.join("'{}'".format(slot) for slot in timeslots_to_test)
print(tquery)

In [None]:
%%sql --module q_all_t

-- Test rows: every column of the xjk_algo_comp_test table where gap > 0,
-- EXCLUDING the 43 timeslot literals listed below, sorted by timeslot and
-- then district_id.
-- NOTE(review): the literal list is presumably a pasted copy of the `tquery`
-- string printed by the previous cell -- confirm they stay in sync.
SELECT *
FROM [datalab-projects-1331:xjk_algo_comp_test.future_gaps_processed]
WHERE gap > 0 AND timeslot NOT IN ('2016-01-22-46','2016-01-22-58','2016-01-22-70','2016-01-22-82',
    '2016-01-22-94','2016-01-22-106','2016-01-22-118','2016-01-22-130','2016-01-22-142',
    '2016-01-24-58','2016-01-24-70','2016-01-24-82','2016-01-24-94','2016-01-24-106',
    '2016-01-24-118','2016-01-24-130','2016-01-24-142','2016-01-26-46','2016-01-26-58',
    '2016-01-26-70','2016-01-26-82','2016-01-26-94','2016-01-26-106','2016-01-26-118',
    '2016-01-26-130','2016-01-26-142','2016-01-28-58','2016-01-28-70','2016-01-28-82',
    '2016-01-28-94','2016-01-28-106','2016-01-28-118','2016-01-28-130','2016-01-28-142',
    '2016-01-30-46','2016-01-30-58','2016-01-30-70','2016-01-30-82','2016-01-30-94',
    '2016-01-30-106','2016-01-30-118','2016-01-30-130','2016-01-30-142')
ORDER BY timeslot, district_id

# Test dataset - used to check if estimator can generalize well to new data.

In [None]:
# Wrap the `q_all_t` SQL module (test dataset) in a BigQuery query object and
# fetch its results.
# NOTE(review): results() presumably executes the query and blocks until it
# finishes -- confirm against the gcp.bigquery documentation.
query_t = bq.Query(q_all_t)
# Later cells read `tableresult_t.length` and iterate over it row by row.
tableresult_t = query_t.results()

In [None]:
# Copy the test-query results into a dense float matrix: one row per result
# row, one column per entry of `fields` (same order as `fields`).
all_data_t = np.zeros((tableresult_t.length, len(fields)))
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the call style used elsewhere in this notebook.
print('there are {} rows'.format(tableresult_t.length))
for rcounter, row in enumerate(tableresult_t):
  for fcounter, field in enumerate(fields):
    all_data_t[rcounter, fcounter] = row[field]
  # Progress indicator, emitted every 1000 rows (including row 0).
  if rcounter % 1000 == 0:
    print('processed {} rows'.format(rcounter))

# Run Prediction and Export CSV