In [1]:
import pdb
import numpy as np
from tables import *
import gcp.bigquery as bq
import gcp.storage as storage
from sklearn.metrics import make_scorer

try:
  import cPickle as pickle
except:
  import pickle
FIELDS_PICKLE = 'fields-4.pkl'
DATAFILE_PATH = 'xjk_pytable.h5'

seed = 13
np.random.seed(seed)

fields = pickle.load(open(FIELDS_PICKLE, "r") )
features = fields[1:]

# Use this instead of len(features) since this variable can change
# e.g. when one hot encoding is used and/or new features are added.
n_features = len(features)

print "Number of features: {}".format(len(features))
print "Features:"
print features

Number of features: 163
Features:
['timeofday_slot', 'day_in_week', 'weather_1_slots_ago', 'weather_2_slots_ago', 'weather_3_slots_ago', 'tj_level1_1_slots_ago', 'tj_level2_1_slots_ago', 'tj_level3_1_slots_ago', 'tj_level4_1_slots_ago', 'tj_level1_2_slots_ago', 'tj_level2_2_slots_ago', 'tj_level3_2_slots_ago', 'tj_level4_2_slots_ago', 'tj_level1_3_slots_ago', 'tj_level2_3_slots_ago', 'tj_level3_3_slots_ago', 'tj_level4_3_slots_ago', 'temperature_1_slots_ago', 'pm25_1_slots_ago', 'temperature_2_slots_ago', 'pm25_2_slots_ago', 'temperature_3_slots_ago', 'pm25_3_slots_ago', 'gap_1_slots_ago', 'sum_price_1_slots_ago', 'gap_2_slots_ago', 'sum_price_2_slots_ago', 'gap_3_slots_ago', 'sum_price_3_slots_ago', 'f1', 'f11', 'f11_1', 'f11_2', 'f11_3', 'f11_4', 'f11_5', 'f11_6', 'f11_7', 'f11_8', 'f13_4', 'f13_8', 'f14', 'f14_1', 'f14_10', 'f14_2', 'f14_3', 'f14_6', 'f14_8', 'f15', 'f15_1', 'f15_2', 'f15_3', 'f15_4', 'f15_6', 'f15_7', 'f15_8', 'f16', 'f16_1', 'f16_10', 'f16_11', 'f16_12', 'f16_3', 

# Load Dataset

In [2]:
fileh1 = open_file(DATAFILE_PATH, mode = 'r')

object = fileh1.get_node('/train', 'gaps')
object_array_data = object.read()
fileh1.close()

# Convert to vectorized array that we can use in further processing.
all_data = np.zeros((object_array_data.shape[0], len(fields)))
print 'there are {} rows'.format(object_array_data.shape[0])
for rcounter, row in enumerate(object_array_data):
  for fcounter, field in enumerate(fields):
    all_data[rcounter, fcounter] = row[field]
  if rcounter % 5000 == 0:
    print 'processed {} rows'.format(rcounter)
all_data_original = np.copy(all_data)

there are 102592 rows
processed 0 rows
processed 5000 rows
processed 10000 rows
processed 15000 rows
processed 20000 rows
processed 25000 rows
processed 30000 rows
processed 35000 rows
processed 40000 rows
processed 45000 rows
processed 50000 rows
processed 55000 rows
processed 60000 rows
processed 65000 rows
processed 70000 rows
processed 75000 rows
processed 80000 rows
processed 85000 rows
processed 90000 rows
processed 95000 rows
processed 100000 rows


# One Hot Encoding

One hot encoding is a process of converting categorical data into binary i.e. from `day_in_week` to `day_in_week_0`, `day_in_week_1`, ... , `day_in_week_6`, each contains value 0 or 1. The former version cannot be used for machine learning as the numerical data were meaningless e.g. Sunday is 0 and Tuesday is 2, but it doesn't mean that Tuesday is "higher" or "larger" in mathematical sense.

In [11]:
from sklearn.preprocessing import OneHotEncoder

print "np.nan=", np.where(np.isnan(all_data_original))

# Impute all NaN with numbers
all_data[np.isnan(all_data)] = 0

data = all_data[:,1:]
targets = all_data[:,0]

one_hot = OneHotEncoder(categorical_features=[0, 1, 2, 3, 4], sparse=False)
one_hot.fit(data)

np.nan= (array([  1343,   1343,   1343, ..., 102591, 102591, 102591]), array([ 3,  4,  5, ..., 21, 22, 23]))


OneHotEncoder(categorical_features=[0, 1, 2, 3, 4], dtype=<type 'float'>,
       handle_unknown='error', n_values='auto', sparse=False)

In [12]:
one_hot.n_values_

array([145,   7,  10,  10,  10])

In [13]:
one_hot.feature_indices_

array([  0, 145, 152, 162, 172, 182])

In [14]:
data = one_hot.transform(data_original)
n_features = data.shape[1]
print 'new number of features: {}'.format(n_features)

new number of features: 336


# Scorer Creation (MAPE)

In [15]:
def mape(y, predictions):
#   num_timeslots = 43
#   num_districts = 66
  if len(y.shape) == 1:
    y = np.asmatrix(y)
  if len(predictions.shape) == 1:
    predictions = np.asmatrix(predictions)
  y = y.astype(float)
  predictions = predictions.astype(float)
  return np.mean(np.absolute((y-predictions)/y))

# from keras import backend as K

# def mape(y, predictions):
#   return K.mean(K.abs(y-predictions/K.clip(K.abs(y), K.epsilon(), np.inf)), axis=-1)

mape_scorer = make_scorer(mape, greater_is_better=False)

Testing MAPE

In [16]:
from sklearn.linear_model import LogisticRegression

predictions = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')
y = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')

# Should return 0.0
print mape(y, predictions)

# Should return higher score
predictions = np.array([1.0, 2.0, 2.0, 3.0]).astype('float32')
print(mape(y, predictions))

# Should return highest score
predictions = np.array([1000.0, 22.0, 11.0, 31.0]).astype('float32')
print(mape(y, predictions))

# est = LogisticRegression()
# X = np.random.rand(10,4)
# y = X.sum(axis=1)
# est.fit(X, y)
# predictions = est.predict(X)
# print(mape(y, predictions))

0.0
0.145833333333
254.604166667


# Building and Testing Algorithm(s)

In [17]:
from sklearn.base import BaseEstimator, RegressorMixin
from scipy.sparse import coo_matrix, hstack

class CustomRegressor(BaseEstimator, RegressorMixin):
  def __init__(self):
    pass

  def fit(self, X, y):
#     self.classes_, indices = np.unique(["foo", "bar", "foo"],
#                                     return_inverse=True)
#     self.majority_ = np.argmax(np.bincount(indices))
    return self

  def predict(self, X):
    # 56: gap_1_slots_ago
    # 58: gap_2_slots_ago
    # 60: gap_3_slots_ago
#     X = X.tocsr()
#     v1 = coo_matrix(np.asmatrix(np.ones(X.shape[0])).T)
    v1 = np.asmatrix(np.ones(X.shape[0]))
    v2 = np.asmatrix((X[:, 23]*0.65+X[:, 25]*0.25+X[:, 27]*0.15)/2)
    predictions = np.asarray(np.concatenate((v1, v2), axis=0).max(axis=0))
    
    return predictions
  
custom_est = CustomRegressor()
custom_est.fit(data, targets)
custom_predictions = custom_est.predict(data)
print(mape(targets, custom_predictions))

0.519787107909


In [18]:
all_data = np.copy(all_data_original)
data = all_data[:,1:]
targets = all_data[:,0]

from sklearn.cross_validation import KFold
from sklearn.preprocessing import Imputer, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

steps = [
  # n_values needs to be set to avoid problems where not all classes exist in training data.
  ('one_hot', OneHotEncoder(categorical_features=[0, 1, 2, 3, 4], sparse=False,
                            n_values=[145, 7, 10, 10, 10])),
  ('estimate', DecisionTreeRegressor(random_state=seed))
]


folds = KFold(len(targets), n_folds=10, shuffle=True, random_state=seed)
scores = []
for fold, (train_ids, test_ids) in enumerate(folds):
  print "Now training fold {}".format(fold+1)
  est = Pipeline(steps)
  est.fit(data[train_ids], targets[train_ids])
  preds = est.predict(data[test_ids])
  score = mape(targets[test_ids], preds)
  scores.append(score)
  print "Score: {}".format(score)

Now training fold 1
Score: 1.27281437221
Now training fold 2
Score: 1.3630237399
Now training fold 3
Score: 1.42335299024
Now training fold 4
Score: 1.42799111235
Now training fold 5
Score: 1.45308008865
Now training fold 6
Score: 1.34971090885
Now training fold 7
Score: 1.50255585105
Now training fold 8
Score: 1.3823985588
Now training fold 9
Score: 1.39588625226
Now training fold 10
Score: 1.38658224032


In [19]:
print "Mean score: {}".format(sum(scores)/len(scores))

Mean score: 1.39573961147
