In [1]:
from tables import *
import pdb
import numpy as np
import gcp.bigquery as bq
import gcp.storage as storage
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Prefer the C-accelerated pickle where it exists (Python 2) and fall back to
# the standard module otherwise. Only a failed import triggers the fallback;
# any other error is allowed to surface.
try:
   import cPickle as pickle
except ImportError:
   import pickle
# Notebook-wide configuration: output file names, RNG seed and the column list.
EST_PICKLE_FILENAME = 'GradientBoostingRegressor_grid_best.pkl'

# Seed numpy's global generator so random draws are repeatable on a fresh run.
seed = 13
np.random.seed(seed)

# Put all categorical data first for easier implementation of One Hot Encoding.
fields_str = """
gap	day_in_week	weather_1_slots_ago	weather_2_slots_ago	weather_3_slots_ago	busy_time	
tj_level1_1_slots_ago	tj_level2_1_slots_ago	tj_level3_1_slots_ago	tj_level4_1_slots_ago	
tj_level1_2_slots_ago	tj_level2_2_slots_ago	tj_level3_2_slots_ago	tj_level4_2_slots_ago	
tj_level1_3_slots_ago	tj_level2_3_slots_ago	tj_level3_3_slots_ago	tj_level4_3_slots_ago	
temperature_1_slots_ago	pm25_1_slots_ago	
temperature_2_slots_ago	pm25_2_slots_ago	
temperature_3_slots_ago	pm25_3_slots_ago	
gap_1_slots_ago	sum_price_1_slots_ago	
gap_2_slots_ago	sum_price_2_slots_ago	
gap_3_slots_ago	sum_price_3_slots_ago	
f1	f11	f11_1	f11_2	f11_3	f11_4	f11_5	f11_6	f11_7	
f11_8	f13_4	f13_8	f14	f14_1	f14_10	f14_2	f14_3	f14_6	f14_8	f15	f15_1	
f15_2	f15_3	f15_4	f15_6	f15_7	f15_8	f16	f16_1	f16_10	f16_11	f16_12	f16_3	
f16_4	f16_6	f17	f17_2	f17_3	f17_4	f17_5	f19	f19_1	f19_2	f19_3	f19_4	f1_1	
f1_10	f1_11	f1_2	f1_3	f1_4	f1_5	f1_6	f1_7	f1_8	f20	f20_1	f20_2	
f20_4	f20_5	f20_6	f20_7	f20_8	f20_9	f21_1	f21_2	f22	f22_1	f22_2	f22_3	
f22_4	f22_5	f23	f23_1	f23_2	f23_3	f23_4	f23_5	f23_6	f24	f24_1	f24_2	f24_3	
f25	f25_1	f25_3	f25_7	f25_8	f25_9	f2_1	f2_10	f2_11	f2_12	f2_13	f2_2	
f2_4	f2_5	f2_6	f2_7	f2_8	f3_1	f3_2	f3_3	f4	f4_1	f4_10	f4_11	
f4_13	f4_14	f4_16	f4_17	f4_18	f4_2	f4_3	f4_5	f4_6	f4_7	f4_8	f4_9	
f5	f5_1	f5_3	f5_4	f6	f6_1	f6_2	f6_4	f7	f8	f8_1	f8_2	f8_3	f8_4	
f8_5
"""
# Split on any run of whitespace rather than on tabs only: the names stay
# separate even if an editor strips the trailing tab at the end of a line
# (a tab-only split would then glue the two names around the newline together).
# str.split() returns a real list, so the slice below works on Python 2 and 3.
fields = fields_str.split()
# The first field (`gap`) is the prediction target; the rest are the features.
features = fields[1:]

# Use this instead of len(features) since this variable can change
# e.g. when one hot encoding is used and/or new features are added.
n_features = len(features)

# HDF5 file that holds the local copy of the data set.
datafile_path = 'xjk_pytable.h5'

In [6]:
# Make sure the HDF5 store exists and contains a /train/gaps table with one
# Float64 column per entry of `fields`. An existing table is left untouched.
try:
  fileh = open_file(datafile_path, mode = 'r')
except IOError:
  # On a first run the store does not exist yet, so there is nothing to inspect.
  gaps_table_exists = False
else:
  try:
    gaps_table_exists = '/train/gaps' in fileh
  finally:
    fileh.close()

if gaps_table_exists:
  print("file created, pass...")
else:
  # Mode 'w' starts from an empty file, discarding any previous content.
  fileh = open_file(datafile_path, mode = 'w')
  try:
    train = fileh.create_group('/', 'train', 'Training tables')
    tabledef = {}
    for position, field in enumerate(fields):
      # `pos` pins the column order of the table to the order of `fields`;
      # a dict description carries no ordering of its own.
      tabledef[field] = Float64Col(pos = position)
    gaps = fileh.create_table(train, 'gaps', tabledef)
    gaps.flush()
  finally:
    # Release the handle even when table creation fails half-way.
    fileh.close()

file created, pass...


# Feature Selection

In [None]:
# Show every column name on one comma-separated line.
print(', '.join(fields))

In [7]:
%%sql --module q_all

# Selects the target (`gap`) followed by the feature columns that the loading
# cell copies into the HDF5 table; only rows with gap > 0 are kept.
# `busy_time` is derived here: 1 when timeofday_slot is between 50 and 53, else 0.
# NOTE(review): `LIMIT 1` caps the result at a single row - presumably a
# debugging leftover; confirm before relying on this query for training data.
SELECT gap, day_in_week, weather_1_slots_ago, weather_2_slots_ago, weather_3_slots_ago,
  IF(timeofday_slot >= 50 AND timeofday_slot <= 53, 1, 0) AS busy_time,
  tj_level1_1_slots_ago, tj_level2_1_slots_ago, tj_level3_1_slots_ago, tj_level4_1_slots_ago,
  tj_level1_2_slots_ago, tj_level2_2_slots_ago, tj_level3_2_slots_ago, tj_level4_2_slots_ago,
  tj_level1_3_slots_ago, tj_level2_3_slots_ago, tj_level3_3_slots_ago, tj_level4_3_slots_ago,
  temperature_1_slots_ago, pm25_1_slots_ago, temperature_2_slots_ago, pm25_2_slots_ago,
  temperature_3_slots_ago, pm25_3_slots_ago, gap_1_slots_ago, sum_price_1_slots_ago,
  gap_2_slots_ago, sum_price_2_slots_ago, gap_3_slots_ago, sum_price_3_slots_ago, f1, f11, f11_1,
  f11_2, f11_3, f11_4, f11_5, f11_6, f11_7, f11_8, f13_4, f13_8, f14, f14_1, f14_10, f14_2, f14_3,
  f14_6, f14_8, f15, f15_1, f15_2, f15_3, f15_4, f15_6, f15_7, f15_8, f16, f16_1, f16_10, f16_11,
  f16_12, f16_3, f16_4, f16_6, f17, f17_2, f17_3, f17_4, f17_5, f19, f19_1, f19_2, f19_3, f19_4,
  f1_1, f1_10, f1_11, f1_2, f1_3, f1_4, f1_5, f1_6, f1_7, f1_8, f20, f20_1, f20_2, f20_4, f20_5,
  f20_6, f20_7, f20_8, f20_9, f21_1, f21_2, f22, f22_1, f22_2, f22_3, f22_4, f22_5, f23, f23_1,
  f23_2, f23_3, f23_4, f23_5, f23_6, f24, f24_1, f24_2, f24_3, f25, f25_1, f25_3, f25_7, f25_8,
  f25_9, f2_1, f2_10, f2_11, f2_12, f2_13, f2_2, f2_4, f2_5, f2_6, f2_7, f2_8, f3_1, f3_2, f3_3,
  f4, f4_1, f4_10, f4_11, f4_13, f4_14, f4_16, f4_17, f4_18, f4_2, f4_3, f4_5, f4_6, f4_7, f4_8,
  f4_9, f5, f5_1, f5_3, f5_4, f6, f6_1, f6_2, f6_4, f7, f8, f8_1, f8_2, f8_3, f8_4, f8_5
FROM [datalab-projects-1331:xjk_algo_comp.gaps]
WHERE gap > 0
LIMIT 1

In [None]:
%%timeit -n 1 -r 1

fileh = open_file(datafile_path, mode = 'r')
gaps_table = fileh.root.train.gaps

if gaps_table.nrows == 0:
  query = bq.Query(q_all)
  tableresult = query.results()

  gap = gaps_table.row
  print 'there are {} rows'.format(tableresult.length)
  for rcounter, row in enumerate(tableresult):
    for field in fields:
      gap[field] = row[field]
    gap.append()
    if rcounter % 5000 == 0:
      print 'processed {} rows'.format(rcounter)
  gaps_table.flush()
      
object = fileh.get_node('/train', 'gaps')
all_data = object.read()
all_data_original = np.copy(all_data)
fileh.close()

In [None]:
print(len(fields))
print(len(row.values()))
print(row.keys())
for key in row.keys():
  print "checking", key
  if key not in fields:
    print "does not exist"

In [None]:
# This chunk does further wrangling to dataset to produce training and test sets.

# Useful code to check NaN and Inf values. This is needed since these values would
# cause "Input contains NaN, infinity or a value too large for dtype('float32')
# errors when left unchecked.
print "Checkinf for NaN and Inf"
print "np.nan=", np.where(np.isnan(all_data_original))
print "is.inf=", np.where(np.isinf(all_data_original))
print "np.max=", np.max(abs(all_data_original))

# Impute all NaN with numbers (not sure what to replace inf yet)
all_data[np.isnan(all_data_original)] = 0
# all_data[np.isinf(all_data)] = 0

# See that NaN and Inf values replaced
print "Checkinf for NaN and Inf"
print "np.nan=", np.where(np.isnan(all_data))
print "is.inf=", np.where(np.isinf(all_data))
print "np.max=", np.max(abs(all_data))

# Split the data into train and test sets.
data_size = all_data.shape[0]
training_size = data_size * 80/100
indices = np.random.permutation(data_size)
training_idx, test_idx = indices[:training_size], indices[training_size:]
all_data_train, all_data_test = all_data[training_idx,:], all_data[test_idx,:]

data_train = all_data_train[:,1:]
targets_train = all_data_train[:,0]
data_test = all_data_test[:,1:]
targets_test = all_data_test[:,0]
data_train_original = np.copy(data_train)
data_test_original = np.copy(data_test)

# Data Exploration - Find NaN values

In [None]:
# Locate every NaN in the untouched copy of the data.
# Each row of `nullspos` holds the indices of one NaN entry, one column per axis.

nulls = np.isnan(all_data_original)
nullspos = np.argwhere(nulls)
nullspos

In [None]:
# Toy example of locating NaN positions: the result has one (row, column) pair
# per NaN, here [[0, 0], [1, 3]].
# `np.nan` is the spelling available in every NumPy version (the `np.NaN` alias
# no longer exists in NumPy 2).
x = [[np.nan, 1, 2, 3],
     [1, 2, 3, np.nan]]
xn = np.isnan(x)
xnp = np.argwhere(xn)
xnp

In [None]:
import pandas as pd
from operator import itemgetter
print "total data points:", (all_data_original.shape[0] * all_data_original.shape[1])
print "number of missing values:", nullspos.shape[0]
missing_features = itemgetter(*np.unique(nullspos[:,1]).tolist())(fields)
missing_features_table = pd.DataFrame(columns=['id', 'field', 'missing data points'])

for id, field in enumerate(fields):
  total_missing = len(np.where(nullspos[:,1]==id)[0])
  if total_missing > 0:
    missing_features_table = missing_features_table.append({
        'id': id,
        'field': field,
        'missing data points': total_missing
      }, ignore_index=True)
missing_features_table['missing data points'] = \
  missing_features_table['missing data points'].astype('int64')
missing_features_table['id'] = \
  missing_features_table['id'].astype('int64')
missing_features_table.sort_values(['missing data points', 'id'], ascending=[False, True])

In [None]:
import matplotlib
import matplotlib.pyplot as plt
# Use a 15 x 9 default figure size for every figure created from here on.
plt.rcParams['figure.figsize'] = (15.0, 9.0)
def rand_jitter(arr):
    """Return `arr` with Gaussian noise added to every element.

    The noise has a standard deviation of 0.5% of the value range
    (max - min) of `arr`, so constant input comes back unchanged.

    Args:
        arr: 1-D sequence or array of numbers.

    Returns:
        Float array of the same length as `arr`.
    """
    if len(arr) == 0:
        # An empty input has no value range; return it as an empty float array.
        return np.asarray(arr, dtype=float)
    # np.max / np.min reduce inside NumPy instead of iterating in Python.
    stdev = .005 * (np.max(arr) - np.min(arr))
    return arr + np.random.randn(len(arr)) * stdev
# One dot per missing value: x is the first index of the NaN, y its second
# index with a little jitter so overlapping dots stay distinguishable.
fig, ax = plt.subplots()
ax.scatter(nullspos[:,0], rand_jitter(nullspos[:,1]), s=0.5)
ax.set_title('Missing Data Points')
ax.set_ylabel('Feature ID')
ax.set_xlabel('Observation ID');

In [None]:
%%sql
# Counts the rows with gap > 0 whose sum_price_1_slots_ago is NaN.
SELECT COUNT(*) AS count FROM [datalab-projects-1331:xjk_algo_comp.gaps]
WHERE IS_NAN(sum_price_1_slots_ago) = true
AND gap > 0