In [2]:
'''
Notebook to Calculate Baseline using a simple average predictor.
'''
import numpy as np
import pandas as pd
import pickle

In [4]:
'''
Utility functions.
'''
# Author: Alex Wang -- Sets NaN to average.
def normalize_features(X_train):
    '''
    Replace NaNs with their column mean, then standardize each column.

    X_train must be a 2-D numpy array; all statistics are taken along
    axis 0 (one value per column).

    NOTE: the NaN entries of X_train are overwritten IN PLACE with the
    column means, so the caller's array is modified. The standardized
    result is returned as a new array.

    Columns whose standard deviation is 0 are only centered (divided by 1)
    to avoid a division by zero. A column made up entirely of NaNs has a
    NaN mean and therefore stays NaN.
    '''
    # Means ignore NaNs so that missing values do not poison the average.
    mean_X_train = np.nanmean(X_train, 0)

    # Fill every missing entry with the mean of the column it sits in.
    # Done in one vectorized assignment instead of a per-column loop.
    nan_rows, nan_cols = np.nonzero(np.isnan(X_train))
    X_train[nan_rows, nan_cols] = mean_X_train[nan_cols]

    # The standard deviation is computed after the fill, on complete data.
    std_X_train = np.std(X_train, 0)
    std_X_train[std_X_train == 0] = 1

    return (X_train - mean_X_train) / std_X_train

def rmse(predict, true):
    '''
    Root mean squared error between predictions and true values.

    The squared differences are summed over every element and divided by
    the length of the first axis of predict before taking the square root.
    '''
    residuals = predict - true
    n_rows = np.shape(predict)[0]
    squared_error_sum = np.sum(np.square(residuals))
    return np.sqrt(1.0 / n_rows * squared_error_sum)

def randomSplit(X, split_size):
    '''
    Randomly split the rows of a 2-D array into two parts.

    X          -- 2-D numpy array. Its rows are shuffled IN PLACE, so the
                  caller's array is reordered.
    split_size -- multiplied by the row count to get the size of the first
                  part (e.g. 0.8 puts the first 80% of the shuffled rows in
                  the first part); the product is truncated to an integer.

    Returns (first_part, second_part), both slices of the shuffled X.
    '''
    np.random.shuffle(X)
    # Slice bounds must be integers; a fractional split_size makes the
    # product a float, so truncate it explicitly.
    break_pt = int(split_size * np.shape(X)[0])
    return X[:break_pt, :], X[break_pt:, :]

def buckets(series, n):
    '''
    Map each element of a series to one of n equal-width buckets.

    The range [series.min(), series.max()] is cut into n intervals of
    equal width. Bucket i (1-based) covers [edge[i-1], edge[i]); the last
    bucket is closed on the right so that the maximum value lands in
    bucket n instead of being left unassigned.

    Returns a float numpy array of length len(series) holding the bucket
    number of every element. NaN elements fail every comparison and keep
    the value 0; a message is printed when any NaN is present.
    '''
    lowest, highest = series.min(), series.max()
    edges = np.linspace(lowest, highest, n + 1)

    res = np.zeros(len(series))
    array = series.values
    if np.isnan(array).any():
        print("Error! NaN values found in series!")

    for i in range(n):
        in_bucket = (edges[i] <= array) & (array < edges[i + 1])
        res[in_bucket] = i + 1

    if n > 0:
        # The half-open intervals above never match the maximum itself,
        # so assign it to the last bucket explicitly.
        res[array == highest] = n
    return res

def createSimplePartitions(data, n):
    '''
    Tag every row of data with the cell of an n x n grid it falls into.

    Adds three columns to data (the frame is modified in place and also
    returned):
      xRegion -- bucket of the Latitude column, as returned by buckets()
      yRegion -- bucket of the Longitude column, offset by 1
      Region  -- single region id, n * yRegion + xRegion
    '''
    data['xRegion'] = buckets(data['Latitude'], n)

    longitude_bucket = buckets(data['Longitude'], n)
    data['yRegion'] = longitude_bucket + 1

    # Combine both bucket numbers into a single region identifier.
    region_id = n * data['yRegion'] + data['xRegion']
    data['Region'] = region_id
    return data

def extractDataMatrix(data, n):
    # Creates a NxD data matrix from the given data set.
    # data must contain xRegion, yRegion, Region, and TimeFeature columns.
    # 1 -> xRegion
    # 2 -> yRegion
    # 3 -> Region
    # 4 -> Month
    # 5 -> Count
    # The data is NOT normalized!
    partData = createSimplePartitions(data, n)
    regions = partData.Regions.unique()
    months = partData.TimeFeature.unique()
    num_columns = 5
    X_data = np.zeros((regions * months, num_columns))
    el = 0
    for region in regions:
        for month in months:
            count = len(data[ (data.Region == region) &
                              (data.TimeFeature == month)])
            if count > 0:
                np.zeros[el, :] = np.array([data.xRegion[0],
                                            data.yRegion[0],
                                            region, month, count])
            
    if el < X_data.shape[0]:
        print "Removing empty values from our data!"
        print "Rows before: {}".format(X_data.shape[0])
        X_data = X_data[~np.all(X_data == 0, axis=1)]
        print "Rows after: {}".format(X_data.shape[0])