In [3]:
'''
Notebook to Calculate Baseline using a simple average predictor.
'''
import numpy as np
import pandas as pd
import pickle

In [72]:
'''
Utility functions.
'''
# Mapping for data matrix columns.
columns = { 'x' : 0,
            'y' : 1,
            'region' : 2,
            't' : 3, 
            'count' : 4 } 

# Author: Alex Wang -- Sets NaN to average.
def normalize_features(X_train):
    mean_X_train = np.nanmean(X_train, 0)
    for i in xrange(np.shape(X_train)[1]):
        col = X_train[:,i]
        col[ np.isnan(col) ] = mean_X_train[i]
    std_X_train = np.std(X_train, 0)
    std_X_train[ std_X_train == 0 ] = 1
    X_train_normalized = (X_train - mean_X_train) / std_X_train
    return X_train_normalized

def rmse(predict, true):
    # Returns the root mean squared error.
    return np.sqrt(1.0/np.shape(predict)[0] * np.sum(np.square(predict - true)))

def randomSplit(X, split_size):
    # Randomly splits the data.
    np.random.shuffle(X)
    break_pt = int(split_size * np.shape(X)[0])
    return X[:break_pt,:], X[break_pt:,:]

def splitLastN(X, t):
    # Splits the X data matrix into historical data and data for the 
    # last t time steps.
    times = np.unique(X[:, columns['t']])
    lowBound = np.sort(times)[len(times) - t]
    selected = X[:, columns['t']] <= lowBound
    return X[selected,:], X[~selected,:]

def buckets(series, n):
    # Takes a series and returns an array mapping each element to
    # one of n buckets.
    mi, ma = series.min(), series.max()
    buckets = np.linspace(mi, ma, n + 1)
    
    res = np.zeros(len(series))
    array = series.values
    if np.isnan(array).any():
        print "Error! NaN values found in series!"
    for i in xrange(n):
        res[(buckets[i] <= array) & (array < buckets[i+1])] = i
    return res.astype(int)

def createSimplePartitions(data, n):
    # Returns a partitioned version of data into nxn regions!
    data['xRegion'] = buckets(data.Latitude, n).astype(int)
    data['yRegion'] = buckets(data.Longitude, n).astype(int)
    data['Region'] = n * data.xRegion + data.yRegion

    return data

def extractDataMatrix(data, n):
    # Creates a NxD data matrix from the given data set.
    # data must contains xRegion, yRegion, Region, and TimeFeature columns.
    # 0 -> xRegion
    # 1 -> yRegion
    # 2 -> Region
    # 3 -> Month
    # 4 -> Count
    # The data is NOT normalized!
    # Returns the data as well as a dictionary mapping column names
    # to indeces.
    partData = createSimplePartitions(data, n)
    regions = partData.Region.unique()
    months = partData.TimeFeature.unique()
    num_columns = 5
    num_rows = len(regions) * len(months)
    X_data = np.zeros((num_rows, num_columns))
    el = 0
    for region in regions:
        for month in months:
            tmp = data[ (data.Region == region) &
                        (data.TimeFeature == month)]
            # print tmp
            count = len(tmp)
            if count > 0:
                X_data[el, :] = np.array([tmp.xRegion.iloc[0],
                                            tmp.yRegion.iloc[0],
                                            region, month, count])
                el += 1
            
    if el < X_data.shape[0]:
        print "Removing empty values from our data!"
        print "Rows before: {}".format(X_data.shape[0])
        X_data = X_data[~np.all(X_data == 0, axis=1)]
        print "Rows after: {}".format(X_data.shape[0])
        
    return X_data.astype(int)

In [80]:
'''
More utility functions. 
'''
# Note: If a data point does not exist, it is assumed to be 0.
def averagePredictions(X_train):
    # Returns an array indexed by region with the average over the 
    # training set for each region.
    tMax = X_train[:, columns['t']].max()
    nRegions = X_train[:, columns['region']].max() + 1
    
    averages = np.zeros(nRegions)
    for region in xrange(nRegions):
        averages[region] = X_train[
            X_train[:, columns['region']] == region,
            columns['count']].sum() / float(tMax)
    return averages

def createHeatMap(X):
    '''
    Given a data set, creates a heatmap of it based on x,y coordinates.
    Ignore the temporal feature. You should subset the data before passing
    it into this function if you'd like a heatmap for a specific time period.
    '''
    n = X[:, columns['x']].max()
    m = X[:, columns['y']].max()
    heatmap = np.zeros((n,m))
    for i in xrange(n):
        for j in xrange(m):
            total = X[:, (X[:, columns['x']] == i) & 
                         (X[:, columns['y']] == j)].sum()
            if total > 0:
                heatmap[i,j] = total
                
    return heatmap

In [81]:
# Given a value of n:
# 0. Normalize the data (if set to True)
# 1. Partition the data
# 2. Split into Train/Test, where test has lastN time feats.
#    Options: 'random', 'last'
#    splitRatio specifies the ratio of results to keep for testing.
#    testPeriods specifies the number of time periods to test
# 3. Train the averages
# 4. Test on the hold-out
# 5. Calculate RMSE
def averageModel(n, X_data, normalize = False, splitMethod = 'random', splitRatio = 0.1, testPeriods = 12):
    if normalize:
        X_data = normalize_features(X_data)
        print "Normalized data features!"
        sys.stdout.flush()
    if splitMethod == 'random':
        X_train, X_test = randomSplit(X_data, splitRatio)
    elif splitMethod == 'last':
        X_train, X_test = splitLastN(X_data, testPeriods)
    else:
        raise Exception("splitMethod {} unsupported".format(splitMethod))
    
    print "Training model..."
    sys.stdout.flush()
    # Now use training data to calculate averages
    model = averagePredictions(X_train)
    print model
    
    # Generate predictions vector
    predict = model[X_test[:, columns['region']]]
    true = X_test[:, columns['count']]
    
    print "Calculating RMSE..."
    sys.stdout.flush()
    
    return rmse(predict, true)

In [76]:
# Let's make a plot for some values of N to see if the data works out...
sfdata_file = '../../cs281_data/large_data/sfclean.pk'
with open(sfdata_file) as fp:
    sfdata = pickle.load(fp)
    # For sfdata, need to remove outliers
    sfdata = sfdata[-120 > sfdata.Longitude][sfdata.Longitude > (-130)]
    sfdata = sfdata[sfdata.Latitude > 37][sfdata.Latitude < 40]

In [None]:
import sys
testN = range(2,10) + range(10,100,10)
rmses = []
for n in testN:
    print "n = {}".format(n)
    X_data = extractDataMatrix(sfdata, n)
    # print X_data.dtype
    print "Partitioned data..."
    sys.stdout.flush()
    rmse_random = averageModel(n, X_data)
    print "Random RMSE: {}".format(rmse_random)
    sys.stdout.flush()
    rmse_last = averageModel(n, X_data, splitMethod='last')
    rmses.append((rmse_random, rmse_last))
    print "Last RMSE: {}".format(rmse_last)
    sys.stdout.flush()

n = 2
Partitioned data...
Training model...
[ 115.30519481  259.73376623  119.20779221  771.88311688]
Calculating RMSE...
Random RMSE: 3251.36380185
Training model...
[ 1246.30769231  3133.09090909  1070.82517483  6395.87412587]
Calculating RMSE...
Last RMSE: 515.774742515
n = 3
Partitioned data...
Training model...
[  39.70198675   89.94039735  100.84768212   85.28476821  358.05298013
  196.98675497    6.74172185   67.94701987  171.62251656]
Calculating RMSE...
Random RMSE: 1447.17106439
Training model...
[  387.71328671  1062.27272727  1052.53846154   647.58741259  2868.0979021
  2259.36363636    46.18881119  1507.67132867  2014.66433566]
Calculating RMSE...
Last RMSE: 215.41729545
n = 4
Partitioned data...
Training model...
[   8.9025974    64.07142857   39.81168831   63.           40.35064935
   32.03896104  164.33116883   33.80519481   13.44155844  100.99350649
  407.71428571   62.63636364    0.            8.12987013   84.61038961
   17.85064935]
Calculating RMSE...
Random RMSE: 1