In [1]:
###################################################################
# Script:
#    trainHoliday.py
# Usage:
#    python trainHoliday.py <input_file> <pass1_file> <output_file>
# Description:
#    Build the prediction model based on training data
#    Pass 2: prediction based on holiday info
# Authors:
#    Jasmin Nakic, jnakic@salesforce.com
#    Samir Pilipovic, spilipovic@salesforce.com
###################################################################
import sys
import numpy as np
from sklearn import linear_model
from sklearn.externals import joblib

# Imports required for visualization (plotly)
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [2]:
# Script debugging flag: when True, the functions below print samples of
# their intermediate arrays
debugFlag = False

# Feature list for holiday hours: the holiday indicator followed by the
# 24 hour-of-day indicator columns isHour0 .. isHour23
hourHolidayCols = ["isHoliday"] + ["isHour%d" % hour for hour in range(24)]

In [3]:
# Add columns to the existing array and populate with data
def addColumns(dest, src, colNames):
    """Append the named fields of ``src`` to ``dest`` as extra columns.

    Parameters
    ----------
    dest : 2-D array
        Existing matrix; must have one row per record of ``src``.
    src : array indexable by field name
        Each ``src[name]`` is read as one column of values.
    colNames : iterable of str
        Field names to append, in the order they should appear.

    Returns
    -------
    numpy.ndarray
        A new array holding the columns of ``dest`` followed by one column
        per entry of ``colNames``. ``dest`` itself is not modified.
    """
    # Turn every requested field into an (n, 1) column up front so the
    # block of new columns is assembled in a single concatenate instead of
    # growing an array one np.append at a time.
    newCols = [np.reshape(src[name], (-1, 1)) for name in colNames]
    if not newCols:
        # Nothing to add: still return a fresh array so the caller never
        # receives an alias of dest.
        return np.copy(dest)
    return np.append(dest, np.concatenate(newCols, axis=1), 1)
#end addColumns

# Generate linear regression model
def genModel(rawData,calcData,modelName):
    """Fit the holiday-pass linear regression and persist it.

    The feature matrix X has 27 columns:
      * column 0      - all zeros (placeholder the matrix is built on)
      * column 1      - rawData["isHoliday"]
      * columns 2-25  - rawData["isHoliday"] * rawData["isHourN"], N = 0..23
      * column 26     - (1 - isHoliday) * calcData["predHourWeek"]
    The target is rawData["cnt"].

    Parameters
    ----------
    rawData : structured array providing "isHoliday", "isHour0".."isHour23"
        and "cnt".
    calcData : structured array providing "predHourWeek", row-aligned with
        rawData.
    modelName : file name passed to joblib.dump for the fitted model.

    Returns
    -------
    numpy.ndarray
        Predictions of the fitted model for the same X it was trained on.
    """
    # NOTE(review): column 0 carries no information; it is kept so the
    # coefficient vector keeps its 27 entries. Presumably any script that
    # loads the saved model builds X the same way - verify before removing.
    X = np.zeros((rawData.shape[0], 1))

    # Add columns for holidays by hour
    X = addColumns(X,rawData,hourHolidayCols)

    # Replace each raw hour indicator with its holiday interaction term.
    # hourHolidayCols[0] is "isHoliday" (column 1), so the hour columns
    # start at index 2 and follow the order of the feature list.
    isHoliday = rawData["isHoliday"]
    for col, hourName in enumerate(hourHolidayCols[1:], start=2):
        X[:, col] = isHoliday * rawData[hourName]

    # On non-holidays the model relies on the previous pass' prediction
    Xnoholiday = (1 - isHoliday) * calcData["predHourWeek"]
    Xnoholiday = np.reshape(Xnoholiday,(-1,1))
    X = np.append(X,Xnoholiday,1)

    if debugFlag:
        print("X 0: ", X[0:5])

    Y = np.copy(rawData["cnt"])
    if debugFlag:
        print("Y 0: ", Y[0:5])

    model = linear_model.LinearRegression()
    print(model.fit(X, Y))

    print("INTERCEPT: ", model.intercept_)
    print("COEFFICIENT shape: ", model.coef_.shape)
    print("COEFFICIENT values: ", model.coef_)
    print("SCORE values: ", model.score(X,Y))

    P = model.predict(X)
    if debugFlag:
        print("P 0-5: ", P[0:5])
    # NOTE(review): joblib is imported from sklearn.externals at the top of
    # the file; newer scikit-learn releases no longer ship that module -
    # confirm the import against the installed version.
    joblib.dump(model,modelName)
    return P
#end genModel

# Write predictions to the output file
def writeResult(output,rawData,calcData,p5):
    """Write the inputs, earlier predictions and holiday predictions as TSV.

    Parameters
    ----------
    output : file name or writable file object handed to np.savetxt.
    rawData : structured array providing "timeStamp", "dateFrac",
        "isHoliday", "isSunday" and "cnt".
    calcData : structured array providing "predSimple", "predTrig",
        "predHourDay" and "predHourWeek", row-aligned with rawData.
    p5 : sequence of holiday-pass predictions, one per row. Stored in an
        int field, so NumPy casts the values to int on assignment
        (fractional parts are dropped, not rounded).

    Returns
    -------
    None. The result is written to ``output`` with a header line.
    """
    # Allocate the record array directly with its final dtype. Every field
    # is overwritten below, so the zero fill is never visible; allocating
    # this way avoids casting uninitialised floats into the str/int fields.
    result = np.zeros(
        rawData.shape[0],
        dtype=[
            ("timeStamp","|U19"),
            ("dateFrac",float),
            ("isHoliday",int),
            ("isSunday",int),
            ("cnt",int),
            ("predSimple",int),
            ("predTrig",int),
            ("predHourDay",int),
            ("predHourWeek",int),
            ("predHoliday",int)
        ]
    )

    # Columns copied from the raw input
    for name in ("timeStamp", "dateFrac", "isHoliday", "isSunday", "cnt"):
        result[name] = rawData[name]
    # Predictions carried over from the earlier passes
    for name in ("predSimple", "predTrig", "predHourDay", "predHourWeek"):
        result[name] = calcData[name]
    # Prediction produced by this pass
    result["predHoliday"] = p5

    if debugFlag:
        print("R 0-5: ", result[0:5])
    hdr = "timeStamp\tdateFrac\tisHoliday\tisSunday\tcnt\tpredSimple\tpredTrig\tpredHourDay\tpredHourWeek\tpredHoliday"
    np.savetxt(output,result,fmt="%s",delimiter="\t",header=hdr,comments="")
#end writeResult

In [4]:
# Start: file names used by this pass
inputFileName = "train_data.txt"
hourlyFileName = "train_hourly.txt"
outputFileName = "train_holiday.txt"

# All input columns - data types are strings, float and int.
# The first ten columns have mixed types; the remaining 200 columns are
# all read as int.
inputData = np.genfromtxt(
    inputFileName,
    names=True,
    delimiter='\t',
    dtype=("|U19","|U10",int,float,int,float,float,int,float,float) + (int,)*200
)

# Output of the previous pass, columns:
# timeStamp dateFrac isHoliday isSunday cnt predSimple predTrig predHourDay predHourWeek
hourlyData = np.genfromtxt(
    hourlyFileName,
    names=True,
    delimiter='\t',
    dtype=("|U19",float) + (int,)*7
)

# Train the holiday model, then write its predictions next to the earlier ones
PH = genModel(inputData,hourlyData,"modelHoliday")
writeResult(outputFileName,inputData,hourlyData,PH)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
INTERCEPT:  -1644.67848789
COEFFICIENT shape:  (27,)
COEFFICIENT values:  [ -9.07443153e-08   8.22330113e+04  -6.92253329e+04  -7.44743329e+04
  -7.56523329e+04  -6.19403329e+04  -4.46243329e+04  -2.41263329e+04
   2.00796671e+04   6.31826671e+04   8.11896671e+04   5.73686671e+04
   4.65396671e+04   5.12396671e+04   4.02966671e+04   3.74696671e+04
   4.93636671e+04   5.07696671e+04   6.00396671e+04   4.11816671e+04
   2.70976671e+04   7.65667138e+02  -2.50443329e+04  -4.10523329e+04
  -5.90653329e+04  -6.91463329e+04   1.02395474e+00]
SCORE values:  0.952022555876


In [5]:
# Read back the file written by writeResult, giving every column its
# type explicitly: timestamp string, float dateFrac, then eight int columns
results = np.genfromtxt(
    outputFileName,
    names=True,
    delimiter='\t',
    dtype=("|U19",float) + (int,)*8
)

In [6]:
# Examine result data
# Sanity check of the reloaded predictions: number of rows, number of
# named fields, and a small sample of records.
print("Shape:", results.shape)
print("Columns:", len(results.dtype.names))
# NOTE(review): the slice starts at index 1, so the first record is not
# shown - confirm this is intended.
print(results[1:5])

Shape: (1225,)
Columns: 10
[ ('2016-05-08 19:00:00', 736092.791667, 0, 1, 69991, 192424, 66256, -54979, 61128, 60947)
 ('2016-05-08 20:00:00', 736092.833333, 0, 1, 50998, 192446, 10508, -90206, 40313, 39634)
 ('2016-05-08 21:00:00', 736092.875, 0, 1, 38152, 192467, -36902, -108893, 25947, 24923)
 ('2016-05-08 22:00:00', 736092.916667, 0, 1, 23062, 192489, -72744, -125759, 11668, 10302)]


In [9]:
# Generate chart with predictions based on training data (using plotly)
print("Plotly version", __version__) # requires plotly version >= 1.9.0
init_notebook_mode(connected=True)

# Prediction of the previous pass (hour-of-week model)
set1 = go.Bar(
    x=results["dateFrac"],
    y=results["predHourWeek"],
#    marker=dict(color='blue'),
    name='HourWeek'
)
# Prediction of this pass (holiday model), drawn semi-transparent
set2 = go.Bar(
    x=results["dateFrac"],
    y=results["predHoliday"],
#    marker=dict(color='crimson'),
    opacity=0.6,
    name='Holiday'
)
barData = [set1, set2]
# Both series are predictions (the actual "cnt" column is not plotted), so
# the title names the two models being compared.
barLayout = go.Layout(barmode='group', title="Training Predictions: HourWeek vs. Holiday")

fig = go.Figure(data=barData, layout=barLayout)
iplot(fig)

Plotly version 2.0.8
