In [1]:
#######################################################
# Script:
#    testRandomForest.py
# Usage:
#    python testRandomForest.py
# Description:
#    Test the prediction model using test data set
# Authors:
#    Jackie Chu,   cchu@salesforce.com
#    Jasmin Nakic, jnakic@salesforce.com
#######################################################

import sys
import numpy as np
from sklearn import tree
from sklearn import ensemble
from sklearn.externals import joblib

# Enable or disable debug printing
debugFlag = True

# Feature list
perfCols = ["PageTime_ms","TotalServerTime_ms","TotalBrowserTime_ms","Action_count","Api_count","Db_count","DbTime_ms","Xhr_count"]

In [2]:
def addColumns(dest, src, colNames):
    # Initialize temporary array
    tmpArr = np.empty(src.shape[0])
    cols = 0
    # Copy column content
    for name in colNames:
        if cols == 0: # first column
            tmpArr = np.copy(src[name])
            tmpArr = np.reshape(tmpArr,(-1,1))
        else:
            tmpCol = np.copy(src[name])
            tmpCol = np.reshape(tmpCol,(-1,1))
            tmpArr = np.append(tmpArr,tmpCol,1)
        cols = cols + 1
    return np.append(dest,tmpArr,1)
#end addColumns

def getPredictions(data,colList,modelName):
    # Prepare the data for the model
    X = np.zeros(data.shape[0])
    X = np.reshape(X,(-1,1))
    X = addColumns(X,data,colList)
    if debugFlag:
        print("X 0: ", X[0:5])
    Y = np.copy(data["Status"])
    if debugFlag:
        print("Y 0: ", Y[0:5])

    modelFileName = modelName+".model"
    model = joblib.load(modelFileName)

    print("MODEL: ", model)
    print("NAMES: ", data.dtype.names)
    print("FEATURE_IMPORTANCES: ", model.feature_importances_)
    print("N_FEATURES: ", model.n_features_)
    print("N_OUTPUTS: ", model.n_outputs_)
    print("OOB_DECISION_FUNCTION: ", model.oob_decision_function_)
    print("OOB_SCORE: ", model.oob_score_)

    P = model.predict(X)
    print("SCORE values: ", model.score(X,Y))
    if debugFlag:
        print("P 0-5: ", P[0:5])

    return P
#end getPredictions

In [4]:
def writeResult(output,data,p):
    result = np.array(
       np.empty(data.shape[0]),
       dtype=[
           ("Page","|U20"),
           ("PageTime_ms",int),
           ("TotalServerTime_ms",int),
           ("TotalBrowserTime_ms",int),
           ("Action_count",int),
           ("Api_count",int),
           ("Db_count",int),
           ("DbTime_ms",int),
           ("Xhr_count",int),
           ("Status","|U20"),
           ("PREDICTION","|U20")
        ]
    )
    result["PageTime_ms"]     = data["PageTime_ms"]
    result["TotalServerTime_ms"]     = data["TotalServerTime_ms"]
    result["TotalBrowserTime_ms"]     = data["TotalBrowserTime_ms"]
    result["Action_count"]     = data["Action_count"]
    result["Api_count"]    = data["Api_count"]
    result["Db_count"]    = data["Db_count"]
    result["DbTime_ms"]    = data["DbTime_ms"]
    result["Xhr_count"]    = data["Xhr_count"]
    result["Status"] = data["Status"]
    result["PREDICTION"] = p
    hdr = "PageTime_ms,TotalServerTime_ms,TotalBrowserTime_ms,Action_count,Api_count,Db_count,DbTime_ms,Xhr_count,Status,PREDICTION"
    if debugFlag:
        print(hdr)
        print("R 0-5: ", result[0:5])
    np.savetxt(output,result,fmt="%s",delimiter=",",header=hdr,comments="")
#end writeResult

In [5]:
# Start
inputFileName = "PerfRun_TestData.csv"
outputFileName = "PerfRun_TestResult.txt"
modelName = "PerfRandomForest"

# All input columns - data types are strings, float and int
testData = np.genfromtxt(
    inputFileName,
    delimiter=',',
    names=True,
    dtype=("|U20",int,int,int,int,int,int,int,int)
)
if debugFlag:
    print("testData : ", testData[0:5])

testData :  [('Success', 2198, 2041, 1709, 13, 52, 289, 502, 10)
 ('Success', 2204, 2023, 1679, 13, 50, 289, 525, 10)
 ('Invalid', 2723, 2182, 1944, 13, 50, 331, 472, 10)
 ('Success', 2402, 2178, 1716, 13, 52, 293, 511, 10)
 ('Success', 2323, 2199, 1675, 13, 54, 290, 491, 10)]


In [6]:
# Get Prediction for PerfRun_TestData.csv
P = getPredictions(testData,perfCols,modelName)

X 0:  [[   0. 2198. 2041. 1709.   13.   52.  289.  502.   10.]
 [   0. 2204. 2023. 1679.   13.   50.  289.  525.   10.]
 [   0. 2723. 2182. 1944.   13.   50.  331.  472.   10.]
 [   0. 2402. 2178. 1716.   13.   52.  293.  511.   10.]
 [   0. 2323. 2199. 1675.   13.   54.  290.  491.   10.]]
Y 0:  ['Success' 'Success' 'Invalid' 'Success' 'Success']
MODEL:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)
NAMES:  ('Status', 'PageTime_ms', 'TotalServerTime_ms', 'TotalBrowserTime_ms', 'Action_count', 'Api_count', 'Db_count', 'DbTime_ms', 'Xhr_count')
FEATURE_IMPORTANCES:  [0.         0.38268979 0.16536471 0.14094058 0.03812361 0.03793318

In [10]:
# Write result to file PerfRun_TestResult.txt
writeResult(outputFileName,testData,P)

PageTime_ms,TotalServerTime_ms,TotalBrowserTime_ms,Action_count,Api_count,Db_count,DbTime_ms,Xhr_count,Status,PREDICTION
R 0-5:  [('0.0', 2198, 2041, 1709, 13, 52, 289, 502, 10, 'Success', 'Success')
 ('4.4e-323', 2204, 2023, 1679, 13, 50, 289, 525, 10, 'Success', 'Success')
 ('0.0', 2723, 2182, 1944, 13, 50, 331, 472, 10, 'Invalid', 'Invalid')
 ('nan', 2402, 2178, 1716, 13, 52, 293, 511, 10, 'Success', 'Invalid')
 ('9.0', 2323, 2199, 1675, 13, 54, 290, 491, 10, 'Success', 'Success')]
