In [4]:
import pandas as pd
import numpy as np
import datetime
from itertools import product

**Must haves for this notebook to work**
* yTrain and yTest files
* columns labeled EquipmentID, EventTimeStamp, target, prediction for both files
* target should be a column of 1's and 0's
* prediction should be a numerical value between 0 and 1

*Once the above is done, move your training file and test file into the `data` folder that sits one level above the folder this notebook is in (the cells below read from `../data/`).*

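*Finally, replace the quoted file name after `file_name =` (the 'YOUR_X_FILE_NAME_HERE' placeholder, shown filled in below as 'hg_yTrain' and 'hg_yTest') with your training file name in the first read-in cell and your test file name in the second. Keep the '' around your file name and leave off the `.csv` extension, which the code adds for you.*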

*Once done you can run all cells.*

**Training read in**

In [5]:
# Training predictions; the file is expected to carry EquipmentID, EventTimeStamp, target and prediction columns.
trainFile = 'hg_yTrain'
yTrain = pd.read_csv('../data/{file_name}.csv'.format(file_name = trainFile))
yTrain

Unnamed: 0,EquipmentID,EventTimeStamp,target,prediction
0,1328,2015-02-23 10:24:18,0.0,0.167260
1,1328,2015-02-25 07:24:50,0.0,0.167260
2,1328,2015-02-26 20:14:29,0.0,0.167260
3,1328,2015-02-26 23:56:07,0.0,0.031281
4,1328,2015-02-27 04:10:23,0.0,0.005586
...,...,...,...,...
427536,309,2019-09-25 07:25:07,0.0,0.000083
427537,309,2019-09-25 09:10:22,0.0,0.000076
427538,309,2019-09-25 09:10:22,0.0,0.000057
427539,309,2020-02-03 07:54:13,0.0,0.029828


**Test read in**

In [6]:
# Test predictions; same column layout as the training file.
testFile = 'hg_yTest'
yTest = pd.read_csv('../data/{file_name}.csv'.format(file_name = testFile))
yTest

Unnamed: 0,EquipmentID,EventTimeStamp,target,prediction
0,1327,2015-05-04 15:38:35,0.0,0.020931
1,1327,2015-05-07 06:52:14,0.0,0.013098
2,1327,2015-05-19 12:02:55,0.0,0.472384
3,1327,2015-05-26 08:11:45,0.0,0.472384
4,1327,2015-05-26 08:44:10,0.0,0.824266
...,...,...,...,...
118874,310,2018-11-21 20:57:59,0.0,0.043526
118875,310,2019-10-09 23:32:35,0.0,0.043526
118876,310,2019-11-12 00:16:41,0.0,0.054404
118877,310,2019-11-12 00:16:41,0.0,0.023456


*Reading in the derate data, which is used to build the list of trucks that have had a derate. This file (`derates.csv`) should already be in the data folder.*

In [7]:
# Load the derate events, then discard rows that are exact duplicates of an earlier row.
deratesRaw = pd.read_csv('../data/derates.csv')
derates = deratesRaw.drop_duplicates()

*Making sure every timestamp column is stored as a datetime.*

In [8]:
# Parse the EventTimeStamp column of all three tables so timestamps can be compared and subtracted.
for frame in (derates, yTrain, yTest):
    frame['EventTimeStamp'] = pd.to_datetime(frame['EventTimeStamp'])

In [9]:
# Keep only the derates that start a new episode for their truck: the truck's first derate,
# plus every derate that comes at least 7 days after that truck's previous derate.
# The gap is measured on a chronologically sorted view, so the outcome does not depend on the
# row order of the CSV (an unsorted file would otherwise yield negative gaps and drop real derates).
gapSincePrevious = (
    derates
    .sort_values('EventTimeStamp')
    .groupby('EquipmentID')['EventTimeStamp']
    .diff()
)
# A missing gap means "no earlier derate for this truck", which always counts as a new episode.
startsNewEpisode = gapSincePrevious.isna() | (gapSincePrevious >= pd.Timedelta(days = 7))
# .loc aligns the boolean mask on the index, so derates keeps its original row order.
derates = derates.loc[startsNewEpisode]

In [10]:
# (rows, columns) of the derate table that remains after the 7-day spacing filter.
derates.shape

(296, 2)

*The nasty function. Will try to explain what happens in comments if you are interested*

In [11]:
def _derate_times_by_truck(frame, derates):
    """Map every truck that appears in `frame` to the Series of its derate timestamps."""
    seen = derates['EquipmentID'].isin(frame['EquipmentID'].unique())
    return {
        truck: times
        for truck, times in derates.loc[seen].groupby('EquipmentID', sort = False)['EventTimeStamp']
    }


def _rows_away_from_derates(frame, derateTimes, margin):
    """Return the rows of `frame` that lie outside [derate - margin, derate + margin] for every derate of their truck."""
    nearDerate = np.zeros(len(frame), dtype = bool)
    for truck, times in derateTimes.items():
        # Positions of this truck's rows, so the time comparison only touches its own data.
        truckPositions = np.flatnonzero((frame['EquipmentID'] == truck).to_numpy())
        truckStamps = frame['EventTimeStamp'].iloc[truckPositions]
        for when in times:
            inWindow = (truckStamps >= when - margin) & (truckStamps <= when + margin)
            nearDerate[truckPositions[inWindow.to_numpy()]] = True
    return frame.loc[~nearDerate]


def _peak_prediction_before_derates(frame, derateTimes, hoursBefore, lookbackHours = 24):
    """Return one value per derate: the truck's highest prediction between `lookbackHours` and
    `hoursBefore` hours before the derate (NaN when the truck has no rows in that window)."""
    windowStart = datetime.timedelta(hours = lookbackHours)
    # float() lets numpy integers (e.g. a value read back from a DataFrame) through datetime.timedelta.
    windowEnd = datetime.timedelta(hours = float(hoursBefore))
    peaks = []
    for truck, times in derateTimes.items():
        truckRows = frame.loc[frame['EquipmentID'] == truck, ['EventTimeStamp', 'prediction']]
        for when in times:
            inWindow = (
                (truckRows['EventTimeStamp'] >= when - windowStart)
                & (truckRows['EventTimeStamp'] <= when - windowEnd)
            )
            peaks.append(truckRows.loc[inWindow, 'prediction'].max())
    return pd.Series(peaks, dtype = float)


def _count_false_alarms(quietRows, threshold):
    """Count alert episodes in rows that are not near any derate: an alerting row opens a new episode
    when it is the truck's first alert or comes more than one day after the truck's previous alert."""
    alerts = (
        quietRows
        .loc[quietRows['prediction'] >= threshold]
        # Explicit chronological order per truck; the gaps below are meaningless otherwise.
        .sort_values(['EquipmentID', 'EventTimeStamp'])
    )
    gaps = alerts.groupby('EquipmentID')['EventTimeStamp'].diff()
    return (gaps.isna() | (gaps > pd.Timedelta(days = 1))).sum()


def model_eval(
    yTrain,
    yTest,
    derates = derates,
    thresholds = (.99, .95, .9, .85, .8, .75, .7, .6, .5, .4, .3),
    hoursPre = (5, 4, 3, 2, 1),
    truePosValue = 4000,
    falsePosCost = 500,
):
    """Grid-search an alert threshold on the training predictions and score the best one on the test predictions.

    An alert fires on a row whose `prediction` is >= the threshold (the same rule is used for
    true and false positives).

    * True positive: a derate whose truck fired at least one alert between 24 hours and
      `hoursPre` hours before the derate. Every other derate is a false negative.
    * False positive: counted only on rows more than 7 days away from every derate of their
      truck. Alerts on those rows are grouped per truck, and each alert that is the truck's first
      or follows the previous one by more than 1 day counts once.
    * True negative: number of rows away from derates minus the false positives.
    * netP_L: truePos * truePosValue - falsePos * falsePosCost.

    Parameters
    ----------
    yTrain, yTest : DataFrame
        Need the columns EquipmentID, EventTimeStamp (datetime) and prediction.
    derates : DataFrame
        Derate events with the columns EquipmentID and EventTimeStamp (datetime).
    thresholds : iterable of float
        Alert thresholds to try on the training data.
    hoursPre : iterable of number
        Candidate values for how many hours before a derate an alert must have fired by.
    truePosValue, falsePosCost : number
        Weights used for netP_L.

    Returns
    -------
    (confusionMatrix, confusionMatrixTest)
        confusionMatrix has one row per (threshold, hoursPre) pair evaluated on yTrain, sorted by
        netP_L descending. confusionMatrixTest has a single row: the top training combination
        evaluated on yTest (its threshold is stored in the 'var' column).

    Raises
    ------
    ValueError
        If `thresholds` or `hoursPre` is empty.
    """
    thresholds = list(thresholds)
    hoursPre = list(hoursPre)
    if not thresholds or not hoursPre:
        raise ValueError('thresholds and hoursPre must each contain at least one value')

    # Rows within a week either side of a derate are excluded from false-positive counting.
    derateBuffer = datetime.timedelta(days = 7)

    ######################################################################################################################
    # Training data: full grid of thresholds x hours-before-derate.

    trainDerateTimes = _derate_times_by_truck(yTrain, derates)
    yTrainNoDerate = _rows_away_from_derates(yTrain, trainDerateTimes, derateBuffer)

    # The window maximum depends only on hoursPre and the false-alarm count only on the threshold,
    # so each is computed once instead of once per grid cell.
    peaksByHours = {
        hours: _peak_prediction_before_derates(yTrain, trainDerateTimes, hours)
        for hours in hoursPre
    }
    falseAlarmsByThreshold = {
        threshold: _count_false_alarms(yTrainNoDerate, threshold)
        for threshold in thresholds
    }

    records = []
    for threshold, hours in product(thresholds, hoursPre):
        peaks = peaksByHours[hours]
        # NaN peaks (no rows in the window) compare as False and therefore count as misses.
        caught = int(peaks.ge(threshold).sum())
        falsePos = falseAlarmsByThreshold[threshold]
        records.append({
            'threshold' : threshold,
            'hoursPre' : hours,
            'truePos' : caught,
            'falseNeg' : len(peaks) - caught,
            'falsePos' : falsePos,
            'trueNeg' : len(yTrainNoDerate) - falsePos
        })

    confusionMatrix = pd.DataFrame(records)
    confusionMatrix['netP_L'] = confusionMatrix['truePos'] * truePosValue - confusionMatrix['falsePos'] * falsePosCost
    # Best combination first.
    confusionMatrix = confusionMatrix.sort_values('netP_L', ascending = False).reset_index(drop = True)

    ######################################################################################################################
    # Test data: only the best training combination is evaluated.

    bestThreshold = confusionMatrix['threshold'].iloc[0]
    bestHours = confusionMatrix['hoursPre'].iloc[0]

    testDerateTimes = _derate_times_by_truck(yTest, derates)
    yTestNoDerate = _rows_away_from_derates(yTest, testDerateTimes, derateBuffer)

    testPeaks = _peak_prediction_before_derates(yTest, testDerateTimes, bestHours)
    testCaught = int(testPeaks.ge(bestThreshold).sum())
    testFalsePos = _count_false_alarms(yTestNoDerate, bestThreshold)

    confusionMatrixTest = pd.DataFrame([{
        'var' : bestThreshold,
        'hoursPre' : bestHours,
        'truePos' : testCaught,
        'falseNeg' : len(testPeaks) - testCaught,
        'falsePos' : testFalsePos,
        'trueNeg' : len(yTestNoDerate) - testFalsePos
    }])
    confusionMatrixTest['netP_L'] = confusionMatrixTest['truePos'] * truePosValue - confusionMatrixTest['falsePos'] * falsePosCost

    return confusionMatrix, confusionMatrixTest

In [12]:
# model_eval returns (training grid, test result); keep the training grid here.
trainRunResult = model_eval(yTrain, yTest)
confusionMatrixTrain = trainRunResult[0]

confusionMatrixTrain

Unnamed: 0,threshold,hoursPre,truePos,falseNeg,falsePos,trueNeg,netP_L
0,0.95,1,116,117,563,420320,182500
1,0.95,2,112,121,563,420320,166500
2,0.95,3,107,126,563,420320,146500
3,0.99,1,52,181,171,420712,122500
4,0.9,1,146,87,926,419957,121000
5,0.99,2,49,184,171,420712,110500
6,0.95,4,95,138,563,420320,98500
7,0.9,2,139,94,926,419957,93000
8,0.99,3,38,195,171,420712,66500
9,0.9,3,130,103,926,419957,57000


In [13]:
# Second element of model_eval's result: the best training combination scored on the test data.
testRunResult = model_eval(yTrain, yTest)
confusionMatrixTest = testRunResult[1]

confusionMatrixTest

Unnamed: 0,var,hoursPre,truePos,falseNeg,falsePos,trueNeg,netP_L
0,0.95,1,26,37,140,117390,34000
