In [3]:
import pandas as pd
import numpy as np
import datetime

In [4]:
# Load the train/test prediction tables and the derate event log.
# Each prediction file is read with pandas' default CSV settings.
yTrain, yTest = (
    pd.read_csv(csvPath)
    for csvPath in ('../data/hg_yTrain.csv', '../data/hg_yTest.csv')
)
# Exact duplicate rows in the derate log are dropped right after loading.
derates = pd.read_csv('../data/derates.csv').drop_duplicates()

In [5]:
# Bare last expression: renders the training prediction frame as the cell output.
yTrain

Unnamed: 0,EquipmentID,EventTimeStamp,target,prediction
0,1328,2015-02-23 10:24:18,0.0,0.167260
1,1328,2015-02-25 07:24:50,0.0,0.167260
2,1328,2015-02-26 20:14:29,0.0,0.167260
3,1328,2015-02-26 23:56:07,0.0,0.031281
4,1328,2015-02-27 04:10:23,0.0,0.005586
...,...,...,...,...
427536,309,2019-09-25 07:25:07,0.0,0.000083
427537,309,2019-09-25 09:10:22,0.0,0.000076
427538,309,2019-09-25 09:10:22,0.0,0.000057
427539,309,2020-02-03 07:54:13,0.0,0.029828


**Creating lists of trucks for certain cases**

In [6]:
# Distinct truck IDs seen in the training and test prediction tables.
trainTrucks = yTrain['EquipmentID'].drop_duplicates().to_list()
testTrucks = yTest['EquipmentID'].drop_duplicates().to_list()

# Plain concatenation of the two lists.
# NOTE(review): a truck present in both train and test appears twice here
# (and therefore twice in noDerateTrucks) -- confirm whether that is intended.
trucks = (trainTrucks + testTrucks)

# Distinct truck IDs that have at least one row in the derate log.
derateTrucks = derates['EquipmentID'].drop_duplicates().to_list()

# Trucks with no derate on record. Membership is tested against a set so each
# lookup is constant-time instead of a linear scan of the derateTrucks list.
derateTruckIds = set(derateTrucks)
noDerateTrucks = [i for i in trucks if i not in derateTruckIds]

**Turning timestamp columns into datetimes**

In [7]:
# Parse the EventTimeStamp column of every table into pandas datetimes so the
# later window comparisons and diffs operate on timestamps rather than strings.
for frame in (derates, yTrain, yTest):
    frame['EventTimeStamp'] = pd.to_datetime(frame['EventTimeStamp'])

In [8]:
# Keep, per truck, only the derates that start a new episode: the truck's first
# derate, plus any derate occurring at least 7 days after the previous one.
#
# The gap to the previous derate is only meaningful in chronological order, so
# sort explicitly instead of relying on the row order of the CSV.
derates = derates.sort_values(['EquipmentID', 'EventTimeStamp'])

# Diff only the timestamp column; diffing every column of the frame and then
# comparing to a duration breaks as soon as a non-datetime column is present.
gapSincePrevious = derates.groupby('EquipmentID')['EventTimeStamp'].diff()

# A missing gap means there is no earlier derate for that truck -> always kept.
startsNewEpisode = gapSincePrevious.isna() | (gapSincePrevious >= pd.to_timedelta(7, 'd'))
derates = derates.loc[startsNewEpisode]

**Removing the rows within 7 days of a derate from the dataset**

In [9]:
# Trucks that have a derate on record and also appear in the training
# predictions. The training IDs are put in a set once; the original rebuilt and
# linearly scanned the full EquipmentID list for every derate truck.
trainTruckIds = set(yTrain['EquipmentID'].to_list())
trainDeratesTrucks = [x for x in derateTrucks if x in trainTruckIds]

# Half-width of the exclusion window placed around every derate (7 days).
derateWindow = datetime.timedelta(hours = 24 * 7)

# One frame per (truck, derate): the training rows of that truck whose
# timestamp lies within the window around the derate (bounds inclusive).
# Overlapping windows can put the same row in more than one frame.
removal = []
for truck in trainDeratesTrucks:
    # Both per-truck selections are independent of the derate date, so they
    # are done once per truck rather than once per date.
    truckRows = yTrain.loc[yTrain['EquipmentID'] == truck]
    dates = derates.loc[derates['EquipmentID'] == truck, 'EventTimeStamp']

    for date in dates:
        pre = date - derateWindow
        post = date + derateWindow

        times = truckRows.loc[
            (truckRows['EventTimeStamp'] >= pre) &
            (truckRows['EventTimeStamp'] <= post)
        ].reset_index(drop = True)

        removal.append(times)

In [10]:
# Stack the per-derate window frames collected in `removal` and outer-merge them
# onto the full training table. With indicator=True pandas adds a `_merge`
# column; rows tagged 'left_only' exist in yTrain but in none of the windows.
# Note: `removal` is rebound from a list of frames to the merged DataFrame here.
windowRows = pd.concat(removal)
removal = pd.merge(
    yTrain,
    windowRows,
    how = 'outer',
    on = ['EquipmentID', 'EventTimeStamp', 'target', 'prediction'],
    indicator = True
)

# Training rows that fall outside every derate window.
outsideWindows = removal['_merge'] == 'left_only'
yTrainNoDerate = removal.loc[outsideWindows]

**Evaluating the true confusion matrix for different prediction thresholds**

In [11]:
# Trucks that have a derate on record and also appear in the training
# predictions (set lookup instead of re-scanning the full ID list per truck).
trainTruckIds = set(yTrain['EquipmentID'].to_list())
trainDeratesTrucks = [x for x in derateTrucks if x in trainTruckIds]

# Prediction thresholds to evaluate.
varience = [.99, .95, .9, .85, .8, .75, .7, .6, .5, .4, .3]

# For every derate, the highest prediction made for that truck between 24 hours
# and 1 hour before the derate (bounds inclusive). This does not depend on the
# threshold, so it is computed once instead of once per threshold.
# A derate with no rows in its window yields NaN, which never reaches any
# threshold and is therefore counted as a false negative below.
maxBeforeDerate = []
for truck in trainDeratesTrucks:
    truckRows = yTrain.loc[yTrain['EquipmentID'] == truck]
    dates = derates.loc[derates['EquipmentID'] == truck, 'EventTimeStamp']

    for date in dates:
        pre = date - datetime.timedelta(hours = 24)
        post = date - datetime.timedelta(hours = 1)

        times = truckRows.loc[
            (truckRows['EventTimeStamp'] >= pre) &
            (truckRows['EventTimeStamp'] <= post)
        ]

        maxBeforeDerate.append(times['prediction'].max())

# One record per threshold; turned into a DataFrame by the next cell.
df = []
for var in varience:
    # 1 if the derate was flagged in its lead-up window at this threshold, else 0.
    truePos = [int(peak >= var) for peak in maxBeforeDerate]

    # False alerts: rows outside every derate window whose prediction reaches the
    # threshold. `>=` matches the true-positive rule above (the original used `>`
    # here only). Alerts of the same truck are collapsed into episodes: an alert
    # counts when it is the truck's first or comes more than 1 day after the
    # previous one. Sorting makes that gap chronological regardless of how
    # yTrainNoDerate happens to be ordered.
    alerts = yTrainNoDerate.loc[yTrainNoDerate['prediction'] >= var]
    alertGaps = (
        alerts
        .sort_values('EventTimeStamp')
        .groupby('EquipmentID')['EventTimeStamp']
        .diff()
    )
    falsePos = (alertGaps.isna() | (alertGaps > pd.to_timedelta(1, 'd'))).sum()

    df.append({
        'var' : var,
        'truePos' : sum(truePos),
        'falseNeg' : len(truePos) - sum(truePos),
        'falsePos' : falsePos,
        # NOTE(review): rows minus alert episodes -- the two counts use different
        # units, so treat trueNeg as approximate; confirm the intended definition.
        'trueNeg' : len(yTrainNoDerate) - falsePos
    })

In [12]:
# Tabulate the per-threshold records and attach a net profit/loss column:
# each true positive is credited 4000 and each false positive is charged 500.
# NOTE(review): presumably currency amounts per event -- confirm their source.
confusionMatrix = pd.DataFrame(df).assign(
    netP_L = lambda cm: cm['truePos'] * 4000 - cm['falsePos'] * 500
)

# Display the thresholds ranked from best to worst net result.
confusionMatrix.sort_values(by = 'netP_L', ascending = False)

Unnamed: 0,var,truePos,falseNeg,falsePos,trueNeg,netP_L
1,0.95,116,117,563,420320,182500
0,0.99,52,181,171,420712,122500
2,0.9,146,87,926,419957,121000
3,0.85,165,68,1243,419640,38500
4,0.8,170,63,1459,419424,-49500
5,0.75,171,62,1589,419294,-110500
6,0.7,174,59,1706,419177,-157000
7,0.6,175,58,1830,419053,-215000
8,0.5,178,55,2028,418855,-302000
9,0.4,180,53,2290,418593,-425000


In [13]:
# trainNoDerates = yTrain['EquipmentID'].to_list()
# falsePos = 0
# for truck in trainNoDerates:
    
#     falsePos += (yTrainNoDerate.loc[
#         (yTrainNoDerate['EquipmentID'] == truck) &
#         (yTrainNoDerate['prediction'] > .9)
#     ]['EventTimeStamp'].diff().fillna(pd.to_timedelta(5,'d')) > '1 D').sum()
    
# falsePos

In [14]:
# for var in varience:
#     falsePos = (
#         yTrainNoDerate
#         .loc[yTrainNoDerate['prediction'] > var]
#         .groupby('EquipmentID')['EventTimeStamp']
#         .diff()
#         .fillna(pd.to_timedelta(5,'d')) > '1 D'
#     ).sum()
    
#     df.append({
#         'falsePos' : falsePos,
#         'trueNeg' : len(yTrainNoDerate) - falsePos
#     })

**Running the best threshold (`varience`) on the test data to see if a net gain was achieved**

In [15]:
# Trucks that have a derate on record and also appear in the test predictions.
# The test IDs are put in a set once; the original rebuilt and linearly scanned
# the full EquipmentID list for every derate truck.
testTruckIds = set(yTest['EquipmentID'].to_list())
testDeratesTrucks = [x for x in derateTrucks if x in testTruckIds]

# Half-width of the exclusion window placed around every derate (7 days).
derateWindow = datetime.timedelta(hours = 24 * 7)

# One frame per (truck, derate): the test rows of that truck whose timestamp
# lies within the window around the derate (bounds inclusive).
# Overlapping windows can put the same row in more than one frame.
removalTest = []
for truck in testDeratesTrucks:
    # Both per-truck selections are independent of the derate date, so they
    # are done once per truck rather than once per date.
    truckRows = yTest.loc[yTest['EquipmentID'] == truck]
    dates = derates.loc[derates['EquipmentID'] == truck, 'EventTimeStamp']

    for date in dates:
        pre = date - derateWindow
        post = date + derateWindow

        times = truckRows.loc[
            (truckRows['EventTimeStamp'] >= pre) &
            (truckRows['EventTimeStamp'] <= post)
        ].reset_index(drop = True)

        removalTest.append(times)

In [16]:
# Stack the per-derate window frames collected in `removalTest` and outer-merge
# them onto the full test table. With indicator=True pandas adds a `_merge`
# column; rows tagged 'left_only' exist in yTest but in none of the windows.
# Note: `removalTest` is rebound from a list of frames to the merged DataFrame.
windowRowsTest = pd.concat(removalTest)
removalTest = pd.merge(
    yTest,
    windowRowsTest,
    how = 'outer',
    on = ['EquipmentID', 'EventTimeStamp', 'target', 'prediction'],
    indicator = True
)

# Test rows that fall outside every derate window.
outsideWindowsTest = removalTest['_merge'] == 'left_only'
yTestNoDerate = removalTest.loc[outsideWindowsTest]

In [17]:
# testDeratesTrucks = [x for x in derateTrucks if x in yTest['EquipmentID'].to_list()]
# truePos = []
# for truck in testDeratesTrucks:
#     dates = (
#         derates
#         .loc[derates['EquipmentID'] == truck]
#         .sort_values(by = 'EventTimeStamp')
#         .drop_duplicates('EquipmentID')
#         .reset_index(drop = True)['EventTimeStamp']
#     )
#     for date in dates:
#         pre = date - datetime.timedelta(hours = 24 * 7)
#         post = date - datetime.timedelta(hours = 1)
        
#         times = yTest.loc[
#             (yTest['EquipmentID'] == truck) &
#             (yTest['EventTimeStamp'] >= pre) &
#             (yTest['EventTimeStamp'] <= post)
#         ].reset_index(drop = True)
        
#         truePos.append(int(times['prediction'].max() >= .9))

In [18]:
# Trucks that have a derate on record and also appear in the test predictions
# (set lookup instead of re-scanning the full ID list per truck).
testTruckIds = set(yTest['EquipmentID'].to_list())
testDeratesTrucks = [x for x in derateTrucks if x in testTruckIds]

# Threshold under evaluation, defined once so the true-positive rule, the
# false-positive rule and the reported 'var' value cannot drift apart.
threshold = .95

dfTest = []
truePos = []
for truck in testDeratesTrucks:
    # Per-truck selections do not depend on the derate date: do them once.
    truckRows = yTest.loc[yTest['EquipmentID'] == truck]
    dates = derates.loc[derates['EquipmentID'] == truck, 'EventTimeStamp']

    for date in dates:
        # Lead-up window: from 24 hours to 1 hour before the derate, inclusive.
        pre = date - datetime.timedelta(hours = 24)
        post = date - datetime.timedelta(hours = 1)

        times = truckRows.loc[
            (truckRows['EventTimeStamp'] >= pre) &
            (truckRows['EventTimeStamp'] <= post)
        ]

        # 1 if any prediction in the window reaches the threshold. An empty
        # window gives a NaN max, which compares False and is recorded as 0.
        truePos.append(int(times['prediction'].max() >= threshold))

# False alerts: rows outside every derate window whose prediction reaches the
# threshold. `>=` matches the true-positive rule above (the original used `>`
# here only). Alerts of the same truck are collapsed into episodes: an alert
# counts when it is the truck's first or comes more than 1 day after the
# previous one. Sorting makes that gap chronological regardless of how
# yTestNoDerate happens to be ordered.
alerts = yTestNoDerate.loc[yTestNoDerate['prediction'] >= threshold]
alertGaps = (
    alerts
    .sort_values('EventTimeStamp')
    .groupby('EquipmentID')['EventTimeStamp']
    .diff()
)
falsePos = (alertGaps.isna() | (alertGaps > pd.to_timedelta(1, 'd'))).sum()

dfTest.append({
    'var' : threshold,
    'truePos' : sum(truePos),
    'falseNeg' : len(truePos) - sum(truePos),
    'falsePos' : falsePos,
    # NOTE(review): rows minus alert episodes -- the two counts use different
    # units, so treat trueNeg as approximate; confirm the intended definition.
    'trueNeg' : len(yTestNoDerate) - falsePos
})

In [19]:
# Tabulate the test-set record and attach a net profit/loss column:
# each true positive is credited 4000 and each false positive is charged 500.
# NOTE(review): presumably currency amounts per event -- confirm their source.
confusionMatrixTest = pd.DataFrame(dfTest).assign(
    netP_L = lambda cm: cm['truePos'] * 4000 - cm['falsePos'] * 500
)

# Display the result (sorted by net result, highest first).
confusionMatrixTest.sort_values(by = 'netP_L', ascending = False)

Unnamed: 0,var,truePos,falseNeg,falsePos,trueNeg,netP_L
0,0.95,26,37,140,117390,34000
