In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
# Train / test frames; later cells use their EquipmentID, EventTimeStamp,
# target and prediction columns.
yTrain, yTest = (
    pd.read_csv(csvPath)
    for csvPath in ('../data/yTrain.csv', '../data/yTest.csv')
)
# Derate events, with fully identical rows removed.
derates = pd.read_csv('../data/derates.csv').drop_duplicates()

**Creating lists of trucks for certain cases**

In [3]:
# Distinct equipment IDs per split, in order of first appearance.
trainTrucks = yTrain['EquipmentID'].drop_duplicates().to_list()
testTrucks = yTest['EquipmentID'].drop_duplicates().to_list()

# A truck may occur in both splits; dict.fromkeys removes the repeats while
# keeping first-seen order, so `trucks` lists every truck exactly once.
trucks = list(dict.fromkeys(trainTrucks + testTrucks))

# Trucks that have at least one derate event.
derateTrucks = derates['EquipmentID'].drop_duplicates().to_list()

# Set lookup instead of scanning the derate list for every truck.
derateTruckSet = set(derateTrucks)
noDerateTrucks = [i for i in trucks if i not in derateTruckSet]

**Turning timestamp columns into datetimes**

In [4]:
# Parse the EventTimeStamp column of each frame into pandas datetimes so the
# window arithmetic in later cells works on real timestamps.
for frame in (derates, yTrain, yTest):
    frame['EventTimeStamp'] = pd.to_datetime(frame['EventTimeStamp'])

**Removing the derates from the dataset**

In [5]:
# Collect every yTrain row that lies within +/- 7 days of a derate on the same
# truck. `removal` ends up as a list holding one DataFrame per derate event;
# windows of nearby derates can overlap, so a row may appear in several pieces.

# Build the membership set once instead of re-creating (and linearly scanning)
# the full EquipmentID list for every derate truck.
trainEquipment = set(yTrain['EquipmentID'].to_list())
trainDeratesTrucks = [x for x in derateTrucks if x in trainEquipment]

week = datetime.timedelta(hours = 24 * 7)

removal = []
for truck in trainDeratesTrucks:
    # Filter this truck's rows once; the per-derate windows below then only
    # have to scan this (much smaller) slice.
    truckRows = yTrain.loc[yTrain['EquipmentID'] == truck]

    # Every derate event of the truck is used (not only the first one).
    dates = (
        derates
        .loc[derates['EquipmentID'] == truck]
        .reset_index(drop = True)['EventTimeStamp']
    )
    for date in dates:
        pre = date - week
        post = date + week

        # Inclusive on both ends.
        times = truckRows.loc[
            (truckRows['EventTimeStamp'] >= pre) &
            (truckRows['EventTimeStamp'] <= post)
        ].reset_index(drop = True)

        removal.append(times)

In [6]:
# Rows of yTrain that fall inside any derate window. Overlapping windows put
# the same row into several pieces of `removal`, so exact duplicates are dropped
# before merging; otherwise the merge repeats each matching yTrain row once per
# copy. An empty list (no derate trucks) yields an empty frame with yTrain's
# columns instead of making pd.concat raise.
removalRows = (
    pd.concat(removal).drop_duplicates()
    if len(removal) > 0
    else yTrain.iloc[0:0]
)

# NOTE: this rebinds `removal` from a list to a DataFrame, so re-running this
# cell requires re-running the window-collection cell first.
removal = (
    yTrain
    .merge(
        removalRows,
        how = 'outer',
        on = ['EquipmentID', 'EventTimeStamp', 'target', 'prediction'],
        indicator = True
    )
)

# 'left_only' marks rows present in yTrain but in no derate window.
yTrainNoDerate = removal.loc[removal['_merge'] == 'left_only']

**Evaluating the true confusion matrix at different prediction thresholds**

In [7]:
# Sweep prediction thresholds on the training data and build one
# confusion-matrix record per threshold in `df`.
#
# truePos  : derate events with at least one prediction >= threshold in the
#            window [derate - 7 days, derate - 1 hour]
# falseNeg : derate events without such a prediction
# falsePos : alert episodes outside all derate windows (see below)
# trueNeg  : len(yTrainNoDerate) - falsePos
#            NOTE(review): this subtracts an episode count from a row count.

# Build the membership set once instead of re-creating the full EquipmentID
# list for every derate truck.
trainEquipment = set(yTrain['EquipmentID'].to_list())
trainDeratesTrucks = [x for x in derateTrucks if x in trainEquipment]

# Despite the name, these are prediction-score thresholds.
varience = [.99, .95, .9, .85, .8, .75, .7, .6, .5, .4, .3]

# The highest prediction in each derate's look-back window does not depend on
# the threshold, so it is computed once here instead of once per threshold.
lookBack = datetime.timedelta(hours = 24 * 7)
leadTime = datetime.timedelta(hours = 1)

windowMax = []
for truck in trainDeratesTrucks:
    truckRows = yTrain.loc[yTrain['EquipmentID'] == truck]
    dates = derates.loc[derates['EquipmentID'] == truck, 'EventTimeStamp']
    for date in dates:
        pre = date - lookBack
        post = date - leadTime

        inWindow = (
            (truckRows['EventTimeStamp'] >= pre) &
            (truckRows['EventTimeStamp'] <= post)
        )
        # An empty window gives NaN, which compares False against every
        # threshold below, i.e. the derate is counted as a false negative.
        windowMax.append(truckRows.loc[inWindow, 'prediction'].max())

# diff() is only meaningful on time-ordered rows within each truck, so sort
# explicitly rather than relying on the row order left by the merge.
noDerateSorted = yTrainNoDerate.sort_values(['EquipmentID', 'EventTimeStamp'])

df = []
for var in varience:
    truePos = [int(peak >= var) for peak in windowMax]

    # NOTE(review): true positives use >= while false positives use a strict >;
    # kept as in the original so the reported numbers do not change.
    alerts = noDerateSorted.loc[noDerateSorted['prediction'] > var]

    # Consecutive alerts on one truck less than a day apart are treated as the
    # same episode. The first alert of a truck has no predecessor (NaT gap);
    # filling it with 5 days makes it count as a new episode.
    gaps = alerts.groupby('EquipmentID')['EventTimeStamp'].diff()
    falsePos = (gaps.fillna(pd.Timedelta(days = 5)) > pd.Timedelta(days = 1)).sum()

    df.append({
        'var' : var,
        'truePos' : sum(truePos),
        'falseNeg' : len(truePos) - sum(truePos),
        'falsePos' : falsePos,
        'trueNeg' : len(yTrainNoDerate) - falsePos
    })

In [8]:
# Tabulate the sweep records and score each threshold: every true positive is
# valued at 4000 and every false positive costs 500.
confusionMatrix = pd.DataFrame(df)
confusionMatrix['netP_L'] = (
    confusionMatrix['truePos'].mul(4000)
    .sub(confusionMatrix['falsePos'].mul(500))
)

# Show the thresholds with the best net result first.
confusionMatrix.sort_values(by = 'netP_L', ascending = False)

Unnamed: 0,var,truePos,falseNeg,falsePos,trueNeg,netP_L
1,0.95,259,131,557,420261,757500
2,0.9,297,93,919,419899,728500
3,0.85,321,69,1236,419582,666000
4,0.8,326,64,1452,419366,578000
5,0.75,327,63,1582,419236,517000
6,0.7,338,52,1699,419119,502500
0,0.99,142,248,169,420649,483500
7,0.6,339,51,1823,418995,444500
8,0.5,343,47,2021,418797,361500
9,0.4,344,46,2283,418535,234500


In [9]:
# Scratch (disabled): per-truck loop that counts false-positive episodes at a
# fixed 0.9 threshold. The active threshold-sweep cell above performs the same
# count with a groupby instead.
# NOTE(review): trainNoDerates is built without drop_duplicates, so this loop
# would visit a truck once per yTrain row if it were re-enabled.
# trainNoDerates = yTrain['EquipmentID'].to_list()
# falsePos = 0
# for truck in trainNoDerates:
    
#     falsePos += (yTrainNoDerate.loc[
#         (yTrainNoDerate['EquipmentID'] == truck) &
#         (yTrainNoDerate['prediction'] > .9)
#     ]['EventTimeStamp'].diff().fillna(pd.to_timedelta(5,'d')) > '1 D').sum()
    
# falsePos

In [10]:
# Scratch (disabled): the false-positive part of the threshold sweep on its
# own. The active sweep cell above contains the same groupby/diff computation
# and also records 'var', 'truePos' and 'falseNeg', which this draft omits.
# for var in varience:
#     falsePos = (
#         yTrainNoDerate
#         .loc[yTrainNoDerate['prediction'] > var]
#         .groupby('EquipmentID')['EventTimeStamp']
#         .diff()
#         .fillna(pd.to_timedelta(5,'d')) > '1 D'
#     ).sum()
    
#     df.append({
#         'falsePos' : falsePos,
#         'trueNeg' : len(yTrainNoDerate) - falsePos
#     })

**Running the best threshold on the test data to see if a net gain was achieved**

In [11]:
# Collect every yTest row that lies within +/- 7 days of a derate on the same
# truck. `removalTest` ends up as a list holding one DataFrame per derate event;
# windows of nearby derates can overlap, so a row may appear in several pieces.

# Build the membership set once instead of re-creating (and linearly scanning)
# the full EquipmentID list for every derate truck.
testEquipment = set(yTest['EquipmentID'].to_list())
testDeratesTrucks = [x for x in derateTrucks if x in testEquipment]

week = datetime.timedelta(hours = 24 * 7)

removalTest = []
for truck in testDeratesTrucks:
    # Filter this truck's rows once; the per-derate windows below then only
    # have to scan this (much smaller) slice.
    truckRows = yTest.loc[yTest['EquipmentID'] == truck]

    # Every derate event of the truck is used (not only the first one).
    dates = (
        derates
        .loc[derates['EquipmentID'] == truck]
        .reset_index(drop = True)['EventTimeStamp']
    )
    for date in dates:
        pre = date - week
        post = date + week

        # Inclusive on both ends.
        times = truckRows.loc[
            (truckRows['EventTimeStamp'] >= pre) &
            (truckRows['EventTimeStamp'] <= post)
        ].reset_index(drop = True)

        removalTest.append(times)

In [12]:
# Rows of yTest that fall inside any derate window. Overlapping windows put the
# same row into several pieces of `removalTest`, so exact duplicates are dropped
# before merging; otherwise the merge repeats each matching yTest row once per
# copy. An empty list (no derate trucks) yields an empty frame with yTest's
# columns instead of making pd.concat raise.
removalTestRows = (
    pd.concat(removalTest).drop_duplicates()
    if len(removalTest) > 0
    else yTest.iloc[0:0]
)

# NOTE: this rebinds `removalTest` from a list to a DataFrame, so re-running
# this cell requires re-running the window-collection cell first.
removalTest = (
    yTest
    .merge(
        removalTestRows,
        how = 'outer',
        on = ['EquipmentID', 'EventTimeStamp', 'target', 'prediction'],
        indicator = True
    )
)

# 'left_only' marks rows present in yTest but in no derate window.
yTestNoDerate = removalTest.loc[removalTest['_merge'] == 'left_only']

In [13]:
# Scratch (disabled): earlier draft of the test-set true-positive count. It
# keeps only each truck's earliest derate (sort_values + drop_duplicates on
# EquipmentID) and uses a 0.9 threshold; the active cell that follows uses
# every derate event and a 0.95 threshold.
# testDeratesTrucks = [x for x in derateTrucks if x in yTest['EquipmentID'].to_list()]
# truePos = []
# for truck in testDeratesTrucks:
#     dates = (
#         derates
#         .loc[derates['EquipmentID'] == truck]
#         .sort_values(by = 'EventTimeStamp')
#         .drop_duplicates('EquipmentID')
#         .reset_index(drop = True)['EventTimeStamp']
#     )
#     for date in dates:
#         pre = date - datetime.timedelta(hours = 24 * 7)
#         post = date - datetime.timedelta(hours = 1)
        
#         times = yTest.loc[
#             (yTest['EquipmentID'] == truck) &
#             (yTest['EventTimeStamp'] >= pre) &
#             (yTest['EventTimeStamp'] <= post)
#         ].reset_index(drop = True)
        
#         truePos.append(int(times['prediction'].max() >= .9))

In [14]:
# Evaluate a single threshold on the test data and store the confusion-matrix
# record in `dfTest` (same fields as the training sweep records).

# Build the membership set once instead of re-creating the full EquipmentID
# list for every derate truck.
testEquipment = set(yTest['EquipmentID'].to_list())
testDeratesTrucks = [x for x in derateTrucks if x in testEquipment]

# Threshold under test (0.95 has the highest netP_L in the training table);
# defined once so the true-positive, false-positive and record code cannot
# drift apart.
bestThreshold = .95

lookBack = datetime.timedelta(hours = 24 * 7)
leadTime = datetime.timedelta(hours = 1)

dfTest = []
truePos = []
for truck in testDeratesTrucks:
    # Filter this truck's rows once instead of re-masking yTest per derate.
    truckRows = yTest.loc[yTest['EquipmentID'] == truck]
    dates = derates.loc[derates['EquipmentID'] == truck, 'EventTimeStamp']
    for date in dates:
        pre = date - lookBack
        post = date - leadTime

        times = truckRows.loc[
            (truckRows['EventTimeStamp'] >= pre) &
            (truckRows['EventTimeStamp'] <= post)
        ]

        # An empty window gives a NaN max, which compares False, i.e. the
        # derate is counted as a false negative.
        truePos.append(int(times['prediction'].max() >= bestThreshold))

# Alert episodes outside all derate windows: consecutive alerts on one truck
# less than a day apart are one episode; a truck's first alert (NaT gap, filled
# with 5 days) always starts a new one. Rows are sorted explicitly because
# diff() is only meaningful on time-ordered rows within each truck.
# NOTE(review): true positives use >= while this uses a strict >; kept as in
# the original so the reported numbers do not change.
falsePos = (
    yTestNoDerate
    .loc[yTestNoDerate['prediction'] > bestThreshold]
    .sort_values(['EquipmentID', 'EventTimeStamp'])
    .groupby('EquipmentID')['EventTimeStamp']
    .diff()
    .fillna(pd.Timedelta(days = 5)) > pd.Timedelta(days = 1)
).sum()

# NOTE(review): trueNeg subtracts an episode count from a row count.
dfTest.append({
    'var' : bestThreshold,
    'truePos' : sum(truePos),
    'falseNeg' : len(truePos) - sum(truePos),
    'falsePos' : falsePos,
    'trueNeg' : len(yTestNoDerate) - falsePos
})

In [15]:
# Tabulate the test-set record and score it the same way as the training
# sweep: 4000 per true positive minus 500 per false positive.
confusionMatrixTest = pd.DataFrame(dfTest)
confusionMatrixTest['netP_L'] = (
    confusionMatrixTest['truePos'].mul(4000)
    .sub(confusionMatrixTest['falsePos'].mul(500))
)

# Best net result first (a single row here).
confusionMatrixTest.sort_values(by = 'netP_L', ascending = False)

Unnamed: 0,var,truePos,falseNeg,falsePos,trueNeg,netP_L
0,0.95,64,38,138,117353,187000
