# Big G Express Model Data Prep

## Imports

In [162]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

## Read in

In [163]:
# Load the three pickled frames (faults, diagnostics, and the wide
# fault/diagnostic table) in one pass; names are bound in the order listed.
# NOTE(review): unpickling executes arbitrary code — only load trusted files.
faults, diag, fdwide = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/faults_df.pickle',
        '../data/diag_df.pickle',
        '../data/fdwide_df.pickle',
    )
)

## Add grouping column to fdwide: eventGroup

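Note that the event grouper below starts a new event group only after a full derate (SPN 5246) or when the truck changes; partial (75%) derate events do not end a group here. A draft variant that also splits on partial derates is kept in the Appendix and can be swapped in if we choose to include those events.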

In [164]:
# Order events chronologically within each truck and renumber the rows 0..n-1,
# so that "previous row" comparisons further down refer to the preceding event
# of the same truck.
fdwide = (
    fdwide
    .sort_values(by=['EquipmentID', 'EventTimeStamp'])
    .reset_index(drop=True)
)

In [165]:
# A row opens a new event group when either
#   * the row directly above it was a full derate (spn 5246), or
#   * it belongs to a different truck than the row directly above it.
# The very first row has no predecessor, so the truck comparison is True there.
increment_check_full_only = (
    fdwide['spn'].shift(1).eq(5246)
    | fdwide['EquipmentID'].ne(fdwide['EquipmentID'].shift(1))
)

# Running count of group starts = group id carried by every row.
fdwide['eventGroup'] = increment_check_full_only.cumsum()

## Create target column: derate within `target_window_hours` hours

In [166]:
target_window_hours = 48  # look-ahead window (hours) before the end of an event group

# True on every row of a group whose final row is a full derate (spn 5246).
# The positionally last row is used (rows are already ordered by truck and
# time) instead of idxmax on the timestamp: idxmax returns the FIRST row that
# holds the maximum timestamp, so a fault logged at the same instant as the
# derate would be selected and the whole group would be mislabeled as not
# ending in a derate. The groupby result is aligned on fdwide's own index, so
# no positional re-indexing (and no reliance on a 0..n-1 index) is needed.
eventGroupEndDerate = (
    fdwide['spn'].eq(5246)
    .groupby(fdwide['eventGroup'])
    .transform('last')
    .astype(bool)
)

# Time remaining from each row until the latest event in its group.
# The string alias 'max' is passed rather than the builtin `max`; handing
# builtin callables to transform is deprecated in recent pandas releases.
fdwide['timeTillLast'] = (
    fdwide.groupby('eventGroup')['EventTimeStamp'].transform('max')
    - fdwide['EventTimeStamp']
)

# Target: the row lies strictly within the window before the end of its group
# AND that group ends in a full derate.
fdwide[f'derateWi{target_window_hours}Hours'] = (
    (fdwide['timeTillLast'] < dt.timedelta(hours=target_window_hours))
    & eventGroupEndDerate
)

In [168]:
# Inspect how a single event group concludes: show its last 15 rows with the
# grouping, fault-code, and target columns side by side.
eG = 9
fdwide.loc[
    fdwide['eventGroup'].eq(eG),
    [
        'EventTimeStamp',
        'EquipmentID',
        'eventGroup',
        'spn',
        'fmi',
        'timeTillLast',
        f'derateWi{target_window_hours}Hours',
    ],
].tail(15)

Unnamed: 0,EventTimeStamp,EquipmentID,eventGroup,spn,fmi,timeTillLast,derateWi48Hours
5137,2020-01-07 03:08:33,302,9,1213,5,6 days 09:06:31,False
5138,2020-01-07 12:02:12,302,9,1213,5,6 days 00:12:52,False
5139,2020-01-13 02:38:29,302,9,609,12,0 days 09:36:35,True
5140,2020-01-13 07:06:03,302,9,609,12,0 days 05:09:01,True
5141,2020-01-13 12:15:04,302,9,5246,19,0 days 00:00:00,True


## Appendix

### Save this to build a function for various model choices

In [None]:
# Draft grouper that closes an event group after EITHER a full or a partial
# derate. A row opens a new group when any one of the three conditions holds.
increment_check_either = (

                (fdwide['spn'].shift() == 5246)  # full derate in above row
                |
                # Partial derate in above row, identified by the spn/fmi PAIR.
                # Previously both comparisons were made against 'spn', which can
                # never equal 1596 and 31 at once, so this branch was always False.
                # NOTE(review): confirm the SPN — 1596 may be a transposition of
                # 1569; verify against the fault-code reference before relying on it.
                ((fdwide['spn'].shift() == 1596) & (fdwide['fmi'].shift() == 31))
                |
                (fdwide['EquipmentID'] != fdwide['EquipmentID'].shift())    # Current row is different truck from previous row

    )

# Follow-up columns, left disabled until the grouper is wrapped in a function.
# They now reference the "either" mask and group column (the draft pointed at
# increment_check_full_only and a non-existent 'eventGroupFull' column).
#fdwide['eventGroupEither'] = increment_check_either.cumsum()
#fdwide['timeTillEitherLast'] = fdwide.groupby('eventGroupEither')['EventTimeStamp'].transform('max') - fdwide['EventTimeStamp']
#fdwide['within24HoursEither'] = fdwide['timeTillEitherLast'] < dt.timedelta(hours = 24)