# Big G Express Data Preparation & Preliminary Explorations

## Import

In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Read in

In [77]:
# Load the four raw inputs from ../data: two CSV extracts and two Excel workbooks.
# NOTE(review): the recorded output of this cell shows a warning was emitted during
# loading; its message is truncated in the saved output — TODO confirm which read triggers it.
faults = pd.read_csv('../data/J1939Faults.csv')
diag = pd.read_csv('../data/VehicleDiagnosticOnboardData.csv')
makes = pd.read_excel('../data/Vehicle_Make.xlsx')
fault_codes = pd.read_excel('../data/Service Fault Codes_1_0_0_167.xlsx')


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  warn(msg)


## Glance at each df, convert datatypes, and prepare for merging

### faults
- 20 columns
- 1,187,335 rows
- EventTimeStamp, LocationTimeStamp cast as datetimes
- Many spn-fmi combinations have multiple rows in the dataset, usually one with an Algorithmic Description and another without one
- 1,187,335 distinct FaultIds

In [78]:
# (rows, columns) of the raw faults table.
faults.shape

(1187335, 20)

In [79]:
# Preview the first rows of faults to see the available columns and value formats.
faults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,111,17,True,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25.000
1,2,990360,2015-02-21 11:34:34.000,,,unknown,unknown,unknown,unknown,11,629,12,True,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10.000
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,False,127,,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26.000
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,True,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08.000
4,5,990416,2015-02-21 11:39:41.000,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37.000


In [80]:
# Convert both timestamp columns to datetime64[ns] in a single astype call.
faults = faults.astype(
    dict.fromkeys(['EventTimeStamp', 'LocationTimeStamp'], 'datetime64[ns]')
)

### diag (from Onboard Diagnostic Data)
- 4 columns
- 12,821,626 rows
- The data is in long form. Some values use a comma as the decimal separator, so we replace the commas with periods so that the wide-form columns can later be cast to floats.

In [81]:
# (rows, columns) of the long-form onboard diagnostics table.
diag.shape

(12821626, 4)

In [82]:
# Column dtypes and memory footprint of diag.
diag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12821626 entries, 0 to 12821625
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   Id       int64 
 1   Name     object
 2   Value    object
 3   FaultId  int64 
dtypes: int64(2), object(2)
memory usage: 391.3+ MB


In [83]:
# Number of distinct FaultId values that appear in the diagnostics table.
diag['FaultId'].nunique()

1187335

In [84]:
# Rows whose Value contains a comma (i.e. a comma used as the decimal separator).
# regex=False matches the comma literally; na=False counts missing/non-string Values
# as non-matches so the mask is purely boolean and can always be used for indexing.
diag.loc[diag['Value'].str.contains(',', regex = False, na = False)]

Unnamed: 0,Id,Name,Value,FaultId
200754,201459,EngineOilPressure,7598,18402
200755,201460,EngineOilTemperature,177575,18402
200757,201462,TurboBoostPressure,3335,18402
200760,201465,FuelLtd,1786793716715,18402
200761,201466,FuelRate,2068474,18402
...,...,...,...,...
5632164,5640222,IntakeManifoldTemperature,986,545727
5632166,5640224,EngineRpm,12615,545727
5632167,5640225,BarometricPressure,142825,545727
5632169,5640227,Speed,6447697,545727


In [85]:
# Which diagnostic Names ever carry a comma in their Value.
# regex=False matches the comma literally; na=False counts missing/non-string Values
# as non-matches so the mask is purely boolean and can always be used for indexing.
diag.loc[diag['Value'].str.contains(',', regex = False, na = False), 'Name'].unique()

array(['EngineOilPressure', 'EngineOilTemperature', 'TurboBoostPressure',
       'FuelLtd', 'FuelRate', 'IntakeManifoldTemperature', 'EngineRpm',
       'BarometricPressure', 'FuelLevel', 'Speed', 'EngineTimeLtd',
       'EngineCoolantTemperature', 'SwitchedBatteryVoltage',
       'DistanceLtd', 'FuelTemperature', 'Throttle', 'AcceleratorPedal'],
      dtype=object)

In [86]:
# Swap the decimal comma for a decimal point so the values can be cast to float later.
# regex=False makes this an explicit literal replacement: the default of `regex`
# differs between pandas versions, so relying on it is fragile.
diag['Value'] = diag['Value'].str.replace(',', '.', regex = False)

### fault_codes

- Collection of references to interpret the faults spn and fmi columns
- Focus here is therefore on SPN, J1939 FMI (FMI pertaining to derates), as well as the interpretations: Lamp Color, Lamp Device, Cummins Description, Algorithm Description
- Some SPN, J1939 FMI pairs have multiple rows, and in some cases the differences are nontrivial. For example, these lines:

    |Cummins Fault Code|	SPN	|J1939 FMI	|Lamp Color|	Lamp Device|	Cummins Description	|Algorithm Description|
    |------------------|---------|----------|----------|----------------|-----------------------|----------------------|
    |2519|	51|	2|	Red|	Stop / Shutdown|	Throttle Position Sensor - Data Erratic, Inter...	|Throttle actuator control lost|
    |3542|	51|	2|	Amber|	Warning	|Engine Intake Throttle Actuator Position Senso...	|NaN|

In [87]:
# (rows, columns) of the fault-code reference sheet.
fault_codes.shape

(7124, 14)

In [88]:
# Reference rows for SPN 629, ordered by SPN then J1939 FMI (first 15 shown).
(fault_codes
    .loc[fault_codes['SPN'].eq(629)]
    .sort_values(by = ['SPN', 'J1939 FMI'])
    .head(15)
)

Unnamed: 0,Published in CES 14602,Cummins Fault Code,Revision,PID,SID,MID,J1587 FMI,SPN,J1939 FMI,J2012 Pcode,Lamp Color,Lamp Device,Cummins Description,Algorithm Description
315,Y,525,167,Not Mapped,254,0,3,629,3,Not Mapped,Red,Stop / Shutdown,Engine Control Module Not Calibrated - Voltage...,The ECM is Not Calibrated.
4986,Y,7154,167,Not Mapped,Not Mapped,Not Mapped,3,629,3,Not Mapped,Amber,Warning,Engine Control Module Circuit - Voltage Above ...,
4987,Y,7155,167,Not Mapped,Not Mapped,Not Mapped,4,629,4,Not Mapped,Amber,Warning,Engine Control Module Circuit - Voltage Below ...,
2390,Y,3549,167,Not Mapped,Not Mapped,Not Mapped,7,629,7,Not Mapped,Amber,Warning,Engine Control Module Warning Internal Hardwar...,Dual pull-up resistors for temperature sensors...
1841,Y,2768,167,Not Mapped,Not Mapped,Not Mapped,9,629,9,Not Mapped,Red,Stop / Shutdown,Engine Network Master Slave Communication - Ab...,The communication between master and slave mod...
4258,Y,6155,167,Not Mapped,Not Mapped,Not Mapped,11,629,11,Not Mapped,Amber,Warning,Engine Control Module - Root Cause Not Known,
0,Y,111,167,Not Mapped,254,0,12,629,12,P0606,Red,Stop / Shutdown,Engine Control Module Critical Internal Failur...,Error internal to the ECM related to memory ha...
180,Y,343,167,Not Mapped,254,0,12,629,12,P0607,Amber,Warning,Engine Control Module Warning Internal Hardwar...,ECM power supply errors have been detected.
689,Y,1116,167,Not Mapped,254,0,12,629,12,Not Mapped,Amber,Warning,Engine Control Module Critical Internal Failur...,ECM Internal failure has occurred.
854,Y,1388,167,Not Mapped,254,0,12,629,12,Not Mapped,,,Engine Control Module Data Lost - Bad Intellig...,The ECM data has been lost.


In [89]:
# Keep only the key columns (code, SPN, FMI) and the interpretation columns.
fault_codes = fault_codes.loc[:, [
    'Cummins Fault Code',
    'SPN',
    'J1939 FMI',
    'Lamp Color',
    'Lamp Device',
    'Cummins Description',
    'Algorithm Description',
]]

activeTransitionCount seems mislabeled. It doesn't increment when active toggles for a given vehicle.

Hypothesis: activeTransitionCount is a mislabeled Cummins Fault Code

Conclusion: Hypothesis is false. The Cummins Fault Codes don't match the activeTransitionCounts for the same spn-fmi pairs.

Correction: activeTransitionCount does increment, but only until it reaches 127. We may want to build our own modified counter.

In [90]:
# Combined "<spn>-<fmi>" string key identifying each fault type.
faults['spn-fmi'] = (
    faults['spn'].astype(str)
    .str.cat(faults['fmi'].astype(str), sep = '-')
)

In [91]:
# For each activeTransitionCount value: how many distinct spn-fmi pairs report it,
# largest first.
(faults
    .groupby(by = 'activeTransitionCount')['spn-fmi']
    .agg('nunique')
    .sort_values(ascending = False)
)

activeTransitionCount
1      952
2      618
3      461
4      408
5      360
      ... 
115     57
109     57
118     55
117     54
120     48
Name: spn-fmi, Length: 128, dtype: int64

In [92]:
# For each spn-fmi pair: how many distinct activeTransitionCount values it takes,
# largest first.
(faults
    .groupby(by = 'spn-fmi')['activeTransitionCount']
    .agg('nunique')
    .sort_values(ascending = False)
)

spn-fmi
1569-31      127
802-4        127
523531-31    127
111-18       127
791-7        127
            ... 
2795-1         1
5625-13        1
5616-18        1
5615-16        1
98-5           1
Name: activeTransitionCount, Length: 1064, dtype: int64

In [93]:
# Preview up to the first 20 rows of fault_codes.
fault_codes.head(20)

Unnamed: 0,Cummins Fault Code,SPN,J1939 FMI,Lamp Color,Lamp Device,Cummins Description,Algorithm Description
0,111,629,12,Red,Stop / Shutdown,Engine Control Module Critical Internal Failur...,Error internal to the ECM related to memory ha...
1,112,635,7,Red,Stop / Shutdown,Engine Timing Actuator Driver Circuit - Mechan...,Mechanical failure in the engine timing actuat...
2,113,635,3,Amber,Warning,Engine Timing Actuator Driver Circuit - Voltag...,High signal voltage detected at the engine tim...
3,114,635,4,Amber,Warning,Engine Timing Actuator Driver Circuit - Voltag...,Low voltage detected at the engine timing actu...
4,115,612,2,Red,Stop / Shutdown,Engine Magnetic Speed/Position Lost Both of Tw...,The ECM has detected that the primary and back...
5,116,156,3,Red,Stop / Shutdown,Injector Timing Rail 1 Fuel Pressure Sensor Ci...,High signal voltage detected at the Injector T...
6,117,156,4,Red,Stop / Shutdown,Injector Timing Rail 1 Fuel Pressure Sensor Ci...,Low signal voltage detected at the Injector Ti...
7,118,135,3,Amber,Warning,Fuel Delivery Pressure 1 Sensor Circuit - Volt...,High signal voltage detected at the fuel deliv...
8,119,135,4,Amber,Warning,Fuel Delivery Pressure 1 Sensor Circuit - Volt...,Low signal voltage detected at the fuel delive...
9,121,190,10,Amber,Warning,Engine Magnetic Crankshaft Speed/Position Lost...,The ECM has lost one of the two speed/position...


In [94]:
# Spot-check: spn, fmi and activeTransitionCount for the first five spn 628 faults.
faults.loc[
    faults['spn'].eq(628),
    ['spn', 'fmi', 'activeTransitionCount'],
].head(5)

Unnamed: 0,spn,fmi,activeTransitionCount
134401,628,11,1
134404,628,11,1
391488,628,11,1
391496,628,11,1
456390,628,11,1


In [95]:
# Every fault record whose activeTransitionCount equals 127.
faults.loc[faults['activeTransitionCount'].eq(127)]

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,...,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,spn-fmi
1,2,990360,2015-02-21 11:34:34,,,unknown,unknown,unknown,unknown,11,...,12,True,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10,629-12
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,...,2,False,127,,1369,105336226,41.421250,-87.767361,2015-02-21 11:35:26,1807-2
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,...,2,True,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08,1807-2
14,15,990494,2015-02-21 11:14:38,Incorrect Data Brake Signal Sensor 1,,unknown,unknown,unknown,unknown,11,...,2,True,127,,309,105442799,36.181898,-86.695046,2015-02-21 11:44:52,1067-2
31,32,990702,2015-02-21 11:57:37,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,...,2,False,127,,1369,105336226,41.427870,-87.756759,2015-02-21 11:57:32,1807-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187021,1248145,123740230,2020-03-05 14:07:54,Special Instructions Parking Brake Switch,,5569976 *E245.e002*,K1236156,FAOM-xx810S-EC3,EATON,3,...,14,True,127,,2082,108087554,36.066944,-86.434120,2020-03-05 14:17:19,70-14
1187231,1248355,123847884,2020-03-06 09:08:47,Incorrect Data J1939 Network #1 Primary Vehicl...,,unknown,unknown,unknown,unknown,11,...,2,True,127,,2271,105439765,36.066990,-86.433796,2020-03-06 08:32:18,639-2
1187232,1248356,123850076,2020-03-06 09:16:59,Incorrect Data J1939 Network #1 Primary Vehicl...,,unknown,unknown,unknown,unknown,11,...,2,False,127,,2271,105439765,36.066990,-86.433796,2020-03-06 08:32:18,639-2
1187241,1248365,123852183,2020-03-06 09:23:24,Incorrect Data J1939 Network #1 Primary Vehicl...,,unknown,unknown,unknown,unknown,11,...,2,True,127,,2089,108004984,29.265370,-82.189629,2020-03-06 09:24:01,639-2


### makes

We've discovered that makes has no referential integrity with the rest of the data.

## Merge faults and diag

- Widen diag so that each diagnostic Name has its own column (though we'll get a lot of NaNs). Call this diag_wide.
- Make booleans actual boolean values rather than objects. Note that columns containing booleans and NaNs are technically mixed type and thus still "object" type.
- Cast the rest of the columns as floats.
- Merge faults and diag_wide on RecordID and FaultId, respectively. Called fdwide.
- Merge fdwide with fault_codes on spn and fmi. Called full.
    - This creates duplicate RecordID/FaultId values/rows because fault_codes has multiple lines for the same spn and fmi pairs. We should deal with this.
        - [ ] Option 1: Only take the first value, prioritizing those with non-null Algorithm Descriptions
        - [ ] Option 2: Widen the spn/fmi pairs to have multiple entries for Lamp Color, Lamp Device, Cummins Description, and Algorithm Description
        - [x] Option 3: Wait to incorporate fault_codes until we need to interpret results. Just work with fdwide.
- Pickle faults, diag, and fdwide for use in other notebooks.

In [96]:
# Pivot the long-form diagnostics: one row per FaultId, one column per diagnostic
# Name, each cell holding that reading's Value.
diag_wide = diag.pivot(index = ['FaultId'],columns = 'Name', values = 'Value')

diag_wide_booleans = ['CruiseControlActive', 'IgnStatus', 'ParkingBrake']
boolean_mapper = {'True':True, 'False':False}

# Turn the 'True'/'False' strings into real booleans; any value not in the mapper
# (including missing readings) becomes NaN.
for col in diag_wide_booleans:
    diag_wide[col] = diag_wide[col].map(boolean_mapper)

# Cast every non-boolean column to float in a single astype call instead of
# reassigning the columns one at a time.
numeric_cols = diag_wide.columns[~diag_wide.columns.isin(diag_wide_booleans)]
diag_wide = diag_wide.astype({col: float for col in numeric_cols})

# Attach the wide diagnostics to each fault record (faults.RecordID <-> diag.FaultId).
fdwide = faults.merge(diag_wide.reset_index(), left_on = 'RecordID', right_on='FaultId')

# This is an inner join, so a fault with no diagnostics would be dropped silently.
# Fail loudly if the row count changes instead of losing records unnoticed.
assert len(fdwide) == len(faults), (
    f'faults/diag_wide merge changed the row count: {len(faults)} -> {len(fdwide)}'
)

# Left-join the reference descriptions. fault_codes has several rows for some
# SPN/FMI pairs, so `full` can contain more rows than fdwide.
full = fdwide.merge(fault_codes, 
                            left_on = ['spn', 'fmi'], 
                            right_on = ['SPN', 'J1939 FMI'], 
                            how = 'left'
    )

In [97]:
# Compare shapes before and after de-duplication: fdwide on its own, then `full`
# de-duplicated on the columns it inherited from fdwide.
base_cols = fdwide.columns
print('fdwide shape: ', fdwide.shape)
print('fdwide dedup shape: ', fdwide.drop_duplicates().shape)
print('full: ', full.shape)
print('full dedup: ', full.drop_duplicates(subset = base_cols).shape)
print('full origin columns dedup: ', full.loc[:, base_cols].drop_duplicates().shape)
# Unused alternative filter, kept for reference:
#full[~((full.duplicated(subset = ['RecordID', 'spn', 'fmi'], keep = False))&(full['Algorithm Description'].isnull()))]

fdwide shape:  (1187335, 46)
fdwide dedup shape:  (1187335, 46)
full:  (1773397, 53)
full dedup:  (1187335, 53)
full origin columns dedup:  (1187335, 46)


In [98]:
# Persist the prepared frames under ../data as pickles: faults (timestamps cast,
# spn-fmi key added), diag (decimal commas replaced), and the merged fdwide.
faults.to_pickle('../data/faults_df.pickle')
diag.to_pickle('../data/diag_df.pickle')
fdwide.to_pickle('../data/fdwide_df.pickle')