# Big G Express EDA

## Import

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import datetime as dt
from my_functions import create_target_window, stratifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from umap import UMAP
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score
from scipy.stats import chi2_contingency, f_oneway
import pickle

## Read in

In [65]:
faults = pd.read_pickle('../data/faults_df.pickle')
diag = pd.read_pickle('../data/diag_df.pickle')
fdwide = pd.read_pickle('../data/fdwide_df.pickle')

In [66]:
# Create target get column for full derate within 48 hours
full48 = create_target_window(fdwide, both_derate_types=False, target_window_hours=48)

## Get some datatypes organized, check NaNs

In [67]:
full48['LampStatus'] = full48['LampStatus'].astype(int).astype(str)

In [68]:
full48.isna().sum()[:20]

RecordID                       0
ESS_Id                         0
EventTimeStamp                 0
eventDescription           60845
actionDescription        1187335
ecuSoftwareVersion        296050
ecuSerialNumber           343017
ecuModel                   64758
ecuMake                    64758
ecuSource                      0
spn                            0
fmi                            0
active                         0
activeTransitionCount          0
faultValue               1187335
EquipmentID                    0
MCTNumber                      0
Latitude                       0
Longitude                      0
LocationTimeStamp              0
dtype: int64

In [69]:
full48.isna().sum()[20:40]

spn-fmi                           0
FaultId                           0
AcceleratorPedal             655446
BarometricPressure           601359
CruiseControlActive          612419
CruiseControlSetSpeed        610877
DistanceLtd                  601516
EngineCoolantTemperature     601264
EngineLoad                   601714
EngineOilPressure            601091
EngineOilTemperature         603423
EngineRpm                    600414
EngineTimeLtd                605969
FuelLevel                    684540
FuelLtd                      602140
FuelRate                     602098
FuelTemperature              888225
IgnStatus                    578881
IntakeManifoldTemperature    601044
LampStatus                        0
dtype: int64

In [70]:
full48.isna().sum()[40:]

ParkingBrake               787363
ServiceDistance           1187120
Speed                      603419
SwitchedBatteryVoltage    1073276
Throttle                   766832
TurboBoostPressure         603984
eventGroup                      0
timeTillLast                    0
target                          0
dtype: int64

## Throw out irrelevant columns or columns with too many nulls

In [90]:
dropcols = ['RecordID','actionDescription','ESS_Id','eventDescription', 'ecuSoftwareVersion', 'ecuSerialNumber', 'ecuModel', 'ecuMake', 'ecuSource', 
            'faultValue', 'EquipmentID', 'MCTNumber', 'LocationTimeStamp', 'eventGroup', 'FaultId', 'EventTimeStamp', 'ParkingBrake', 'ServiceDistance',
            'SwitchedBatteryVoltage', 'FuelTemperature', 'Throttle', 'LampStatus']
catcols = ['spn', 'fmi', 'spn-fmi']
X = full48.assign(timeTillLast = full48['timeTillLast'].dt.total_seconds()).drop(columns = dropcols).dropna()
X.shape

(431977, 27)

## Determine imputation strategies for fault-related variables

I will write up more about this, but I think imputation is very risky if it's going to be for over half the values in the column. Before that, let's check if any of our fault variables are correlated with each other.

## Check for distribution relationships to target variable(s)
- Pearson correlations with timeTillLast (time till derate)
- Chi2 for categorical variables
    - spn
    - fmi
- ANOVA for numeric variables
    - active
    - activeTransitionCount
    - Latitude
    - Longitude
    - AcceleratorPedal
    - BarometricPressure
    - CruiseControlActive
    - CruiseControlSetSpeed
    - DistanceLtd
    - EngineCoolantTemperature
    - EngineLoad
    - EngineOilPressure
    - EngineOilTemperature
    - EngineRpm
    - EngineTimeLtd
    - FuelLevel
    - FuelLtd
    - FuelRate
    - IgnStatus
    - IntakeManifoldTemperature
    - LampStatus
    - Speed
    - TurboBoostPressure
    - timeTillLast
    - target

In [92]:
numcols = X.drop(columns = catcols+['active']).columns

Since n is very high, p-values don't really help us distinguish truly meaningful values. Instead, we should look just at 
- highest and lowest correlation
- ANOVA effect sizes

In [73]:
corr_long = X[numcols].corr().melt(ignore_index = False)
corr_long[corr_long['value'] != 1].sort_values('value', ascending = False).drop_duplicates('value').head(25)

Unnamed: 0,variable,value
FuelRate,EngineLoad,0.950496
TurboBoostPressure,FuelRate,0.898928
FuelLtd,DistanceLtd,0.886513
TurboBoostPressure,EngineLoad,0.828309
DistanceLtd,EngineTimeLtd,0.757848
EngineTimeLtd,FuelLtd,0.75496
EngineOilPressure,EngineRpm,0.749697
Speed,EngineRpm,0.742469
FuelRate,AcceleratorPedal,0.701349
EngineCoolantTemperature,EngineOilTemperature,0.682758


In [74]:
corr_long[corr_long['value'] != 1].sort_values('value', ascending = False).drop_duplicates('value').tail(25)

Unnamed: 0,variable,value
EngineTimeLtd,Speed,-0.073595
EngineTimeLtd,activeTransitionCount,-0.075009
AcceleratorPedal,activeTransitionCount,-0.081378
BarometricPressure,EngineCoolantTemperature,-0.083197
EngineRpm,BarometricPressure,-0.083707
BarometricPressure,FuelRate,-0.083983
target,timeTillLast,-0.090682
activeTransitionCount,EngineLoad,-0.094079
BarometricPressure,EngineOilTemperature,-0.100923
activeTransitionCount,EngineRpm,-0.101817


In [75]:
corr_long[corr_long['variable'] == 'timeTillLast'].sort_values('value', ascending = False)

Unnamed: 0,variable,value
timeTillLast,timeTillLast,1.0
activeTransitionCount,timeTillLast,0.131135
EngineOilPressure,timeTillLast,0.075228
FuelLevel,timeTillLast,0.074273
CruiseControlSetSpeed,timeTillLast,0.068367
AcceleratorPedal,timeTillLast,0.027554
BarometricPressure,timeTillLast,0.00983
EngineRpm,timeTillLast,0.007826
IntakeManifoldTemperature,timeTillLast,0.004596
EngineLoad,timeTillLast,0.004155


Run Chi2 on spn, fmi, and spn-fmi against the target (48 hours)

In [76]:
chi2_contingency(pd.crosstab(X['target'], X['spn']))

(59554.40104889869,
 0.0,
 393,
 array([[6.23698262e+02, 1.19178012e+01, 1.23150612e+02, 2.97945029e+00,
         2.56034095e+03, 9.93150098e+00, 2.58219026e+01, 1.06366376e+03,
         1.78767018e+01, 9.93150098e+00, 1.19178012e+01, 9.93150098e+00,
         3.14828581e+02, 3.91301139e+02, 9.93150098e-01, 2.38058079e+03,
         5.95890059e+00, 3.16814881e+02, 1.08253361e+02, 4.48218571e+04,
         1.11788975e+04, 1.31095813e+02, 5.16438051e+02, 8.12396780e+02,
         4.15136741e+02, 2.78082028e+01, 1.48972515e+01, 9.93150098e+00,
         6.83287268e+02, 1.28867184e+05, 9.93150098e+00, 9.93150098e-01,
         3.32705283e+02, 6.95205069e+00, 9.66335046e+02, 9.93150098e+00,
         1.98630020e+00, 6.95205069e+00, 6.49520164e+02, 1.83732768e+03,
         2.65171076e+02, 1.58904016e+01, 2.13527271e+02, 8.93835088e+00,
         1.32088963e+02, 4.96575049e+01, 1.68835517e+01, 1.68835517e+01,
         1.78767018e+01, 2.70136827e+02, 2.97945029e+00, 4.88629848e+02,
         9.93150098

In [77]:
chi2_contingency(pd.crosstab(X['target'], X['fmi']))

(4367.790894715339,
 0.0,
 24,
 array([[9.60078200e+03, 7.06824925e+03, 2.69858745e+04, 9.13896720e+04,
         1.46191694e+04, 2.61794366e+03, 4.92602449e+02, 8.88968653e+03,
         2.42626569e+03, 8.90736460e+04, 2.82948463e+03, 1.59897166e+03,
         1.83037563e+03, 6.70376316e+02, 3.52667600e+03, 7.88461863e+03,
         3.48397054e+03, 1.10693531e+05, 2.36578285e+04, 2.42527254e+03,
         1.93465639e+03, 3.67465536e+01, 1.98630020e+00, 1.93664269e+02,
         1.50859500e+04],
        [6.62180000e+01, 4.87507506e+01, 1.86125530e+02, 6.30327957e+02,
         1.00830553e+02, 1.80563410e+01, 3.39755126e+00, 6.13134704e+01,
         1.67343099e+01, 6.14353986e+02, 1.95153700e+01, 1.10283418e+01,
         1.26243689e+01, 4.62368367e+00, 2.43240010e+01, 5.43813698e+01,
         2.40294553e+01, 7.63469497e+02, 1.63171509e+02, 1.67274600e+01,
         1.33436086e+01, 2.53446364e-01, 1.36998035e-02, 1.33573084e+00,
         1.04050007e+02]]))

In [78]:
chi2_contingency(pd.crosstab(X['target'], X['spn-fmi']))

(66306.83731219656,
 0.0,
 883,
 array([[6.23698262e+02, 1.19178012e+01, 6.85273568e+01, ...,
         4.11958661e+03, 1.88698519e+01, 1.39041014e+01],
        [4.30173829e+00, 8.21988208e-02, 4.72643219e-01, ...,
         2.84133924e+01, 1.30148133e-01, 9.58986242e-02]]))

Here let's look at those ANOVA effect sizes.

In [79]:
# Stolen from https://stats.stackexchange.com/questions/67926/understanding-the-one-way-anova-effect-size-in-scipy#answer-138166

import scipy.special as special
def FPvalue( *args):
    """ Return F and p value

    """
    df_btwn, df_within = __degree_of_freedom_( *args)

    mss_btwn = __ss_between_( *args) / float( df_btwn)   
    mss_within = __ss_within_( *args) / float( df_within)

    F = mss_btwn / mss_within    
    P = special.fdtrc( df_btwn, df_within, F)

    return( F, P)

def EffectSize( *args):
    """ Return the eta squared as the effect size for ANOVA

    """    
    return( float( __ss_between_( *args) / __ss_total_( *args)))

def __concentrate_( *args):
    """ Concentrate input list-like arrays

    """
    v = list( map( np.asarray, args))
    vec = np.hstack( np.concatenate( v))
    return( vec)

def __ss_total_( *args):
    """ Return total of sum of square

    """
    vec = __concentrate_( *args)
    ss_total = sum( (vec - np.mean( vec)) **2)
    return( ss_total)

def __ss_between_( *args):
    """ Return between-subject sum of squares

    """    
    # grand mean
    grand_mean = np.mean( __concentrate_( *args))

    ss_btwn = 0
    for a in args:
        ss_btwn += ( len(a) * ( np.mean( a) - grand_mean) **2)

    return( ss_btwn)

def __ss_within_( *args):
    """Return within-subject sum of squares

    """
    return( __ss_total_( *args) - __ss_between_( *args))

def __degree_of_freedom_( *args):
    """Return degree of freedom

       Output-
              Between-subject dof, within-subject dof
    """   
    args = list( map( np.asarray, args))
    # number of groups minus 1
    df_btwn = len( args) - 1

    # total number of samples minus number of groups
    df_within = len( __concentrate_( *args)) - df_btwn - 1

    return( df_btwn, df_within)

In [113]:
X['howsMyDriving'] = (X['FuelLtd'] / (1+X['EngineTimeLtd'])) *   X['DistanceLtd']
X['howsMyDriving'].isna().sum()
EffectSize(*[X[X['target'] == bool]['howsMyDriving'] for bool in [True, False]])

In [116]:
es = []
f_1way = []
for col in list(numcols):
    print(col)
    es.append(EffectSize(*[X[X['target'] == bool][col] for bool in [True, False]]))
    f_1way.append(f_oneway(*[X[X['target'] == bool][col] for bool in [True, False]]))


activeTransitionCount
Latitude
Longitude
AcceleratorPedal
BarometricPressure
CruiseControlActive
CruiseControlSetSpeed
DistanceLtd
EngineCoolantTemperature
EngineLoad
EngineOilPressure
EngineOilTemperature
EngineRpm
EngineTimeLtd
FuelLevel
FuelLtd
FuelRate
IgnStatus
IntakeManifoldTemperature
Speed
TurboBoostPressure
timeTillLast
target



Each of the input arrays is constant;the F statistic is not defined or infinite



In [117]:
numcol_effects = pd.DataFrame({
    'variable':list(numcols),
    'effect_size':es,
    'f_oneway':f_1way
})

In [118]:
numcol_effects.sort_values('effect_size', ascending = False)

Unnamed: 0,variable,effect_size,f_oneway
22,target,1.0,"(inf, 0.0)"
21,timeTillLast,0.008223193,"(3581.6666887324945, 0.0)"
0,activeTransitionCount,0.004016613,"(1742.0735134586241, 0.0)"
6,CruiseControlSetSpeed,0.0006882601,"(297.51593069150255, 1.2060369005872717e-66)"
12,EngineRpm,0.000170394,"(73.61848330309188, 9.507880727315533e-18)"
10,EngineOilPressure,0.0001271138,"(54.91694636544535, 1.2595978402941112e-13)"
15,FuelLtd,0.0001129037,"(48.777085096165145, 2.871819861860748e-12)"
13,EngineTimeLtd,8.618223e-05,"(37.23177739398284, 1.0497920774442798e-09)"
4,BarometricPressure,7.439377e-05,"(32.138640319681784, 1.4364470548206347e-08)"
8,EngineCoolantTemperature,7.095764e-05,"(30.654101826040183, 3.085525417155592e-08)"


In [96]:
numcol_effects['effect_size'].sum()

1.0139483304684382

## Check out Chris H's weather data

In [81]:
cjh_faults = pd.read_csv('../data/CJH_big_G.csv')


Columns (0,33) have mixed types.Specify dtype option on import or set low_memory=False.



In [85]:
full48.dtypes[:20]

RecordID                          int64
ESS_Id                            int64
EventTimeStamp           datetime64[ns]
eventDescription                 object
actionDescription               float64
ecuSoftwareVersion               object
ecuSerialNumber                  object
ecuModel                         object
ecuMake                          object
ecuSource                         int64
spn                               int64
fmi                               int64
active                             bool
activeTransitionCount             int64
faultValue                      float64
EquipmentID                      object
MCTNumber                         int64
Latitude                        float64
Longitude                       float64
LocationTimeStamp        datetime64[ns]
dtype: object

In [121]:
cjh_faults.dtypes[:20]

equipment_id                   object
event_time_stamp               object
latitude                      float64
longitude                     float64
location_time_stamp            object
derate_status                  object
accelerator_pedal             float64
barometric_pressure           float64
cruise_control_active            bool
cruise_control_set_speed      float64
ltd_distance                  float64
engine_coolant_temperature    float64
engine_load                   float64
engine_oil_pressure           float64
engine_oil_temperature        float64
engine_rpm                    float64
ltd_engine_time               float64
fuel_level                    float64
ltd_fuel                      float64
fuel_rate                     float64
dtype: object

In [122]:
cjh_faults.dtypes[20:39]

fuel_temperature               float64
ignition_status                   bool
intake_manifold_temperature    float64
lamp_status                      int64
parking_brake                     bool
service_distance               float64
speed                          float64
switched_battery_voltage       float64
throttle                       float64
turbo_boost_pressure           float64
ess_id                           int64
event_description               object
ecu_software_version            object
ecu_serial_number               object
ecu_model                       object
ecu_make                        object
ecu_source                       int64
spn                              int64
fmi                              int64
dtype: object