In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [2]:
bigg = pd.read_csv('./data/CJH_big_g_express_dataset_042322_420PM.csv', low_memory = False)

In [3]:
bigg['event_time_stamp'] = pd.to_datetime(bigg['event_time_stamp'])

bigg = bigg.loc[(bigg['event_time_stamp'].dt.year > 2014) & (bigg['event_time_stamp'].dt.year <= 2020)]

In [4]:
def create_groups(grp, df, offset_grp = None):
    df['full_derate'] = False
    
    if offset_grp is not None:
        df = df[df[f'use_for_first_tow_hours_id_group_{offset_grp}'] == False]
        
    df['full_derate'] = (
        np.where(df[f'hours_id_group_{grp}']\
                 .isin(list(df[f'hours_id_group_{grp}']\
                            .loc[(df[f'use_for_first_tow_hours_id_group_{grp}'] == True)])) &
                 (df[f'use_for_first_tow_hours_id_group_{grp}'] == True),
                 True,
                 False)
        )
    
    
    mask = df.loc[(df[f'hours_id_group_{grp}']\
                    .isin(list(df[f'hours_id_group_{grp}']\
                               .loc[(df[f'use_for_first_tow_hours_id_group_{grp}'] == True)]))) &
                    (df['full_derate'] == False)]
    
    row_ls = [i for i in df.index if i not in mask.index]
    
    df = df.loc[row_ls]
    
    grp_col = f'hours_id_group_{grp}'
    
    return df, grp_col

In [5]:
bigg, grp_col = create_groups('48H', bigg)

In [6]:
bigg['common_derate_lamp_status'] = (
    np.where((bigg['lamp_status'] == 22527) | (bigg['lamp_status'] == 18431), True, False)
)

bigg['fuel_temperature_over_32'] = (
    np.where(bigg['fuel_temperature'] > 32, True, False)
)

bigg['accelerator_pedal_over_0'] = (
    np.where(bigg['accelerator_pedal'] > 0, True, False)
)

bigg['cruise_control_set_speed_under_66'] = (
    np.where(bigg['cruise_control_set_speed'] < 66, True, False)
)

bigg['switched_battery_voltage_less_than_3276.75'] = (
    np.where(bigg['switched_battery_voltage'] < 3276.75, True, False)

)

bigg['event_description_SCR_related'] = (
    np.where(bigg['event_description'].str.lower().str.contains('catalyst|aftertreatment|nox|derate'), True, False)
)

In [7]:
cat_var = [
    'common_derate_lamp_status',
    'ecu_model',
    'ecu_make',
    'error_category',
    'month',
    'maintenance_before',
    'fuel_temperature_over_32',
    'accelerator_pedal_over_0',
    'cruise_control_set_speed_under_66',
    'switched_battery_voltage_less_than_3276.75',
    'event_description_SCR_related']

normalizers = ['ltd_distance',
             'ltd_engine_time',
             'ltd_fuel']

var = ['barometric_pressure',
       'ltd_distance',
       'engine_coolant_temperature',
       'engine_load',
       'engine_oil_pressure',
       'engine_oil_temperature',
       'engine_rpm',
       'ltd_engine_time',
       'fuel_level',
       'ltd_fuel',
       'fuel_rate',
       'intake_manifold_temperature',
       'speed',
       'turbo_boost_pressure',
       'error_duration_(minutes)'
      ]

In [8]:
ohe_col_ls = []
for i in cat_var:
    ohe = OneHotEncoder()
    transformed = ohe.fit_transform(bigg[[i]])
    bigg[ohe.get_feature_names_out()] = transformed.toarray()
    ohe_col_ls += [i for i in ohe.get_feature_names_out()]

In [9]:
def log_me(col):
    bigg[col] = (
        np.where((bigg[col].notnull()) & 
                 (np.log(bigg[col]) != -np.inf) &
                 (np.log(bigg[col]) != np.inf), 
                 np.log(bigg[col]), np.nan)
    )
    
    return bigg[col]

log_cols = ['engine_oil_temperature',
            'ltd_engine_time',
            'fuel_rate',
            'turbo_boost_pressure',
            'speed',
            'error_duration_(minutes)']

bigg['barometric_pressure'] = np.exp(bigg['barometric_pressure'])

for i in log_cols:
    bigg[i] = log_me(i)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
# def normalize_me(col, normalizer):
#     if col != normalizer:
#         bigg[col] = (
#             np.where((bigg[col].notnull()) & 
#                      (bigg[col]/bigg[normalizer] != -np.inf) &
#                      (bigg[col]/bigg[normalizer] != np.inf), 
#                      bigg[col]/bigg[normalizer] * 100000, 
#                      np.nan))
    
#     return bigg[col]

# normalized_var_ls = []

# for i in var:
#     for j in normalizers:
#         col_name = f'{i}_{j}'
#         bigg[col_name] = normalize_me(i, j)
#         normalized_var_ls.append(col_name)

In [11]:
cat_vars = bigg.groupby(grp_col)[ohe_col_ls].sum().reset_index()

In [12]:
mean_vars = bigg.groupby(grp_col)[var].mean().reset_index().fillna(0)
std_vars = bigg.groupby(grp_col)[var].std().reset_index().fillna(0)

In [13]:
target_values = bigg[[grp_col, 'full_derate']].drop_duplicates()

In [14]:
all_vars = pd.merge(mean_vars, std_vars, on = grp_col).merge(cat_vars, on = grp_col)

In [15]:
X = all_vars.drop(columns = grp_col)
y = target_values['full_derate']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.30, 
                                                    train_size = 0.70, 
                                                    random_state = 111, 
                                                    stratify = y)

In [16]:
pipe = Pipeline(
    steps = [
        ('smt', SMOTE(n_jobs = -1)),
        ('scale', StandardScaler()),
        ('boost', XGBClassifier(verbosity = 2))
    ]
)

In [17]:
cv = cross_val_score(estimator = pipe,
                     X = X_train,
                     y = y_train, 
                     cv = 5,
                     n_jobs = -1,
                     scoring = 'f1', 
                     verbose = 2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.8min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min finished


In [18]:
cv

array([0.52173913, 0.35971223, 0.4962406 , 0.47619048, 0.5248227 ])

In [19]:
pipe.fit(X_train, y_train)



[19:22:36] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 82 extra nodes, 0 pruned nodes, max_depth=6
[19:22:37] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 86 extra nodes, 0 pruned nodes, max_depth=6
[19:22:37] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 80 extra nodes, 0 pruned nodes, max_depth=6
[19:22:37] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 88 extra nodes, 0 pruned nodes, max_depth=6
[19:22:37] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 86 extra nodes, 0 pruned nodes, max_depth=6
[19:22:38] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 82 extra nodes, 0 pruned nodes, max_depth=6
[19:22:38] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 88 extra nodes, 0 pruned nodes, max_depth=6
[19:22:38] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 78 extra nodes, 0 pruned nodes, max_depth=6
[19:22:39] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 76 extra nodes, 0 pruned no

[19:22:55] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 64 extra nodes, 0 pruned nodes, max_depth=6
[19:22:55] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 38 extra nodes, 0 pruned nodes, max_depth=6
[19:22:55] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 50 extra nodes, 0 pruned nodes, max_depth=6
[19:22:56] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 58 extra nodes, 0 pruned nodes, max_depth=6
[19:22:56] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 50 extra nodes, 0 pruned nodes, max_depth=6
[19:22:56] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 32 extra nodes, 0 pruned nodes, max_depth=6
[19:22:56] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 70 extra nodes, 0 pruned nodes, max_depth=6
[19:22:57] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 50 extra nodes, 0 pruned nodes, max_depth=6
[19:22:57] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 84 extra nodes, 0 pruned no

Pipeline(steps=[('smt', SMOTE(n_jobs=-1)), ('scale', StandardScaler()),
                ('boost',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=12, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                               ve

In [20]:
print(classification_report(y_test, pipe.predict(X_test)))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00     35526
        True       0.61      0.50      0.55       165

    accuracy                           1.00     35691
   macro avg       0.80      0.75      0.77     35691
weighted avg       1.00      1.00      1.00     35691



In [21]:
print(confusion_matrix(y_test, pipe.predict(X_test)))

[[35473    53]
 [   82    83]]


In [22]:
importances = pd.DataFrame({
    'variable': X_train.columns,
    'importance': pipe['boost'].feature_importances_
})

importances.sort_values('importance', ascending = False).head(10)

Unnamed: 0,variable,importance
69,error_category_SCR-Related,0.462671
87,accelerator_pedal_over_0_True,0.063697
85,fuel_temperature_over_32_True,0.039064
31,common_derate_lamp_status_True,0.034841
66,ecu_make_Unknown,0.032636
91,switched_battery_voltage_less_than_3276.75_True,0.027404
46,ecu_model_EC60-adv,0.02502
1,ltd_distance_x,0.023595
83,maintenance_before_True,0.021423
7,ltd_engine_time_x,0.019931


In [23]:
import pickle

with open('./data/boost_42422.pkl', 'wb') as file:
    pickle.dump(pipe, file = file)