In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [2]:
bigg = pd.read_csv('./data/CJH_big_g_express_dataset_042622_817AM.csv', low_memory = False)

In [3]:
bigg['event_time_stamp'] = pd.to_datetime(bigg['event_time_stamp'])

bigg = bigg.loc[(bigg['event_time_stamp'].dt.year > 2014) & (bigg['event_time_stamp'].dt.year <= 2020)]

In [4]:
def create_groups(grp, df, offset_grp = None):
    df['full_derate'] = False
    
    if offset_grp is not None:
        df = df[df[f'use_for_first_tow_hours_id_group_{offset_grp}'] == False]
        
    df['full_derate'] = (
        np.where(df[f'hours_id_group_{grp}']\
                 .isin(list(df[f'hours_id_group_{grp}']\
                            .loc[(df[f'use_for_first_tow_hours_id_group_{grp}'] == True)])) &
                 (df[f'use_for_first_tow_hours_id_group_{grp}'] == True),
                 True,
                 False)
        )
    
    
    mask = df.loc[(df[f'hours_id_group_{grp}']\
                    .isin(list(df[f'hours_id_group_{grp}']\
                               .loc[(df[f'use_for_first_tow_hours_id_group_{grp}'] == True)]))) &
                    (df['full_derate'] == False)]
    
    row_ls = [i for i in df.index if i not in mask.index]
    
    df = df.loc[row_ls]
    
    grp_col = f'hours_id_group_{grp}'
    
    return df, grp_col

In [5]:
bigg_train, grp_col_train = create_groups('5D', bigg)

In [6]:
bigg_test, grp_col_test = create_groups('5D', bigg, '1H')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['full_derate'] = (


In [7]:
def create_X_and_y_from_bigg(df, grp_col, training = False):    
    df['common_derate_lamp_status'] = (
        np.where((df['lamp_status'] == 22527) | (df['lamp_status'] == 18431), True, False)
    )

    df['fuel_temperature_over_32'] = (
        np.where(df['fuel_temperature'] > 32, True, False)
    )

    df['accelerator_pedal_over_0'] = (
        np.where(df['accelerator_pedal'] > 0, True, False)
    )

    df['cruise_control_set_speed_under_66'] = (
        np.where(df['cruise_control_set_speed'] < 66, True, False)
    )

    df['switched_battery_voltage_less_than_3276.75'] = (
        np.where(df['switched_battery_voltage'] < 3276.75, True, False)

    )


    cat_var = [
        'common_derate_lamp_status',
        'ecu_model',
        'ecu_make',
        'error_category',
        'month',
        'fuel_temperature_over_32',
        'accelerator_pedal_over_0',
        'cruise_control_set_speed_under_66',
        'switched_battery_voltage_less_than_3276.75'
    ]

    normalizers = ['ltd_distance',
                 'ltd_engine_time',
                 'ltd_fuel']

    var = ['barometric_pressure',
           'ltd_distance',
           'engine_coolant_temperature',
           'engine_load',
           'engine_oil_pressure',
           'engine_oil_temperature',
           'engine_rpm',
           'ltd_engine_time',
           'fuel_level',
           'ltd_fuel',
           'fuel_rate',
           'intake_manifold_temperature',
           'speed',
           'turbo_boost_pressure',
           'error_duration_(minutes)'
          ]


    ohe_col_ls = []
    for i in cat_var:
        ohe = OneHotEncoder()
        transformed = ohe.fit_transform(df[[i]])
        df[ohe.get_feature_names_out()] = transformed.toarray()
        ohe_col_ls += [i for i in ohe.get_feature_names_out()]

    def log_me(col):
        df[col] = (
            np.where((df[col].notnull()) & 
                     (np.log(df[col]) != -np.inf) &
                     (np.log(df[col]) != np.inf), 
                     np.log(df[col]), np.nan)
        )

        return df[col]

    log_cols = ['engine_oil_temperature',
                'ltd_engine_time',
                'fuel_rate',
                'turbo_boost_pressure',
                'speed',
                'error_duration_(minutes)']

    df['barometric_pressure'] = np.exp(df['barometric_pressure'])

    for i in log_cols:
        df[i] = log_me(i)

    cat_vars = df.groupby(grp_col)[ohe_col_ls].sum().reset_index()

    mean_vars = df.groupby(grp_col)[var].mean().reset_index().fillna(0)
    std_vars = df.groupby(grp_col)[var].std().reset_index().fillna(0)

    target_values = df[[grp_col, 'full_derate']].drop_duplicates()

    all_vars = pd.merge(mean_vars, std_vars, on = grp_col).merge(cat_vars, on = grp_col)
    
    if training == True:
        all_groups = pd.merge(all_vars, target_values, on = grp_col)
        all_groups_true = all_groups.loc[all_groups['full_derate'] == True]
        all_groups_resampled_false = all_groups.loc[all_groups['full_derate'] == False].sample(frac = 0.8)
        all_groups = pd.concat([all_groups_true, all_groups_resampled_false])
        X = all_groups.drop(columns = [grp_col,'full_derate'])
        y = all_groups['full_derate']
    else:
        X = all_vars.drop(columns = grp_col)
        y = target_values['full_derate']

    return X, y

In [8]:
train_X, train_y = create_X_and_y_from_bigg(bigg_train, grp_col_train, training = True)
test_X, test_y = create_X_and_y_from_bigg(bigg_test, grp_col_test, training = False)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [9]:
X_train, _, y_train, _ = train_test_split(train_X, train_y,
                                       test_size = 0.30, 
                                       train_size = 0.70, 
                                       random_state = 111,
                                       stratify = train_y)

_, X_test, _, y_test = train_test_split(test_X, test_y, 
                                      test_size = 0.30, 
                                      train_size = 0.70, 
                                      random_state = 111, 
                                      stratify = test_y)

In [10]:
pipe = Pipeline(
    steps = [
        ('smt', SMOTE(n_jobs = -1)),
        ('scale', StandardScaler()),
        ('boost', XGBClassifier(verbosity = 2, n_jobs = -1))
    ]
)

In [11]:
cv = cross_val_score(estimator = pipe,
                     X = X_train,
                     y = y_train, 
                     cv = 5,
                     n_jobs = -1,
                     scoring = 'f1', 
                     verbose = 2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   53.8s remaining:   35.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   54.3s finished


In [12]:
cv

array([0.58823529, 0.58333333, 0.54961832, 0.49180328, 0.57971014])

In [13]:
pipe.fit(X_train, y_train)



[08:44:15] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 64 extra nodes, 0 pruned nodes, max_depth=6
[08:44:15] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 64 extra nodes, 0 pruned nodes, max_depth=6
[08:44:16] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 66 extra nodes, 0 pruned nodes, max_depth=6
[08:44:16] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 62 extra nodes, 0 pruned nodes, max_depth=6
[08:44:16] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 74 extra nodes, 0 pruned nodes, max_depth=6
[08:44:16] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 70 extra nodes, 0 pruned nodes, max_depth=6
[08:44:16] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 80 extra nodes, 0 pruned nodes, max_depth=6
[08:44:16] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 70 extra nodes, 0 pruned nodes, max_depth=6
[08:44:16] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 66 extra nodes, 0 pruned no

[08:44:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 40 extra nodes, 0 pruned nodes, max_depth=6
[08:44:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 38 extra nodes, 0 pruned nodes, max_depth=6
[08:44:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 30 extra nodes, 0 pruned nodes, max_depth=6
[08:44:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 44 extra nodes, 0 pruned nodes, max_depth=6
[08:44:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 66 extra nodes, 0 pruned nodes, max_depth=6
[08:44:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 44 extra nodes, 0 pruned nodes, max_depth=6
[08:44:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 42 extra nodes, 0 pruned nodes, max_depth=6
[08:44:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 56 extra nodes, 0 pruned nodes, max_depth=6
[08:44:27] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 32 extra nodes, 0 pruned no

Pipeline(steps=[('smt', SMOTE(n_jobs=-1)), ('scale', StandardScaler()),
                ('boost',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=-1, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                               ve

In [14]:
print(classification_report(y_test, pipe.predict(X_test)))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00     22039
        True       0.58      0.44      0.50       103

    accuracy                           1.00     22142
   macro avg       0.79      0.72      0.75     22142
weighted avg       1.00      1.00      1.00     22142



In [15]:
print(confusion_matrix(y_test, pipe.predict(X_test)))

[[22006    33]
 [   58    45]]


In [16]:
importances = pd.DataFrame({
    'variable': X_train.columns,
    'importance': pipe['boost'].feature_importances_
})

importances.sort_values('importance', ascending = False).head(10)

Unnamed: 0,variable,importance
68,error_category_SCR-Related,0.405179
88,switched_battery_voltage_less_than_3276.75_True,0.15562
31,common_derate_lamp_status_True,0.061178
7,ltd_engine_time_x,0.040818
6,engine_rpm_x,0.027323
78,month_November,0.017709
82,fuel_temperature_over_32_True,0.017597
65,ecu_make_Unknown,0.016212
61,ecu_make_BNDWS,0.014563
16,ltd_distance_y,0.014247


In [17]:
import pickle

with open('./data/boost_42622.pkl', 'wb') as file:
    pickle.dump(pipe, file = file)