In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier
from imblearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline

In [2]:
bigg = pd.read_csv('./data/CJH_big_g_express_dataset_041722_1054AM.csv', low_memory = False)

In [3]:
bigg['event_time_stamp'] = pd.to_datetime(bigg['event_time_stamp'])

bigg = bigg.loc[(bigg['event_time_stamp'].dt.year > 2014) & (bigg['event_time_stamp'].dt.year <= 2020)]

In [4]:
def create_groups(grp, df):
    df['full_derate'] = False
    df['full_derate'] = (
        np.where(df[f'hours_id_group_{grp}']\
                    .isin(list(df[f'hours_id_group_{grp}']\
                               .loc[(df[f'use_for_first_tow_hours_id_group_{grp}'] == True)])) &
                 (df[f'use_for_first_tow_hours_id_group_{grp}'] == True),
                 True,
                 False)
        )
    
    
    mask = df.loc[(df[f'hours_id_group_{grp}']\
                    .isin(list(df[f'hours_id_group_{grp}']\
                               .loc[(df[f'use_for_first_tow_hours_id_group_{grp}'] == True)]))) &
                    (df[f'use_for_first_tow_hours_id_group_{grp}'] == False)]
    
    row_ls = [i for i in df.index if i not in mask.index]
    
    df = df.loc[row_ls]
    
    grp_col = f'hours_id_group_{grp}'
    
    return df, grp_col

In [5]:
bigg, grp_col = create_groups('14H', bigg)

In [6]:
bigg['common_derate_lamp_status'] = (
    np.where((bigg['lamp_status'] == 22527) | (bigg['lamp_status'] == 18431), True, False)
)

In [7]:
cat_var = ['common_derate_lamp_status',
           'ecu_model',
           'ecu_make',
           'error_category',
           'month']

# normalizers = ['ltd_distance',
#              'ltd_engine_time',
#              'ltd_fuel']

var = [
    'barometric_pressure',
    'ltd_distance',
    'engine_coolant_temperature',
    'engine_load',
    'engine_oil_pressure',
    'engine_oil_temperature',
    'engine_rpm',
    'ltd_engine_time',
    'fuel_level',
    'ltd_fuel',
    'fuel_rate',
    'intake_manifold_temperature',
    'speed',
    'turbo_boost_pressure',
    'error_duration_(minutes)']

In [8]:
#        'daily_avg_air_temp_for_location_(C)',
#        'daily_min_air_temp_for_location_(C)',
#        'daily_max_air_temp_for_location_(C)',
#        'daily_precipitation_total_(mm)',
#        'daily_snow_depth_(mm)',
#        'daily_avg_wind_direction_(deg)',
#        'daily_avg_wind_speed_(km/h)',
#        'daily_peak_wind_gust_(km/h)',
#     'daily_avg_sea_level_air_pressure_(hPa)',
#        'daily_sunshine_total_(minutes)'

In [9]:
ohe_col_ls = []
for i in cat_var:
    ohe = OneHotEncoder()
    transformed = ohe.fit_transform(bigg[[i]])
    bigg[ohe.get_feature_names_out()] = transformed.toarray()
    ohe_col_ls += [i for i in ohe.get_feature_names_out()]

In [10]:
def log_me(col):
    bigg[col] = (
        np.where((bigg[col].notnull()) & 
                 (np.log(bigg[col]) != -np.inf) &
                 (np.log(bigg[col]) != np.inf), 
                 np.log(bigg[col]), np.nan)
    )
    
    return bigg[col]

log_cols = ['engine_oil_temperature',
            'ltd_engine_time',
            'fuel_rate',
            'turbo_boost_pressure',
#             'daily_precipitation_total_(mm)',
#             'daily_snow_depth_(mm)',
#             'daily_peak_wind_gust_(km/h)',
#             'daily_sunshine_total_(minutes)',
            'speed',
            'error_duration_(minutes)']

bigg['barometric_pressure'] = np.exp(bigg['barometric_pressure'])
# bigg['daily_sunshine_total_(minutes)'] = np.exp(bigg['daily_sunshine_total_(minutes)'])

In [11]:
# def normalize_me(col, normalizer):
#     if col != normalizer:
#         bigg[col] = (
#             np.where((bigg[col].notnull()) & 
#                      (bigg[col]/bigg[normalizer] != -np.inf) &
#                      (bigg[col]/bigg[normalizer] != np.inf), 
#                      bigg[col]/bigg[normalizer] * 1000, 
#                      np.nan))
    
#     return bigg[col]

# normalized_var_ls = []

# for i in var:
#     for j in normalizers:
#         col_name = f'{i}_{j}'
#         bigg[col_name] = normalize_me(i, j)
#         normalized_var_ls.append(col_name)

In [12]:
cat_vars = bigg.groupby(grp_col)[ohe_col_ls].sum().reset_index()

In [13]:
mean_vars = bigg.groupby(grp_col)[var].mean().reset_index().fillna(0)
std_vars = bigg.groupby(grp_col)[var].std().reset_index().fillna(0)

In [14]:
target_values = bigg[[grp_col, 'full_derate']].drop_duplicates()

In [15]:
all_vars = pd.merge(mean_vars, std_vars, on = grp_col).merge(cat_vars, on = grp_col)

In [16]:
X = all_vars.drop(columns = grp_col)
y = target_values['full_derate']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.20, 
                                                    train_size = 0.80, 
                                                    random_state = 100, 
                                                    stratify = y)

In [17]:
pipe = Pipeline(
    steps = [
        ('scale', StandardScaler()),
        ('logistic', LogisticRegression(class_weight = {0:1, 1:50},
                                        max_iter = 1000, 
                                        n_jobs = -1, 
                                        verbose = 2))
    ]
)


cv = cross_val_score(estimator = pipe,
                     X = X_train,
                     y = y_train, 
                     cv = 5,
                     n_jobs = -1,
                     scoring = 'f1_macro', 
                     verbose = 2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   23.1s remaining:   15.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   24.6s finished


In [18]:
cv

array([0.57034332, 0.56155453, 0.57914844, 0.55124918, 0.5768975 ])

In [19]:
pipe.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.8min finished


Pipeline(steps=[('scale', StandardScaler()),
                ('logistic',
                 LogisticRegression(class_weight={0: 1, 1: 50}, max_iter=1000,
                                    n_jobs=-1, verbose=2))])

In [20]:
print(classification_report(y_test, pipe.predict(X_test)))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99     41685
        True       0.10      0.68      0.17       137

    accuracy                           0.98     41822
   macro avg       0.55      0.83      0.58     41822
weighted avg       1.00      0.98      0.99     41822



In [21]:
print(confusion_matrix(y_test, pipe.predict(X_test)))

[[40821   864]
 [   44    93]]
