In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [None]:
# Load the raw event export; low_memory=False lets pandas infer each column's
# dtype from the whole file rather than chunk by chunk.
bigg = pd.read_csv(
    './data/CJH_big_g_express_dataset_042322_1010AM.csv',
    low_memory = False,
)

In [None]:
# Parse the event timestamps, then keep only events dated 2015 through 2020
# (both bounds inclusive).
bigg['event_time_stamp'] = pd.to_datetime(bigg['event_time_stamp'])

bigg = bigg.loc[lambda frame: frame['event_time_stamp'].dt.year.between(2015, 2020)]

In [None]:
def create_groups(grp, df, offset_grp = None):
    """Label and filter the rows of ``df`` for one ``hours_id_group_<grp>`` grouping.

    A row is "flagged" when ``use_for_first_tow_hours_id_group_<grp>`` is True.
    Every group id that owns at least one flagged row is reduced to its flagged
    rows only; groups without any flagged row are kept untouched. The returned
    frame carries a boolean ``full_derate`` column that is True exactly on the
    flagged rows.

    Parameters
    ----------
    grp : str
        Suffix selecting the ``hours_id_group_<grp>`` and
        ``use_for_first_tow_hours_id_group_<grp>`` columns.
    df : pandas.DataFrame
        Source rows. The frame is not modified; a new frame is returned.
    offset_grp : str, optional
        When given, rows whose ``use_for_first_tow_hours_id_group_<offset_grp>``
        is not exactly False are removed before anything else happens.

    Returns
    -------
    (pandas.DataFrame, str)
        The filtered, labelled frame and the name of the group-id column.
    """
    grp_col = f'hours_id_group_{grp}'
    first_tow_col = f'use_for_first_tow_hours_id_group_{grp}'

    if offset_grp is not None:
        offset_col = f'use_for_first_tow_hours_id_group_{offset_grp}'
        # `== False` rather than `~`: rows with a missing flag compare unequal
        # to False and are therefore dropped as well.
        df = df[df[offset_col] == False]

    is_flagged = df[first_tow_col] == True
    in_flagged_group = df[grp_col].isin(df.loc[is_flagged, grp_col])

    # Keep flagged rows plus every row of a group that has no flagged row.
    # A boolean mask (instead of a rebuilt label list) stays vectorised and
    # cannot duplicate rows when index labels repeat.
    keep = is_flagged | ~in_flagged_group
    df = df.loc[keep].assign(full_derate = is_flagged[keep].to_numpy())

    return df, grp_col

In [None]:
# Build the '48H' grouping, first dropping rows flagged for the '1H' grouping.
bigg, grp_col = create_groups(grp = '48H', df = bigg, offset_grp = '1H')

In [None]:
# True where lamp_status equals one of the two status codes of interest.
bigg['common_derate_lamp_status'] = bigg['lamp_status'].isin([22527, 18431])

In [None]:
# Categorical columns: one-hot encoded and summed per group further down.
cat_var = [
    'common_derate_lamp_status',
    'ecu_model',
    'ecu_make',
    'error_category',
    'month',
    'maintenance_before',
]

# Numeric columns: aggregated per group as mean and standard deviation.
var = [
    'barometric_pressure',
    'ltd_distance',
    'engine_coolant_temperature',
    'engine_load',
    'engine_oil_pressure',
    'engine_oil_temperature',
    'engine_rpm',
    'ltd_engine_time',
    'fuel_level',
    'ltd_fuel',
    'fuel_rate',
    'intake_manifold_temperature',
    'speed',
    'turbo_boost_pressure',
    'error_duration_(minutes)',
]

In [None]:
# One-hot encode each categorical column on its own, append the resulting
# dummy columns to `bigg`, and remember their names in `ohe_col_ls`.
ohe_col_ls = []
for cat_col in cat_var:
    ohe = OneHotEncoder()
    transformed = ohe.fit_transform(bigg[[cat_col]])
    dummy_names = ohe.get_feature_names_out()
    bigg[dummy_names] = transformed.toarray()
    ohe_col_ls.extend(dummy_names)

In [None]:
def log_me(col):
    """Replace ``bigg[col]`` with its natural log, using NaN for non-finite results.

    Nulls, zeros (log = -inf), negatives (log = NaN) and infinities all end up
    as NaN; every other value becomes ``np.log(value)``.

    Parameters
    ----------
    col : str
        Name of the column of the module-level ``bigg`` frame to transform.
        The column is overwritten in place.

    Returns
    -------
    pandas.Series
        The transformed ``bigg[col]`` column.
    """
    # Take the log once. Zeros and negatives are expected input, so the
    # divide-by-zero / invalid-value RuntimeWarnings they trigger are silenced.
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        logged = np.log(bigg[col])

    bigg[col] = np.where(np.isfinite(logged), logged, np.nan)

    return bigg[col]

# Columns earmarked for log_me().
# NOTE(review): no visible cell actually applies log_me to these columns;
# confirm whether the log transform was meant to run.
log_cols = ['engine_oil_temperature',
            'ltd_engine_time',
            'fuel_rate',
            'turbo_boost_pressure',
            'speed',
            'error_duration_(minutes)']

# Exponentiate barometric_pressure, overwriting the column in place.
# NOTE(review): not idempotent; re-running this cell exponentiates the
# already-transformed values again.
bigg['barometric_pressure'] = np.exp(bigg['barometric_pressure'])

In [None]:
# Per-group totals of every one-hot dummy column, with the group id kept as a
# regular column.
cat_vars = bigg.groupby(grp_col, as_index = False)[ohe_col_ls].sum()

In [None]:
# Per-group mean and standard deviation of the numeric `var` columns.
# Missing results (e.g. the std of a single-row group) are replaced by 0.
numeric_by_group = bigg.groupby(grp_col)[var]
mean_vars = numeric_by_group.mean().reset_index().fillna(0)
std_vars = numeric_by_group.std().reset_index().fillna(0)

In [None]:
# One row per distinct (group id, full_derate) combination.
target_values = bigg.loc[:, [grp_col, 'full_derate']].drop_duplicates()

In [None]:
# Join the per-group mean, std and category-count features on the group id.
# Explicit suffixes replace pandas' default `_x` / `_y`, so the mean and std
# version of each numeric column stay distinguishable (e.g. in the feature
# importance table).
all_vars = (
    pd.merge(mean_vars, std_vars, on = grp_col, suffixes = ('_mean', '_std'))
    .merge(cat_vars, on = grp_col)
)

In [None]:
# X = cat_vars.drop(columns = grp_col)
X = all_vars.drop(columns = grp_col)

# Align the labels to X by group id instead of by position: `all_vars` is in
# sorted groupby order (and omits NaN group ids), whereas `target_values` is in
# first-appearance order, so taking its column as-is can pair features with
# another group's label.
y = pd.Series(
    target_values.groupby(grp_col)['full_derate'].any()
                 .reindex(all_vars[grp_col])
                 .to_numpy(),
    index = X.index,
    name = 'full_derate',
)
assert y.notna().all(), 'every feature row must have a label'

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.20, 
                                                    train_size = 0.80, 
                                                    random_state = 36, 
                                                    stratify = y)

In [None]:
# Only the SMOTE sampling ratio is tuned. Rounding strips the float noise that
# np.arange produces (e.g. 0.30000000000000004) so best_params_ reads cleanly.
param_grid = {'smt__sampling_strategy': np.round(np.arange(0.1, 1, 0.1), 1).tolist()}


# Both stochastic steps are seeded (same seed as the train/test split) so the
# search is reproducible on Restart & Run All. SMOTE's `n_jobs` argument is
# deprecated in recent imbalanced-learn releases and is therefore not passed.
# NOTE(review): SMOTE runs before scaling, so its neighbour search operates on
# unscaled features; confirm this step order is intended.
pipe = Pipeline(
    steps = [
        ('smt', SMOTE(random_state = 36)),
        ('scale', StandardScaler()),
        ('rf', RandomForestClassifier(n_estimators = 100, random_state = 36, verbose = 2))
    ]
)

# 5-fold cross-validated search scored on F1.
gs = GridSearchCV(estimator = pipe, 
                  param_grid = param_grid,
                  scoring = 'f1',
                  cv = 5)

# cv = cross_val_score(estimator = pipe,
#                      X = X_train,
#                      y = y_train, 
#                      cv = 5,
#                      n_jobs = -1,
#                      scoring = 'precision', 
#                      verbose = 2)

In [None]:
# cv

In [None]:
# cv = cross_val_score(estimator = pipe,
#                      X = X_train,
#                      y = y_train, 
#                      cv = 5,
#                      n_jobs = -1,
#                      scoring = 'recall', 
#                      verbose = 2)

In [None]:
# cv

In [None]:
# Run the 5-fold grid search over the SMOTE sampling ratios on the training split.
gs.fit(X_train, y_train)

In [None]:
# Sampling ratio with the best mean cross-validated F1.
gs.best_params_

In [None]:
# Mean cross-validated F1 achieved by the best parameter setting.
gs.best_score_

In [None]:
# Per-class precision / recall / F1 of the refit best pipeline on the held-out test split.
print(classification_report(y_test, gs.predict(X_test)))

In [None]:
# Confusion matrix on the held-out test split (rows: true class, columns: predicted class).
print(confusion_matrix(y_test, gs.predict(X_test)))

In [None]:
# Feature importances of the random forest inside the refit best pipeline.
# GridSearchCV itself is not subscriptable (`gs['rf']` raises TypeError); the
# fitted steps are reached through `best_estimator_`.
importances = pd.DataFrame({
    'variable': X_train.columns,
    'importance': gs.best_estimator_.named_steps['rf'].feature_importances_
})

# Ten most important features, highest first.
importances.sort_values('importance', ascending = False).head(10)