# imports

In [37]:
#manipulação de dados
import pandas as pd
import numpy  as np

#visualização
import matplotlib.pyplot as plt
import seaborn           as sns
from IPython.core.display import HTML
from IPython.display      import Image

# processamento de dados
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

#machine learning models
import xgboost as xgb

#metricas
from sklearn.metrics import accuracy_score, balanced_accuracy_score,precision_score,recall_score,roc_auc_score, confusion_matrix,f1_score

## Helper functions 

In [38]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    sns.set()
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [39]:
def mult_metrics(model_name,y,yhat):
    return pd.DataFrame({'mode_name':model_name,
             'precison_multclass':precision_score(y,yhat,average='macro'),
             'recall_multclass':recall_score(y,yhat,average='macro'),
              'f1-score_multclass':f1_score(y,yhat,average='macro'),
             'balanced_score':balanced_accuracy_score(y,yhat)},index=[0])
    

In [40]:
def confusion_m(y,yhat):
    from sklearn.metrics import confusion_matrix
    ax= plt.subplot()
    cm = confusion_matrix(y,yhat)
    sns.heatmap(cm,annot=True,ax=ax,annot_kws={'size': 20})
    ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
    ax.set_title('Confusion Matrix'); 
    ax.xaxis.set_ticklabels(['Heat Dissipation Failure', 'No Failure','Overstrain Failure','Power Failure','Random Failures','Tool Wear Failure']); ax.yaxis.set_ticklabels(['Heat Dissipation Failure', 'No Failure','Overstrain Failure','Power Failure','Random Failures','Tool Wear Failure']);
    plt.yticks(rotation=0) ;

In [41]:
def performace_cross_val(data, target, model, model_name, round_n=3, splits=3, shuffle_n=True, random=42):
    import sklearn.model_selection as ms
    import sklearn.metrics
    import numpy as np
    skf = ms.StratifiedKFold(n_splits=splits, shuffle=shuffle_n, random_state=random)
    X = data
    y = X[target]
    X = X.drop(columns=[target, 'product_id'])
    precision = []
    recall = []
    balan_acc = []
    f1 = []
    for train_index, test_index in skf.split(X, y):
        # train the model
        model.fit(X.iloc[train_index], y.iloc[train_index])

        # predict the classification
        yhat_class = model.predict(X.iloc[test_index])

        # precision and recall
        prec = sklearn.metrics.precision_score(y.iloc[test_index], yhat_class,average='macro')
        rec = sklearn.metrics.recall_score(y.iloc[test_index], yhat_class,average='macro')
        balan_acc_score = sklearn.metrics.balanced_accuracy_score(y.iloc[test_index], yhat_class)
        f1_score = sklearn.metrics.f1_score(y.iloc[test_index], yhat_class,average='macro')
        precision.append(prec)
        recall.append(rec)
        balan_acc.append(balan_acc_score)
        f1.append(f1_score)

    # return a dataset with the metrics
    return pd.DataFrame({'Model name': model_name + " Cross_Val",
                         'precison_multclass_cv': np.round(np.mean(precision), round_n),
                         'precison_std': np.round(np.std(precision), round_n),
                         'recall_multclass_cv': np.round(np.mean(recall), round_n),
                         'recall_cv': np.round(np.std(recall), round_n),
                         'balanced_score_cv': np.round(np.mean(balan_acc), round_n),
                         'balanced_std': np.round(np.std(balan_acc), round_n),
                         'f1_score_cv': np.round(np.mean(f1), round_n),
                         'f1_std': np.round(np.std(f1), round_n)},index=[0])


In [42]:
def performace_cross_val_boost(data, target, model, model_name, round_n=3, splits=3, shuffle_n=True, random=42):
    import sklearn.model_selection as ms
    import sklearn.metrics
    import numpy as np
    skf = ms.StratifiedKFold(n_splits=splits, shuffle=shuffle_n, random_state=random)
    X = data
    y = X[target]
    X = X.drop(columns=[target, 'product_id'])
    precision = []
    recall = []
    balan_acc = []
    f1 = []
    for train_index, test_index in skf.split(X, y):
        # train the model
        weight = class_weight.compute_sample_weight(class_weight='balanced',y=y.iloc[train_index])
        model.fit(X.iloc[train_index], y.iloc[train_index],sample_weight=weight)

        # predict the classification
        yhat_class = model.predict(X.iloc[test_index])

        # precision and recall
        prec = sklearn.metrics.precision_score(y.iloc[test_index], yhat_class,average='macro')
        rec = sklearn.metrics.recall_score(y.iloc[test_index], yhat_class,average='macro')
        balan_acc_score = sklearn.metrics.balanced_accuracy_score(y.iloc[test_index], yhat_class)
        f1_score = sklearn.metrics.f1_score(y.iloc[test_index], yhat_class,average='macro')
        precision.append(prec)
        recall.append(rec)
        balan_acc.append(balan_acc_score)
        f1.append(f1_score)

    # return a dataset with the metrics
    return pd.DataFrame({'Model name': model_name + " Cross_Val",
                         'precison_multclass_cv': np.round(np.mean(precision), round_n),
                         'precison_std': np.round(np.std(precision), round_n),
                         'recall_multclass_cv': np.round(np.mean(recall), round_n),
                         'recall_cv': np.round(np.std(recall), round_n),
                         'balanced_score_cv': np.round(np.mean(balan_acc), round_n),
                         'balanced_std': np.round(np.std(balan_acc), round_n),
                         'f1_score_cv': np.round(np.mean(f1), round_n),
                         'f1_std': np.round(np.std(f1), round_n)},index=[0])

# Load data

In [43]:
path_local = 'C:/Users/Lavin/Documents/desafios/desafio_indicium/'
df4 = pd.read_csv(path_local + 'data/processed/df_train_processed.csv')

In [44]:
df4.head()

Unnamed: 0,product_id,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,power_w,failure_type
0,M14860,0.304348,0.358025,0.253298,0.191176,-1.0,0.469475,1
1,L47181,0.315217,0.37037,-0.501319,0.448529,-0.972222,0.382571,1
2,L47184,0.315217,0.37037,-0.501319,-0.014706,-0.916667,-0.26392,1
3,M14865,0.304348,0.358025,-0.411609,0.125,-0.898148,-0.017032,1
4,L47186,0.304348,0.358025,0.290237,0.161765,-0.87037,0.445891,1


## Split dataframe

In [45]:
X = df4.copy()
y = X['failure_type']
X = X.drop(columns=['failure_type','product_id'])

In [46]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

In [47]:
X_train.shape

(4666, 6)

In [48]:
X_val.shape

(2001, 6)

# Fine tuning

In [49]:
from hyperopt import tpe, Trials, hp, fmin, STATUS_OK

import warnings
warnings.filterwarnings('ignore')

In [50]:
# space={'n_estimators': hp.choice('n_estimators',np.arange(300,1000+1,250)),
#       'max_depth': hp.quniform("max_depth", 3, 10, 1),
#       'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
#       'min_child_weight' : hp.quniform('min_child_weight', 0, 12, 1),
#       'seed': 42
#     }

In [51]:
# def objective_function(space):
#     xgb_model = xgb.XGBClassifier(n_estimators =int(space['n_estimators']),
#                                   max_depth = int(space['max_depth']), 
#                                   min_child_weight=int(space['min_child_weight']),
#                                   colsample_bytree=int(space['colsample_bytree']),
#                                   seed = space['seed'])      
#     score = performace_cross_val_boost(df4,'failure_type',xgb_model,'xgb_model',round_n=3,splits=3)
#     print(space)
#     print('f1-score:',score['f1_score_cv'][0],'recall:',score['recall_multclass_cv'][0],'precision:',score['precison_multclass_cv'][0])
#     return {'loss': -score['recall_multclass_cv'][0],'status':STATUS_OK}

In [52]:
# tpe_algorithm = tpe.suggest
# trials = Trials()
# num_eval = 100

In [53]:
#best_paramns = fmin(fn=objective_function,space=space,algo=tpe_algorithm,max_evals=num_eval,trials=trials)

In [54]:
# best_paramns

In [57]:
# best_paramns = [{'colsample_bytree': 0.8774451839157555, 'max_depth': 4.0, 'min_child_weight': 2.0, 'n_estimators': 550, 'seed': 42},                                                                
# {'colsample_bytree': 0.8645721620898453, 'max_depth': 3.0, 'min_child_weight': 0.0, 'n_estimators': 550, 'seed': 42}, 
# {'colsample_bytree': 0.9702875370053183, 'max_depth': 3.0, 'min_child_weight': 7.0, 'n_estimators': 550, 'seed': 42},
# {'colsample_bytree': 0.5611699511926991, 'max_depth': 4.0, 'min_child_weight': 3.0, 'n_estimators': 550, 'seed': 42},
# {'colsample_bytree': 0.7336051564716887, 'max_depth': 3.0, 'min_child_weight': 5.0, 'n_estimators': 300, 'seed': 42}]

In [83]:
best_paramns = [{'colsample_bytree': 0.8774451839157555, 'max_depth': 4.0, 'min_child_weight': 2.0, 'n_estimators': 550, 'seed': 42},                                                                
{'colsample_bytree': 0.8645721620898453, 'max_depth': 3.0, 'min_child_weight': 0.0, 'n_estimators': 550, 'seed': 42}, 
{'colsample_bytree': 0.9702875370053183, 'max_depth': 3.0, 'min_child_weight': 7.0, 'n_estimators': 550, 'seed': 42},
{'colsample_bytree': 0.5611699511926991, 'max_depth': 4.0, 'min_child_weight': 3.0, 'n_estimators': 550, 'seed': 42},
{'colsample_bytree': 0.7336051564716887, 'max_depth': 3.0, 'min_child_weight': 5.0, 'n_estimators': 300, 'seed': 42}]

## XGboost

In [59]:
aux_df = pd.DataFrame()
for i in best_paramns:
    best_paramns_select_balance_acc = dict(i)
    xgb_model_final = xgb.XGBClassifier(n_estimators =int(best_paramns_select_balance_acc['n_estimators']),
                                          max_depth = int(best_paramns_select_balance_acc['max_depth']), 
                                          min_child_weight=int(best_paramns_select_balance_acc['min_child_weight']),
                                          colsample_bytree=int(best_paramns_select_balance_acc['colsample_bytree']),
                                          seed = 42).fit(X_train,y_train)
    aux_df = pd.concat([aux_df,performace_cross_val_boost(df4,'failure_type',xgb_model_final,'xgboost')],ignore_index=True)

In [60]:
aux_df

Unnamed: 0,Model name,precison_multclass_cv,precison_std,recall_multclass_cv,recall_cv,balanced_score_cv,balanced_std,f1_score_cv,f1_std
0,xgboost Cross_Val,0.529,0.028,0.549,0.017,0.549,0.017,0.537,0.022
1,xgboost Cross_Val,0.551,0.024,0.541,0.021,0.541,0.021,0.544,0.023
2,xgboost Cross_Val,0.499,0.028,0.603,0.036,0.603,0.036,0.541,0.029
3,xgboost Cross_Val,0.522,0.028,0.554,0.022,0.554,0.022,0.536,0.025
4,xgboost Cross_Val,0.507,0.021,0.594,0.033,0.594,0.033,0.542,0.023


In [89]:
best_paramns = {'colsample_bytree': 0.7336051564716887, 'max_depth': 3.0, 'min_child_weight': 5.0, 'n_estimators': 300, 'seed': 42}

In [72]:
sample_weights = class_weight.compute_sample_weight(class_weight='balanced',y=y_train)

In [90]:
# #model
aux_df = pd.DataFrame()
for i in best_paramns:
    best_paramns = dict(i)
    xgb_model_final = xgb.XGBClassifier(n_estimators =int(best_paramns['n_estimators']),
                                          max_depth = int(best_paramns['max_depth']), 
                                          min_child_weight=int(best_paramns['min_child_weight']),
                                          colsample_bytree=int(best_paramns['colsample_bytree']),
                                          seed = 42).fit(X_train,y_train,sample_weight=sample_weights)
# #predict
    yhat_xgb = xgb_model_final.predict(X_val)

# #metrics
    aux_df = pd.concat([aux_df,mult_metrics('xgboost',y_val,yhat_xgb)],ignore_index=True)

ValueError: dictionary update sequence element #0 has length 1; 2 is required

In [88]:
aux_df

Unnamed: 0,mode_name,precison_multclass,recall_multclass,f1-score_multclass,balanced_score
0,xgboost,0.500102,0.535557,0.516092,0.535557
1,xgboost,0.498477,0.524873,0.510465,0.524873
2,xgboost,0.480031,0.567759,0.515661,0.567759
3,xgboost,0.485623,0.542528,0.510607,0.542528
4,xgboost,0.490911,0.582738,0.525884,0.582738
