# imports

In [1]:
#manipulação de dados
import pandas as pd
import numpy  as np

#visualização
import matplotlib.pyplot as plt
import seaborn           as sns
from IPython.core.display import HTML
from IPython.display      import Image

# processamento de dados
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

#machine learning models
import xgboost as xgb
import lightgbm as lgb

#metricas
from sklearn.metrics import accuracy_score, balanced_accuracy_score,precision_score,recall_score,roc_auc_score, confusion_matrix,f1_score

## Helper functions 

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    sns.set()
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [3]:
def mult_metrics(model_name,y,yhat):
    return pd.DataFrame({'mode_name':model_name,
             'precison_multclass':precision_score(y,yhat,average='macro'),
             'recall_multclass':recall_score(y,yhat,average='macro'),
              'f1-score_multclass':f1_score(y,yhat,average='macro'),
             'balanced_score':balanced_accuracy_score(y,yhat)},index=[0])
    

In [4]:
def confusion_m(y,yhat):
    from sklearn.metrics import confusion_matrix
    ax= plt.subplot()
    cm = confusion_matrix(y,yhat)
    sns.heatmap(cm,annot=True,ax=ax,annot_kws={'size': 20})
    ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
    ax.set_title('Confusion Matrix'); 
    ax.xaxis.set_ticklabels(['Heat Dissipation Failure', 'No Failure','Overstrain Failure','Power Failure','Random Failures','Tool Wear Failure']); ax.yaxis.set_ticklabels(['Heat Dissipation Failure', 'No Failure','Overstrain Failure','Power Failure','Random Failures','Tool Wear Failure']);
    plt.yticks(rotation=0) ;

In [5]:
def performace_cross_val(data, target, model, model_name, round_n=3, splits=3, shuffle_n=True, random=42):
    import sklearn.model_selection as ms
    import sklearn.metrics
    import numpy as np
    skf = ms.StratifiedKFold(n_splits=splits, shuffle=shuffle_n, random_state=random)
    X = data
    y = X[target]
    X = X.drop(columns=[target, 'product_id'])
    precision = []
    recall = []
    balan_acc = []
    f1 = []
    for train_index, test_index in skf.split(X, y):
        # train the model
        model.fit(X.iloc[train_index], y.iloc[train_index])

        # predict the classification
        yhat_class = model.predict(X.iloc[test_index])

        # precision and recall
        prec = sklearn.metrics.precision_score(y.iloc[test_index], yhat_class,average='macro')
        rec = sklearn.metrics.recall_score(y.iloc[test_index], yhat_class,average='macro')
        balan_acc_score = sklearn.metrics.balanced_accuracy_score(y.iloc[test_index], yhat_class)
        f1_score = sklearn.metrics.f1_score(y.iloc[test_index], yhat_class,average='macro')
        precision.append(prec)
        recall.append(rec)
        balan_acc.append(balan_acc_score)
        f1.append(f1_score)

    # return a dataset with the metrics
    return pd.DataFrame({'Model name': model_name + " Cross_Val",
                         'precison_multclass_cv': np.round(np.mean(precision), round_n),
                         'precison_std': np.round(np.std(precision), round_n),
                         'recall_multclass_cv': np.round(np.mean(recall), round_n),
                         'recall_cv': np.round(np.std(recall), round_n),
                         'balanced_score_cv': np.round(np.mean(balan_acc), round_n),
                         'balanced_std': np.round(np.std(balan_acc), round_n),
                         'f1_score_cv': np.round(np.mean(f1), round_n),
                         'f1_std': np.round(np.std(f1), round_n)},index=[0])


In [6]:
def performace_cross_val_boost(data, target, model, model_name, round_n=3, splits=3, shuffle_n=True, random=42):
    import sklearn.model_selection as ms
    import sklearn.metrics
    import numpy as np
    skf = ms.StratifiedKFold(n_splits=splits, shuffle=shuffle_n, random_state=random)
    X = data
    y = X[target]
    X = X.drop(columns=[target, 'product_id'])
    precision = []
    recall = []
    balan_acc = []
    f1 = []
    for train_index, test_index in skf.split(X, y):
        # train the model
        weight = class_weight.compute_sample_weight(class_weight='balanced',y=y.iloc[train_index])
        model.fit(X.iloc[train_index], y.iloc[train_index],sample_weight=weight)

        # predict the classification
        yhat_class = model.predict(X.iloc[test_index])

        # precision and recall
        prec = sklearn.metrics.precision_score(y.iloc[test_index], yhat_class,average='macro')
        rec = sklearn.metrics.recall_score(y.iloc[test_index], yhat_class,average='macro')
        balan_acc_score = sklearn.metrics.balanced_accuracy_score(y.iloc[test_index], yhat_class)
        f1_score = sklearn.metrics.f1_score(y.iloc[test_index], yhat_class,average='macro')
        precision.append(prec)
        recall.append(rec)
        balan_acc.append(balan_acc_score)
        f1.append(f1_score)

    # return a dataset with the metrics
    return pd.DataFrame({'Model name': model_name + " Cross_Val",
                         'precison_multclass_cv': np.round(np.mean(precision), round_n),
                         'precison_std': np.round(np.std(precision), round_n),
                         'recall_multclass_cv': np.round(np.mean(recall), round_n),
                         'recall_cv': np.round(np.std(recall), round_n),
                         'balanced_score_cv': np.round(np.mean(balan_acc), round_n),
                         'balanced_std': np.round(np.std(balan_acc), round_n),
                         'f1_score_cv': np.round(np.mean(f1), round_n),
                         'f1_std': np.round(np.std(f1), round_n)},index=[0])

# Load data

In [7]:
path_local = 'C:/Users/Lavin/Documents/desafios/desafio_indicium/'
df4 = pd.read_csv(path_local + 'data/processed/df_train_processed.csv')

In [8]:
df4.head()

Unnamed: 0,product_id,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,failure_type
0,M14860,0.304348,0.358025,0.253298,0.191176,-1.0,1
1,L47181,0.315217,0.37037,-0.501319,0.448529,-0.972222,1
2,L47184,0.315217,0.37037,-0.501319,-0.014706,-0.916667,1
3,M14865,0.304348,0.358025,-0.411609,0.125,-0.898148,1
4,L47186,0.304348,0.358025,0.290237,0.161765,-0.87037,1


## Split dataframe

In [9]:
X = df4.copy()
y = X['failure_type']
X = X.drop(columns=['failure_type','product_id'])

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
X_train.shape

(4666, 5)

In [12]:
X_val.shape

(2001, 5)

# Fine tuning

In [13]:
from hyperopt import tpe, Trials, hp, fmin, STATUS_OK

import warnings
warnings.filterwarnings('ignore')

In [14]:
space={'n_estimators': hp.choice('n_estimators',np.arange(300,1000+1,250)),
      'max_depth': hp.quniform("max_depth", 3, 10, 1),
      'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
      'min_child_weight' : hp.quniform('min_child_weight', 0, 101, 2),
      'seed': 42
    }

In [15]:
# def objective_function(space):
#     xgb_model = xgb.XGBClassifier(n_estimators =int(space['n_estimators']),
#                                   max_depth = int(space['max_depth']), 
#                                   min_child_weight=int(space['min_child_weight']),
#                                   colsample_bytree=int(space['colsample_bytree']),
#                                   seed = space['seed'])      
#     score = performace_cross_val_boost(df4,'failure_type',xgb_model,'xgb_model',round_n=3,splits=3)
#     print(space)
#     print(score['f1_score_cv'][0])
#     return {'loss': -score['f1_score_cv'][0],'status':STATUS_OK}

In [16]:
# tpe_algorithm = tpe.suggest
# trials = Trials()
# num_eval = 100

In [17]:
# best_paramns = fmin(fn=objective_function,space=space,algo=tpe_algorithm,max_evals=num_eval,trials=trials)

{'colsample_bytree': 0.7081801033404811, 'max_depth': 4.0, 'min_child_weight': 98.0, 'n_estimators': 550, 'seed': 42}
0.393                                                  
{'colsample_bytree': 0.8828185312449635, 'max_depth': 8.0, 'min_child_weight': 56.0, 'n_estimators': 800, 'seed': 42}
0.423                                                                
{'colsample_bytree': 0.6335807600252841, 'max_depth': 8.0, 'min_child_weight': 48.0, 'n_estimators': 800, 'seed': 42}
0.433                                                                
{'colsample_bytree': 0.8735274147420538, 'max_depth': 5.0, 'min_child_weight': 50.0, 'n_estimators': 550, 'seed': 42}
0.431                                                                
{'colsample_bytree': 0.9810727407557882, 'max_depth': 7.0, 'min_child_weight': 20.0, 'n_estimators': 300, 'seed': 42}
0.456                                                                
{'colsample_bytree': 0.9020312691303298, 'max_depth': 10.0, 'min_child_wei

In [18]:
# best_paramns

{'colsample_bytree': 0.6939415415289769,
 'max_depth': 3.0,
 'min_child_weight': 28.0,
 'n_estimators': 1}

In [20]:
# {'colsample_bytree': 0.7861913023654323, 'max_depth': 3.0, 'min_child_weight': 6.0, 'n_estimators': 300, 'seed': 42} 0.576
# {'colsample_bytree': 0.8919692193040883, 'max_depth': 3.0, 'min_child_weight': 10.0, 'n_estimators': 800, 'seed': 42} 0.588 
#{'colsample_bytree': 0.6787531787719704, 'max_depth': 3.0, 'min_child_weight': 9.0, 'n_estimators': 300, 'seed': 42} 0.589 
# {'colsample_bytree': 0.944694905734099, 'max_depth': 3.0, 'min_child_weight': 13.0, 'n_estimators': 800, 'seed': 42} 0.598 
#{'colsample_bytree': 0.5177112616620141, 'max_depth': 3.0, 'min_child_weight': 13.0, 'n_estimators': 300, 'seed': 42} 0.603 
# {'colsample_bytree': 0.6081083296444376, 'max_depth': 3.0, 'min_child_weight': 26.0, 'n_estimators': 550, 'seed': 42} 0.634

In [38]:
best_paramns_recall = [{'colsample_bytree': 0.7861913023654323, 'max_depth': 3.0, 'min_child_weight': 6.0, 'n_estimators': 300, 'seed': 42},
{'colsample_bytree': 0.8919692193040883, 'max_depth': 3.0, 'min_child_weight': 10.0, 'n_estimators': 800, 'seed': 42}, 
{'colsample_bytree': 0.6787531787719704, 'max_depth': 3.0, 'min_child_weight': 9.0, 'n_estimators': 300, 'seed': 42}, 
{'colsample_bytree': 0.944694905734099, 'max_depth': 3.0, 'min_child_weight': 13.0, 'n_estimators': 800, 'seed': 42}, 
{'colsample_bytree': 0.5177112616620141, 'max_depth': 3.0, 'min_child_weight': 13.0, 'n_estimators': 300, 'seed': 42}, 
{'colsample_bytree': 0.6081083296444376, 'max_depth': 3.0, 'min_child_weight': 26.0, 'n_estimators': 550, 'seed': 42}]

## XGboost

In [47]:
aux_df = pd.DataFrame()
for i in best_paramns:
    best_paramns_select_balance_acc = dict(i)
    xgb_model_final = xgb.XGBClassifier(n_estimators =int(best_paramns_select_balance_acc['n_estimators']),
                                          max_depth = int(best_paramns_select_balance_acc['max_depth']), 
                                          min_child_weight=int(best_paramns_select_balance_acc['min_child_weight']),
                                          colsample_bytree=int(best_paramns_select_balance_acc['colsample_bytree']),
                                          seed = 42).fit(X_train,y_train)
    aux_df = pd.concat([aux_df,performace_cross_val_boost(df4,'failure_type',xgb_model_final,'xgboost')],ignore_index=True)

In [48]:
aux_df

Unnamed: 0,Model name,precison_multclass_cv,precison_std,recall_multclass_cv,recall_cv,balanced_score_cv,balanced_std,f1_score_cv,f1_std
0,xgboost Cross_Val,0.448,0.022,0.576,0.039,0.576,0.039,0.497,0.026
1,xgboost Cross_Val,0.451,0.033,0.588,0.044,0.588,0.044,0.503,0.037
2,xgboost Cross_Val,0.436,0.019,0.589,0.031,0.589,0.031,0.491,0.021
3,xgboost Cross_Val,0.439,0.035,0.598,0.043,0.598,0.043,0.497,0.037
4,xgboost Cross_Val,0.42,0.03,0.603,0.038,0.603,0.038,0.481,0.032
5,xgboost Cross_Val,0.396,0.025,0.634,0.044,0.634,0.044,0.466,0.029


In [22]:
# #model
# xgb_model = xgb.XGBClassifier(n_estimators=300,seed=42).fit(X_train,y_train,sample_weight=sample_weights)

# #predict
# yhat_xgb = xgb_model.predict(X_val)

# #metrics
# xgb_metrics = mult_metrics('xgboost',y_val,yhat_xgb)
# xgb_metrics

Unnamed: 0,mode_name,precison_multclass,recall_multclass,f1-score_multclass,balanced_score
0,xgboost,0.52194,0.581552,0.547671,0.581552


## lightgbm

In [23]:
# #model
# lgb_model = lgb.LGBMClassifier(n_estimators=300,seed=42).fit(X_train,y_train,sample_weight=sample_weights)

# #predict
# yhat_lgb = lgb_model.predict(X_val)

# #metrics
# lgb_metrics = mult_metrics('light',y_val,yhat_lgb)
# lgb_metrics

Unnamed: 0,mode_name,precison_multclass,recall_multclass,f1-score_multclass,balanced_score
0,light,0.544234,0.521123,0.531485,0.521123


# Balanced comparison

In [24]:
# model_comp = pd.concat([lr_metrics,rf_metrics,et_metrics,xgb_metrics,lgb_metrics,brf_metrics])

In [25]:
# model_comp.sort_values(by='recall_multclass',ascending=False)

## cross validation

In [26]:
# models = {
#     'XGBoost':xgb_model,
#     'Lightgbm':lgb_model
# }

In [27]:
# models_comparison = pd.DataFrame()
# for c, v in models.items():
#     aux = performace_cross_val_boost(df4,'failure_type',v,c)
#     models_comparison = pd.concat([models_comparison,aux])

In [28]:
#models_comparison.sort_values(by='recall_multclass_cv',ascending=False)

Unnamed: 0,Model name,precison_multclass_cv,precison_std,recall_multclass_cv,recall_cv,balanced_score_cv,balanced_std,f1_score_cv,f1_std
0,XGBoost Cross_Val,0.536,0.03,0.566,0.01,0.566,0.01,0.547,0.018
0,Lightgbm Cross_Val,0.566,0.034,0.545,0.019,0.545,0.019,0.552,0.029
