# imports

In [1]:
#manipulação de dados
import pandas as pd
import numpy  as np

#visualização
import matplotlib.pyplot as plt
import seaborn           as sns
from IPython.core.display import HTML
from IPython.display      import Image

# processamento de dados
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import pickle

#machine learning models
import xgboost as xgb

#metricas
from sklearn.metrics import accuracy_score, balanced_accuracy_score,precision_score,recall_score,roc_auc_score, confusion_matrix,f1_score

## Helper functions 

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    sns.set()
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [3]:
def mult_metrics(model_name,y,yhat):
    return pd.DataFrame({'mode_name':model_name,
             'precison_multclass':precision_score(y,yhat,average='macro'),
             'recall_multclass':recall_score(y,yhat,average='macro'),
              'f1-score_multclass':f1_score(y,yhat,average='macro'),
             'balanced_score':balanced_accuracy_score(y,yhat)},index=[0])
    

In [4]:
def confusion_m(y,yhat):
    from sklearn.metrics import confusion_matrix
    ax= plt.subplot()
    cm = confusion_matrix(y,yhat)
    sns.heatmap(cm,annot=True,ax=ax,annot_kws={'size': 20})
    ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
    ax.set_title('Confusion Matrix'); 
    ax.xaxis.set_ticklabels(['Heat Dissipation Failure', 'No Failure','Overstrain Failure','Power Failure','Random Failures','Tool Wear Failure']); ax.yaxis.set_ticklabels(['Heat Dissipation Failure', 'No Failure','Overstrain Failure','Power Failure','Random Failures','Tool Wear Failure']);
    plt.yticks(rotation=0) ;

In [5]:
def performace_cross_val(data, target, model, model_name, round_n=3, splits=3, shuffle_n=True, random=42):
    import sklearn.model_selection as ms
    import sklearn.metrics
    import numpy as np
    skf = ms.StratifiedKFold(n_splits=splits, shuffle=shuffle_n, random_state=random)
    X = data
    y = X[target]
    X = X.drop(columns=[target, 'product_id'])
    precision = []
    recall = []
    balan_acc = []
    f1 = []
    for train_index, test_index in skf.split(X, y):
        # train the model
        model.fit(X.iloc[train_index], y.iloc[train_index])

        # predict the classification
        yhat_class = model.predict(X.iloc[test_index])

        # precision and recall
        prec = sklearn.metrics.precision_score(y.iloc[test_index], yhat_class,average='macro')
        rec = sklearn.metrics.recall_score(y.iloc[test_index], yhat_class,average='macro')
        balan_acc_score = sklearn.metrics.balanced_accuracy_score(y.iloc[test_index], yhat_class)
        f1_score = sklearn.metrics.f1_score(y.iloc[test_index], yhat_class,average='macro')
        precision.append(prec)
        recall.append(rec)
        balan_acc.append(balan_acc_score)
        f1.append(f1_score)

    # return a dataset with the metrics
    return pd.DataFrame({'Model name': model_name + " Cross_Val",
                         'precison_multclass_cv': np.round(np.mean(precision), round_n),
                         'precison_std': np.round(np.std(precision), round_n),
                         'recall_multclass_cv': np.round(np.mean(recall), round_n),
                         'recall_cv': np.round(np.std(recall), round_n),
                         'balanced_score_cv': np.round(np.mean(balan_acc), round_n),
                         'balanced_std': np.round(np.std(balan_acc), round_n),
                         'f1_score_cv': np.round(np.mean(f1), round_n),
                         'f1_std': np.round(np.std(f1), round_n)},index=[0])


In [6]:
def performace_cross_val_boost(data, target, model, model_name, round_n=3, splits=3, shuffle_n=True, random=42):
    import sklearn.model_selection as ms
    import sklearn.metrics
    import numpy as np
    skf = ms.StratifiedKFold(n_splits=splits, shuffle=shuffle_n, random_state=random)
    X = data
    y = X[target]
    X = X.drop(columns=[target, 'product_id'])
    precision = []
    recall = []
    balan_acc = []
    f1 = []
    for train_index, test_index in skf.split(X, y):
        # train the model
        weight = class_weight.compute_sample_weight(class_weight='balanced',y=y.iloc[train_index])
        model.fit(X.iloc[train_index], y.iloc[train_index],sample_weight=weight)

        # predict the classification
        yhat_class = model.predict(X.iloc[test_index])

        # precision and recall
        prec = sklearn.metrics.precision_score(y.iloc[test_index], yhat_class,average='macro')
        rec = sklearn.metrics.recall_score(y.iloc[test_index], yhat_class,average='macro')
        balan_acc_score = sklearn.metrics.balanced_accuracy_score(y.iloc[test_index], yhat_class)
        f1_score = sklearn.metrics.f1_score(y.iloc[test_index], yhat_class,average='macro')
        precision.append(prec)
        recall.append(rec)
        balan_acc.append(balan_acc_score)
        f1.append(f1_score)

    # return a dataset with the metrics
    return pd.DataFrame({'Model name': model_name + " Cross_Val",
                         'precison_multclass_cv': np.round(np.mean(precision), round_n),
                         'precison_std': np.round(np.std(precision), round_n),
                         'recall_multclass_cv': np.round(np.mean(recall), round_n),
                         'recall_cv': np.round(np.std(recall), round_n),
                         'balanced_score_cv': np.round(np.mean(balan_acc), round_n),
                         'balanced_std': np.round(np.std(balan_acc), round_n),
                         'f1_score_cv': np.round(np.mean(f1), round_n),
                         'f1_std': np.round(np.std(f1), round_n)},index=[0])

# Load test data

In [7]:
path_local = 'C:/Users/Lavin/Documents/desafios/desafio_indicium/'
df_test = pd.read_csv(path_local + 'data/raw/desafio_manutencao_preditiva_teste.csv')

In [8]:
df_test

Unnamed: 0,udi,product_id,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min
0,446,L47625,L,297.5,308.6,1793,26.7,70
1,7076,L54255,L,300.7,310.5,1536,47.4,192
2,1191,L48370,L,297.2,308.4,1460,42.1,41
3,2618,L49797,L,299.4,309.1,1670,35.9,68
4,5067,L52246,L,304.1,313.1,1550,30.9,9
...,...,...,...,...,...,...,...,...
3328,5554,L52733,L,302.5,311.9,1306,59.7,172
3329,6961,L54140,L,300.7,311.0,1413,52.0,91
3330,6914,L54093,L,300.8,311.2,1481,38.5,181
3331,5510,L52689,L,302.8,312.2,1509,36.5,52


# Indicium class

In [9]:
import pickle
import numpy  as np
import pandas as pd

class indicium:
    
    def __init__( self ):
        self.home_path = 'C:/Users/Lavin/Documents/desafios/desafio_indicium/'
        self.rotational_speed_rpm_scaler =   pickle.load(open(self.home_path + 'src/features/rotational_speed_rpm_scaler.pkl','rb'))
        self.torque_nm_scaler =              pickle.load(open(self.home_path + 'src/features/torque_nm_scaler.pkl','rb'))
        self.air_temperature_k_scaler =      pickle.load(open(self.home_path + 'src/features/air_temperature_k_scaler.pkl','rb'))
        self.process_temperature_k_scaler =  pickle.load(open(self.home_path + 'src/features/process_temperature_k_scaler.pkl','rb'))
        self.tool_wear_min_scaler =          pickle.load(open(self.home_path + 'src/features/tool_wear_min_scaler.pkl','rb')) 
        self.power_w_scaler =                pickle.load(open(self.home_path + 'src/features/power_w_scaler.pkl','rb'))
        self.encoding_failure_type =         pickle.load(open(self.home_path + 'src/features/encoding_failure_type.pkl','rb')) 
                                           
    def data_cleaning(self,df1):
        # nenhum processo de limpeza nesse ciclo
                                           
        return df1
    
    def feature_engineering(self,df2):
           
        # Nova coluna power_w
        df2['power_w'] = df2['torque_nm'] * df2['rotational_speed_rpm'] * np.pi / 30                                    
                                           
        return df2
    
    def data_preparation(self,df3):
    
        #robustscaler
        #aplicando a escala previamente salva
        df3['rotational_speed_rpm'] = self.rotational_speed_rpm_scaler.transform(df3[['rotational_speed_rpm']].values)


        #aplicando a escala previamente salva
        df3['torque_nm'] =  self.torque_nm_scaler.transform(df3[['torque_nm']].values)

        #min-max scaler
        # aplicando a escala previamente salva
        df3['air_temperature_k'] = self.air_temperature_k_scaler.transform(df3[['air_temperature_k']].values)


        # aplicando a escala previamente salva
        df3['process_temperature_k'] = self.process_temperature_k_scaler.transform(df3[['process_temperature_k']].values)


        # aplicando a escala previamente salva
        df3['tool_wear_min'] = self.tool_wear_min_scaler.transform(df3[['tool_wear_min']].values)

        # aplicando a escala previamente salva
        df3['power_w'] = self.power_w_scaler.transform(df3[['power_w']].values)



        #frequêncy encoding
        freq = np.round_(df3['type'].value_counts(normalize=True),2).to_dict()
        df3['type'] = df3['type'].map(freq)
                                           
        #feature selection
        cols_select_final = ['air_temperature_k',
                             'process_temperature_k', 
                             'rotational_speed_rpm', 
                             'torque_nm','tool_wear_min', 
                             'power_w']
        
        return df3[cols_select_final]

    def get_prediction( self, model, original_data, test_data ):
        # model prediction
        pred = model.predict( test_data )
        
        # join prediction into original data
        original_data['predictedValues'] = pred
        original_data['predictedValues'] = self.encoding_failure_type.inverse_transform(original_data[['prediction']])
        
        return original_data
                                       

# 10.0 previsão

In [10]:
model = pickle.load(open(path_local +'src/models/xgb_model_final.pkl','rb'))

In [11]:
from indicium import indicium


pipeline = indicium()

df1 = pipeline.data_cleaning( df_test )
        
        # feature engineering
df2 = pipeline.feature_engineering( df1 )
        
        # data preparation
df3 = pipeline.data_preparation( df2 )
        
        # prediction
df_response = pipeline.get_prediction( model, df_test, df3 )


  y = column_or_1d(y, warn=True)


In [23]:
df_response.head()

Unnamed: 0,udi,product_id,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,power_w,predictedValues
0,446,L47625,0.59,0.23913,0.358025,1.530343,-0.992647,-29.098765,581.180162,Power Failure
1,7076,L54255,0.59,0.586957,0.592593,0.174142,0.529412,-14.037037,903.52696,Power Failure
2,1191,L48370,0.59,0.206522,0.333333,-0.226913,0.139706,-32.679012,756.914132,Power Failure
3,2618,L49797,0.59,0.445652,0.419753,0.881266,-0.316176,-29.345679,737.353516,Power Failure
4,5067,L52246,0.59,0.956522,0.91358,0.248021,-0.683824,-36.62963,581.463293,Power Failure


In [24]:
df = df_response[['udi','predictedValues']]

In [28]:
#df.to_csv('../predicted.csv',index=False)