### Import Libs

In [91]:
import pandas as pd 
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.encoding import OneHotEncoder

### Load Data

In [92]:
df = pd.read_csv('../processed_data/features-target.csv')

In [93]:
df.head()

Unnamed: 0,machineID,dtRef,volt_mean_24h,volt_std_24h,volt_min_24h,volt_max_24h,volt_max_rate_24h,pressure_mean_24h,pressure_std_24h,pressure_min_24h,...,days_since_last_error,days_since_last_rep,model,age,count_comp2_last15days,count_comp4_last15days,count_comp3_last15days,count_comp1_last15days,avg_days_between_mtn,target
0,1,2015-02-02,164.947744,12.526438,142.559604,190.98583,1.12029,100.862632,7.021747,87.52926,...,5.833333,12.75,model3,18,0.0,0.0,0.0,0.0,33.285714,0.0
1,2,2015-02-02,167.38845,17.7001,135.350069,216.299453,1.268775,99.735153,9.236172,81.742015,...,20.416667,14.75,model4,7,0.0,0.0,0.0,0.0,54.0,0.0
2,3,2015-02-02,166.065714,13.018281,139.126762,192.111761,1.126894,100.077484,12.097118,78.093179,...,4.291667,10.75,model3,8,0.0,0.0,0.0,0.0,44.0,1.0
3,4,2015-02-02,173.652648,12.136008,152.527285,192.391288,1.128534,100.701927,10.328765,81.286954,...,16.75,15.75,model3,7,0.0,0.0,0.0,0.0,53.75,0.0
4,5,2015-02-02,167.550151,13.873202,140.783515,195.575448,1.147212,102.314697,12.696205,76.24439,...,1.208333,8.75,model3,2,0.0,0.0,0.0,0.0,32.4,0.0


The creation of the dataset is described in the notebook "pdm-exploration-features.ipynb".

### Data Description

In [94]:
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

Number of rows: 33300
Number of columns: 37


In [95]:
df.dtypes

machineID                           int64
dtRef                              object
volt_mean_24h                     float64
volt_std_24h                      float64
volt_min_24h                      float64
volt_max_24h                      float64
volt_max_rate_24h                 float64
pressure_mean_24h                 float64
pressure_std_24h                  float64
pressure_min_24h                  float64
pressure_max_24h                  float64
pressure_max_rate_24h             float64
rotate_mean_24h                   float64
rotate_std_24h                    float64
rotate_min_24h                    float64
rotate_max_24h                    float64
rotate_max_rate_24h               float64
vibration_mean_24h                float64
vibration_std_24h                 float64
vibration_min_24h                 float64
vibration_max_24h                 float64
vibration_max_rate_24h            float64
volt_count_exceed_thr_24h         float64
pressure_count_exceed_thr_24h     

In [96]:
df.isna().sum()

machineID                            0
dtRef                                0
volt_mean_24h                        0
volt_std_24h                         0
volt_min_24h                         0
volt_max_24h                         0
volt_max_rate_24h                    0
pressure_mean_24h                    0
pressure_std_24h                     0
pressure_min_24h                     0
pressure_max_24h                     0
pressure_max_rate_24h                0
rotate_mean_24h                      0
rotate_std_24h                       0
rotate_min_24h                       0
rotate_max_24h                       0
rotate_max_rate_24h                  0
vibration_mean_24h                   0
vibration_std_24h                    0
vibration_min_24h                    0
vibration_max_24h                    0
vibration_max_rate_24h               0
volt_count_exceed_thr_24h            0
pressure_count_exceed_thr_24h        0
rotate_count_exceed_thr_24h          0
vibration_count_exceed_th

The variable "days_since_last_error/failure" should indicate if a variable had an error before or not. So, since it has nan values, we can assume machines that didn't have errors or failures and this value should be very high. This is a consideration for the pre-processing part.

In [97]:
df.describe()

Unnamed: 0,machineID,volt_mean_24h,volt_std_24h,volt_min_24h,volt_max_24h,volt_max_rate_24h,pressure_mean_24h,pressure_std_24h,pressure_min_24h,pressure_max_24h,...,days_since_last_failure,days_since_last_error,days_since_last_rep,age,count_comp2_last15days,count_comp4_last15days,count_comp3_last15days,count_comp1_last15days,avg_days_between_mtn,target
count,33300.0,33300.0,33300.0,33300.0,33300.0,33300.0,33300.0,33300.0,33300.0,33300.0,...,31574.0,33280.0,33300.0,33300.0,33300.0,33300.0,33300.0,33300.0,33300.0,33300.0
mean,50.5,170.763748,14.917677,141.435012,200.179853,1.17422,100.850793,10.055646,81.111708,120.643253,...,175.426811,9.743944,9.46012,11.33,0.0,0.0,0.0,0.0,22.82429,0.131171
std,28.866503,4.723867,2.258733,8.30425,8.746914,0.051308,4.702029,1.719436,6.258909,7.243204,...,95.433892,9.404948,7.137116,5.827703,0.0,0.0,0.0,0.0,7.687006,0.337593
min,1.0,157.745683,7.383772,97.333604,176.709005,1.036544,90.973896,4.51385,51.237106,101.486195,...,0.75,0.0,0.75,0.0,0.0,0.0,0.0,0.0,12.368421,0.0
25%,25.75,168.041613,13.350064,136.383798,194.085891,1.138474,98.686702,8.929385,77.547924,116.061671,...,93.75,2.875,4.75,6.75,0.0,0.0,0.0,0.0,17.807143,0.0
50%,50.5,170.197981,14.841608,141.802377,199.131765,1.168073,100.113396,9.923726,81.210152,119.371371,...,174.75,6.875,8.75,12.0,0.0,0.0,0.0,0.0,20.391304,0.0
75%,75.25,172.470906,16.41822,146.76979,204.938971,1.202137,101.612057,10.991411,84.403417,123.337288,...,254.75,13.583333,12.75,16.0,0.0,0.0,0.0,0.0,25.214286,0.0
max,100.0,218.265191,26.725027,191.645382,255.124717,1.496518,152.3146,28.399538,136.619446,185.951998,...,362.875,79.0,157.75,20.0,0.0,0.0,0.0,0.0,70.0,1.0


The idea here is the pre-processing and modeling part. So, I will not take into account all other steps.

### Split Train / OOT

In [98]:
df_train = df[df['dtRef'] <= '2015-11-30']

df_oot = df[df['dtRef'] > '2015-11-30']

The idea here is to separate our database in 2 things: Train and Test on new data.

- Train dataset is used to TRAIN and VALIDATE our model, to do so, I will consider the examples before december.
- Out of Time dataset is used only to TEST our model in data that is never seen by the model. This split is considered only after december.

### Pre-Processing

In [99]:
# Split the dataset in trian and 
X_train, X_val, y_train, y_val = train_test_split(df_train.drop(['dtRef', 'machineID', 'target'], axis = 1), df_train['target'], test_size = 0.3, random_state=42)


In [100]:
# Conver values to integer instead of float
y_train = y_train.astype(int)
y_val = y_val.astype(int)

In [101]:
print('X_train:', X_train.shape)
print('X_val:', X_val.shape)
print('y_train:', y_train.shape)
print('y_val:', y_val.shape)

X_train: (21140, 34)
X_val: (9060, 34)
y_train: (21140,)
y_val: (9060,)


In [102]:
# Target proportion
y_train.value_counts(normalize = True)

target
0    0.8693
1    0.1307
Name: proportion, dtype: float64

In [103]:
# Target proportion
y_val.value_counts(normalize = True)

target
0    0.863245
1    0.136755
Name: proportion, dtype: float64

In [104]:
df_train.dtypes

machineID                           int64
dtRef                              object
volt_mean_24h                     float64
volt_std_24h                      float64
volt_min_24h                      float64
volt_max_24h                      float64
volt_max_rate_24h                 float64
pressure_mean_24h                 float64
pressure_std_24h                  float64
pressure_min_24h                  float64
pressure_max_24h                  float64
pressure_max_rate_24h             float64
rotate_mean_24h                   float64
rotate_std_24h                    float64
rotate_min_24h                    float64
rotate_max_24h                    float64
rotate_max_rate_24h               float64
vibration_mean_24h                float64
vibration_std_24h                 float64
vibration_min_24h                 float64
vibration_max_24h                 float64
vibration_max_rate_24h            float64
volt_count_exceed_thr_24h         float64
pressure_count_exceed_thr_24h     

In [105]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine.selection import DropConstantFeatures
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [106]:
# Numerical features to be pre-processed by the pipeline
numeric_features = ['volt_mean_24h', 'volt_std_24h', 'volt_min_24h',
       'volt_max_24h', 'volt_max_rate_24h', 'pressure_mean_24h',
       'pressure_std_24h', 'pressure_min_24h', 'pressure_max_24h',
       'pressure_max_rate_24h', 'rotate_mean_24h', 'rotate_std_24h',
       'rotate_min_24h', 'rotate_max_24h', 'rotate_max_rate_24h',
       'vibration_mean_24h', 'vibration_std_24h', 'vibration_min_24h',
       'vibration_max_24h', 'vibration_max_rate_24h',
       'volt_count_exceed_thr_24h', 'pressure_count_exceed_thr_24h',
       'rotate_count_exceed_thr_24h', 'vibration_count_exceed_thr_24h',
       'days_since_last_failure', 'days_since_last_error',
       'days_since_last_rep', 'age', 'count_comp2_last15days',
       'count_comp4_last15days', 'count_comp3_last15days',
       'count_comp1_last15days', 'avg_days_between_mtn']

# Categorical features to be pre-processed by the pipeline
categorical_features = ['model']

# The response
target = 'target'

# Features to fill missing values
imputer_features = ['days_since_last_failure', 'days_since_last_error']

# Imputer transformation
imputer_transformer = Pipeline(
    steps=[("imputer", ArbitraryNumberImputer(arbitrary_number=10000))]
)

# Numerical transformations
numeric_transformer = Pipeline(
    steps=[("scaler", MinMaxScaler())]
)

# Categorical transformations
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder())
]
)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder='passthrough'
    
)


### Feature Selection

In [107]:
smart_corr = SmartCorrelatedSelection(
    variables=None,
    method="pearson",
    threshold=0.7,
    missing_values="raise",
    selection_method="variance"
)

In [108]:
feat_selector_pipe = Pipeline([
    ('imputer', ArbitraryNumberImputer(arbitrary_number=10000, variables = imputer_features)),
    ('preprocessor', preprocessor),
    ('selector_cte', DropConstantFeatures()),
    ('selector_corr', smart_corr)
])


df_feat = feat_selector_pipe.fit_transform(X_train, y_train)

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


In [109]:
all_vars = list(feat_selector_pipe['preprocessor'].get_feature_names_out())
idx = [int(x[1:]) for x in df_feat.columns]

selected_features = [all_vars[i].split('__')[1] for i in idx]

print("Features:", selected_features)
print("Number of Selected Features:", len(selected_features))

Features: ['volt_mean_24h', 'volt_std_24h', 'volt_min_24h', 'volt_max_24h', 'pressure_std_24h', 'pressure_min_24h', 'pressure_max_24h', 'rotate_mean_24h', 'rotate_std_24h', 'rotate_min_24h', 'rotate_max_24h', 'vibration_mean_24h', 'vibration_std_24h', 'vibration_min_24h', 'vibration_max_24h', 'volt_count_exceed_thr_24h', 'rotate_count_exceed_thr_24h', 'days_since_last_failure', 'days_since_last_error', 'days_since_last_rep', 'age', 'avg_days_between_mtn', 'model_model3', 'model_model1', 'model_model2', 'model_model4']
Number of Selected Features: 26


### Modeling

In [110]:
def evaluate(y_true, y_pred_prob, threshold = 0.5):
    # convert probabilities to binary predictions
    y_pred = (y_pred_prob > threshold).astype(int)
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred_prob)
    
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("ROC-AUC:", roc_auc)
    

In [111]:
def evaluate_models(model_list, X, y, cv = 5, threshold = 0.5):

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    results_df = pd.DataFrame([], columns = ['Model', 'Train Accuracy', 'Train Precision', 'Train Recall', 'Validation Accuracy', 'Validation Precision', 'Validation Recall'])

    for name, model in model_list:

        train_accuracy = []
        train_precision  = []
        train_recall  = []

        val_accuracy = []
        val_precision = []
        val_recall = []

        train_roc_auc = []
        val_roc_auc = []

        for train_index, val_index in skf.split(X, y):

            X_train_fold = X.iloc[train_index, :]
            X_val_fold = X.iloc[val_index, :]

            y_train_fold = y.iloc[train_index]
            y_val_fold = y.iloc[val_index]

            model_pipe = Pipeline([
                                ('imputer', ArbitraryNumberImputer(arbitrary_number=10000, variables = imputer_features)),
                                ('preprocessor', preprocessor),
                                ('selector_cte', DropConstantFeatures()),
                                ('selector_corr', smart_corr),
                                ('model', model)
                            ])
            
            model_pipe.fit(X_train_fold, y_train_fold)

            # Predict on the training and testing sets
            y_train_pred_prob = model_pipe.predict_proba(X_train_fold)
            y_train_pred = (y_train_pred_prob[:,1] > threshold).astype(int)
            y_val_pred_prob = model_pipe.predict_proba(X_val_fold)
            y_val_pred = (y_val_pred_prob[:,1] > threshold).astype(int)

            

            # Calculate the evaluation metrics for the training set
            train_accuracy.append(accuracy_score(y_train_fold, y_train_pred))
            train_precision.append(precision_score(y_train_fold, y_train_pred))
            train_recall.append(recall_score(y_train_fold, y_train_pred))
            train_roc_auc.append(roc_auc_score(y_train_fold, y_train_pred_prob[:, 1]))
            # Calculate the evaluation metrics for the validation set
            val_accuracy.append(accuracy_score(y_val_fold, y_val_pred))
            val_precision.append(precision_score(y_val_fold, y_val_pred))
            val_recall.append(recall_score(y_val_fold, y_val_pred))
            val_roc_auc.append(roc_auc_score(y_val_fold, y_val_pred_prob[:, 1]))



        model_metrics = [name, np.mean(train_accuracy), np.mean(train_precision), np.mean(train_recall), np.mean(train_roc_auc), np.mean(val_accuracy), np.mean(val_precision), np.mean(val_recall), np.mean(train_roc_auc)]

        results_df = pd.concat([results_df, pd.DataFrame([model_metrics], columns = ['Model', 'Train Accuracy', 'Train Precision', 'Train Recall', 'Train ROC-AUC', 'Validation Accuracy', 'Validation Precision', 'Validation Recall', 'Validation ROC-AUC'])], ignore_index=True)
        
    return results_df

In [112]:
results_df = evaluate_models([('RandomForestClassifier', RandomForestClassifier(random_state = 42)), ('XGBoost', XGBClassifier(random_state = 42)), ('CatBoostClassifier', CatBoostClassifier(random_state = 42))], X_train, y_train)
results_df

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


Learning rate set to 0.034465
0:	learn: 0.6281994	total: 17.5ms	remaining: 17.4s
1:	learn: 0.5790471	total: 27.3ms	remaining: 13.6s
2:	learn: 0.5571356	total: 40.4ms	remaining: 13.4s
3:	learn: 0.5144037	total: 51.6ms	remaining: 12.8s
4:	learn: 0.4988168	total: 62.6ms	remaining: 12.5s
5:	learn: 0.4626379	total: 71.2ms	remaining: 11.8s
6:	learn: 0.4523293	total: 83.6ms	remaining: 11.9s
7:	learn: 0.4405085	total: 98.6ms	remaining: 12.2s
8:	learn: 0.4197252	total: 110ms	remaining: 12.1s
9:	learn: 0.4117480	total: 118ms	remaining: 11.7s
10:	learn: 0.4042493	total: 132ms	remaining: 11.9s
11:	learn: 0.3978956	total: 145ms	remaining: 11.9s
12:	learn: 0.3913082	total: 159ms	remaining: 12.1s
13:	learn: 0.3844245	total: 178ms	remaining: 12.5s
14:	learn: 0.3678448	total: 196ms	remaining: 12.9s
15:	learn: 0.3615344	total: 213ms	remaining: 13.1s
16:	learn: 0.3493328	total: 229ms	remaining: 13.3s
17:	learn: 0.3388157	total: 247ms	remaining: 13.5s
18:	learn: 0.3344237	total: 273ms	remaining: 14.1s
19:

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


Learning rate set to 0.034465
0:	learn: 0.6257608	total: 12.8ms	remaining: 12.8s
1:	learn: 0.5767756	total: 23.3ms	remaining: 11.6s
2:	learn: 0.5540392	total: 31.7ms	remaining: 10.5s
3:	learn: 0.5122632	total: 43.2ms	remaining: 10.7s
4:	learn: 0.4966328	total: 57.2ms	remaining: 11.4s
5:	learn: 0.4611499	total: 74.6ms	remaining: 12.4s
6:	learn: 0.4506610	total: 91.4ms	remaining: 13s
7:	learn: 0.4409751	total: 105ms	remaining: 13s
8:	learn: 0.4313065	total: 122ms	remaining: 13.4s
9:	learn: 0.4226842	total: 139ms	remaining: 13.7s
10:	learn: 0.4145201	total: 154ms	remaining: 13.8s
11:	learn: 0.4074861	total: 166ms	remaining: 13.7s
12:	learn: 0.4004156	total: 177ms	remaining: 13.4s
13:	learn: 0.3882003	total: 187ms	remaining: 13.2s
14:	learn: 0.3706979	total: 203ms	remaining: 13.3s
15:	learn: 0.3549761	total: 221ms	remaining: 13.6s
16:	learn: 0.3429635	total: 240ms	remaining: 13.9s
17:	learn: 0.3371637	total: 250ms	remaining: 13.7s
18:	learn: 0.3318742	total: 264ms	remaining: 13.6s
19:	lear

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


Learning rate set to 0.034465
0:	learn: 0.6280095	total: 12.2ms	remaining: 12.2s
1:	learn: 0.5786845	total: 22.2ms	remaining: 11.1s
2:	learn: 0.5561245	total: 31.3ms	remaining: 10.4s
3:	learn: 0.5391828	total: 42ms	remaining: 10.5s
4:	learn: 0.5176839	total: 51.3ms	remaining: 10.2s
5:	learn: 0.4790066	total: 59.4ms	remaining: 9.85s
6:	learn: 0.4672805	total: 71.9ms	remaining: 10.2s
7:	learn: 0.4412752	total: 103ms	remaining: 12.7s
8:	learn: 0.4315653	total: 125ms	remaining: 13.8s
9:	learn: 0.4222222	total: 140ms	remaining: 13.9s
10:	learn: 0.4133444	total: 154ms	remaining: 13.9s
11:	learn: 0.4019495	total: 168ms	remaining: 13.8s
12:	learn: 0.3950405	total: 181ms	remaining: 13.7s
13:	learn: 0.3749967	total: 191ms	remaining: 13.5s
14:	learn: 0.3555670	total: 206ms	remaining: 13.5s
15:	learn: 0.3507307	total: 215ms	remaining: 13.2s
16:	learn: 0.3469490	total: 230ms	remaining: 13.3s
17:	learn: 0.3350536	total: 249ms	remaining: 13.6s
18:	learn: 0.3303823	total: 258ms	remaining: 13.3s
19:	le

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


Learning rate set to 0.034465
0:	learn: 0.6281764	total: 17ms	remaining: 17s
1:	learn: 0.5780337	total: 37.1ms	remaining: 18.5s
2:	learn: 0.5561569	total: 59.5ms	remaining: 19.8s
3:	learn: 0.5393867	total: 75.7ms	remaining: 18.9s
4:	learn: 0.4957417	total: 85.9ms	remaining: 17.1s
5:	learn: 0.4599903	total: 104ms	remaining: 17.2s
6:	learn: 0.4494798	total: 128ms	remaining: 18.1s
7:	learn: 0.4397834	total: 155ms	remaining: 19.2s
8:	learn: 0.4302960	total: 211ms	remaining: 23.2s
9:	learn: 0.4217400	total: 236ms	remaining: 23.4s
10:	learn: 0.4137187	total: 251ms	remaining: 22.6s
11:	learn: 0.3949577	total: 291ms	remaining: 24s
12:	learn: 0.3880918	total: 304ms	remaining: 23.1s
13:	learn: 0.3765523	total: 318ms	remaining: 22.4s
14:	learn: 0.3611634	total: 336ms	remaining: 22.1s
15:	learn: 0.3560446	total: 373ms	remaining: 23s
16:	learn: 0.3518380	total: 417ms	remaining: 24.1s
17:	learn: 0.3471708	total: 455ms	remaining: 24.8s
18:	learn: 0.3416046	total: 487ms	remaining: 25.1s
19:	learn: 0.3

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


Learning rate set to 0.034465
0:	learn: 0.6224519	total: 8.19ms	remaining: 8.18s
1:	learn: 0.5731798	total: 16.2ms	remaining: 8.09s
2:	learn: 0.5512021	total: 25.9ms	remaining: 8.62s
3:	learn: 0.5095994	total: 33.7ms	remaining: 8.39s
4:	learn: 0.4941671	total: 43.7ms	remaining: 8.7s
5:	learn: 0.4581119	total: 51.6ms	remaining: 8.55s
6:	learn: 0.4466159	total: 65.5ms	remaining: 9.29s
7:	learn: 0.4233286	total: 79.6ms	remaining: 9.88s
8:	learn: 0.4145013	total: 111ms	remaining: 12.3s
9:	learn: 0.4069730	total: 126ms	remaining: 12.5s
10:	learn: 0.3996277	total: 140ms	remaining: 12.6s
11:	learn: 0.3914141	total: 153ms	remaining: 12.6s
12:	learn: 0.3840660	total: 161ms	remaining: 12.2s
13:	learn: 0.3588279	total: 173ms	remaining: 12.2s
14:	learn: 0.3421069	total: 188ms	remaining: 12.4s
15:	learn: 0.3377493	total: 210ms	remaining: 12.9s
16:	learn: 0.3283514	total: 231ms	remaining: 13.3s
17:	learn: 0.3247784	total: 249ms	remaining: 13.6s
18:	learn: 0.3203433	total: 264ms	remaining: 13.6s
19:	

Unnamed: 0,Model,Train Accuracy,Train Precision,Train Recall,Validation Accuracy,Validation Precision,Validation Recall,Train ROC-AUC,Validation ROC-AUC
0,RandomForestClassifier,1.0,1.0,1.0,0.899574,0.844114,0.284121,1.0,1.0
1,XGBoost,0.992928,0.999045,0.946797,0.909745,0.753434,0.460381,0.99988,0.99988
2,CatBoostClassifier,0.962051,0.991479,0.7158,0.915326,0.839162,0.435758,0.994781,0.994781


In [114]:
model = CatBoostClassifier()
model_pipe = Pipeline([
                                ('imputer', ArbitraryNumberImputer(arbitrary_number=10000, variables = imputer_features)),
                                ('preprocessor', preprocessor),
                                ('selector_cte', DropConstantFeatures()),
                                ('selector_corr', smart_corr),
                                ('model', model)
                            ])
            
model_pipe.fit(X_train, y_train)


  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


Learning rate set to 0.03791
0:	learn: 0.6236009	total: 15.1ms	remaining: 15.1s
1:	learn: 0.5643641	total: 26.1ms	remaining: 13s
2:	learn: 0.5162967	total: 36.1ms	remaining: 12s
3:	learn: 0.5009869	total: 45.2ms	remaining: 11.2s
4:	learn: 0.4848909	total: 54ms	remaining: 10.7s
5:	learn: 0.4510679	total: 63ms	remaining: 10.4s
6:	learn: 0.4332357	total: 72.6ms	remaining: 10.3s
7:	learn: 0.4218302	total: 81ms	remaining: 10s
8:	learn: 0.4113471	total: 92.8ms	remaining: 10.2s
9:	learn: 0.3909348	total: 105ms	remaining: 10.4s
10:	learn: 0.3847280	total: 114ms	remaining: 10.3s
11:	learn: 0.3771670	total: 124ms	remaining: 10.2s
12:	learn: 0.3700476	total: 135ms	remaining: 10.2s
13:	learn: 0.3621033	total: 143ms	remaining: 10.1s
14:	learn: 0.3567183	total: 154ms	remaining: 10.1s
15:	learn: 0.3347601	total: 164ms	remaining: 10.1s
16:	learn: 0.3299918	total: 178ms	remaining: 10.3s
17:	learn: 0.3262296	total: 194ms	remaining: 10.6s
18:	learn: 0.3134650	total: 204ms	remaining: 10.5s
19:	learn: 0.30

In [117]:
y_pred = model_pipe.predict_proba(df_train.drop(['machineID', 'dtRef', 'target'], axis = 1))[:, 1]

In [138]:
df_score = df_train[['dtRef', 'machineID', 'target']].copy()
df_score['prob'] = y_pred

In [127]:
df_machine = df_score[df_score['machineID'] == 99]

In [150]:
df_failures = pd.read_csv('../data/PdM_failures.csv')
df_failures['dtRef'] = df_failures['datetime'].apply(lambda x: x.split(' ')[0])

In [148]:
df_failures.loc[0]['datetime'].split(' ')

['2015-01-05', '06:00:00']

In [151]:
df_failures[df_failures['machineID'] == 99]

Unnamed: 0,datetime,machineID,failure,dtRef
739,2015-01-02 03:00:00,99,comp3,2015-01-02
740,2015-01-18 06:00:00,99,comp4,2015-01-18
741,2015-02-02 06:00:00,99,comp1,2015-02-02
742,2015-02-17 06:00:00,99,comp2,2015-02-17
743,2015-03-04 06:00:00,99,comp3,2015-03-04
744,2015-03-19 06:00:00,99,comp4,2015-03-19
745,2015-04-03 06:00:00,99,comp2,2015-04-03
746,2015-04-18 06:00:00,99,comp3,2015-04-18
747,2015-05-03 06:00:00,99,comp2,2015-05-03
748,2015-05-18 06:00:00,99,comp1,2015-05-18


In [155]:
df_comp_machine = pd.merge(df_machine, df_failures[['dtRef', 'machineID', 'failure']], on = ['dtRef','machineID'], how = 'left').fillna('No')

In [156]:
df_comp_machine.head(50)

Unnamed: 0,dtRef,machineID,target,prob,failure
0,2015-02-02,99,1.0,0.984657,comp1
1,2015-02-03,99,0.0,3.9e-05,No
2,2015-02-04,99,0.0,3.8e-05,No
3,2015-02-05,99,0.0,2.7e-05,No
4,2015-02-06,99,0.0,2.8e-05,No
5,2015-02-07,99,0.0,3e-05,No
6,2015-02-08,99,0.0,3.1e-05,No
7,2015-02-09,99,0.0,5.3e-05,No
8,2015-02-10,99,0.0,0.000152,No
9,2015-02-11,99,1.0,0.613715,No


### Evaluation

In [113]:
# Costs 
early_mtn_cost = 800
failure_production_cost = 40.7
maintenance_cost = 250
production_shortage_cost = 1380
repair_cost = 1

### Next Steps

- Tunning Model to Reduce Overfitting
- Review target - post-processing to reduce FP
- Add diff moving averages
- Evaluate Costs