# Load libraries

In [1]:
import os
import sys
# `os.path.dirname('src')` is the empty string, so this appends the absolute
# path of the parent of the current working directory to `sys.path`.
# Presumably this is what lets `src.tools` (imported below) resolve when the
# notebook is run from a sub-folder of the project — TODO confirm the layout.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('src'), '..')))

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool
# NOTE(review): `check_result`, `search_num_features` and `upsampling` are used
# later but never defined in this notebook; presumably they come from this
# wildcard import. An explicit import list would make that visible.
from src.tools import *

import warnings
# Silences every warning for the rest of the session (including deprecation
# warnings), so problems reported that way will not show up in the outputs.
warnings.filterwarnings("ignore")

# Read data

In [2]:
# Prepared dataset.
df = pd.read_csv(filepath_or_buffer='../data/prepared_data.csv')

# Second dataset; presumably the feature-engineered variant ("fe") — verify.
df_fe = pd.read_csv(filepath_or_buffer='../data/data_fe.csv')

# Train CatBoost

## Test data without upsampling

### Find feature importance

In [3]:
# Classifier with a fixed seed so repeated runs are comparable; training
# progress output is switched off.
CatBoost = CatBoostClassifier(
    random_seed=42,
    verbose=False,
)

In [4]:
# Hold out 33% of the rows for evaluation with a fixed seed.
# The split is stratified on the target column of the SAME frame that is being
# split (`df_fe`). Stratifying on `df['Machine failure']` only works if `df`
# and `df_fe` have identical length and row order; otherwise it either raises
# or silently stratifies on misaligned labels.
X_train, X_test, y_train, y_test = train_test_split(
    df_fe.drop(columns=['Machine failure']),
    df_fe['Machine failure'],
    test_size=0.33,
    random_state=42,
    stratify=df_fe['Machine failure'],
)

# CatBoost Pool wrapping the training split.
train_pool = Pool(data=X_train, label=y_train)

In [5]:
# Train on the pool built from the training split.
CatBoost.fit(train_pool)

# One row per column: its importance computed on the training pool and its
# name, ordered from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature_importance': CatBoost.get_feature_importance(train_pool),
        'feature_names': X_test.columns,
    }
).sort_values(by='feature_importance', ascending=False)

# Kept as the last expression so its return value is rendered as the cell
# output (the evaluation on the held-out split).
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9003741709983868, 'f1_score': 0.9241142395957508}

In [6]:
# Persist the ranking without the index column.
# NOTE(review): the file name is spelled "imporstance"; it is kept as-is
# because other code may read this exact path.
feature_importance.to_csv(
    path_or_buf='../data/feature_imporstance_WS.csv',
    index=False,
)

### Find the best number of features

In [7]:
# Search over the number of features using the ranking computed above, with a
# step of 5. The helper prints F1/AUC per iteration and the best F1 at the end
# (see the output below); its internals are not visible in this notebook.
search_num_features(
    df_fe,
    feature_importance,
    step=5,
)

  0%|          | 0/25 [00:00<?, ?it/s]

F1_score - 0.6734304006608839, num_features - 0, AUC_score = 0.6151303997132103


  4%|▍         | 1/25 [00:03<01:17,  3.21s/it]

F1_score - 0.7779974810761374, num_features - 1, AUC_score = 0.7291629324251659


  8%|▊         | 2/25 [00:06<01:16,  3.34s/it]

F1_score - 0.9065504135926671, num_features - 6, AUC_score = 0.8820465137121347


 12%|█▏        | 3/25 [00:11<01:20,  3.64s/it]

F1_score - 0.9036473988915333, num_features - 11, AUC_score = 0.8775822279978491


 16%|█▌        | 4/25 [00:17<01:31,  4.35s/it]

F1_score - 0.8996349620459928, num_features - 11, AUC_score = 0.8772685517117763


 20%|██        | 5/25 [00:23<01:37,  4.88s/it]

F1_score - 0.9045308175559443, num_features - 11, AUC_score = 0.8818896755690984


 24%|██▍       | 6/25 [00:29<01:42,  5.37s/it]

F1_score - 0.9290100951057022, num_features - 11, AUC_score = 0.9049952948557088


 28%|██▊       | 7/25 [00:37<01:46,  5.92s/it]

F1_score - 0.9345060499209441, num_features - 31, AUC_score = 0.9139238662842804


 32%|███▏      | 8/25 [00:44<01:47,  6.35s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 36%|███▌      | 9/25 [00:52<01:51,  6.94s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 40%|████      | 10/25 [01:01<01:52,  7.50s/it]

F1_score - 0.931770654883567, num_features - 36, AUC_score = 0.9094595805699947


 44%|████▍     | 11/25 [01:10<01:51,  7.97s/it]

F1_score - 0.926224010731053, num_features - 36, AUC_score = 0.9005310091414231


 48%|████▊     | 12/25 [01:20<01:49,  8.42s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 52%|█████▏    | 13/25 [01:30<01:47,  9.00s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 56%|█████▌    | 14/25 [01:41<01:45,  9.56s/it]

F1_score - 0.9290100951057022, num_features - 36, AUC_score = 0.9049952948557088


 60%|██████    | 15/25 [01:51<01:38,  9.86s/it]

F1_score - 0.931770654883567, num_features - 36, AUC_score = 0.9094595805699947


 64%|██████▍   | 16/25 [02:05<01:38, 10.94s/it]

F1_score - 0.9296546462113844, num_features - 36, AUC_score = 0.9093027424269583


 68%|██████▊   | 17/25 [02:17<01:29, 11.23s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 72%|███████▏  | 18/25 [02:29<01:20, 11.56s/it]

F1_score - 0.931770654883567, num_features - 36, AUC_score = 0.9094595805699947


 76%|███████▌  | 19/25 [02:42<01:11, 11.86s/it]

F1_score - 0.931770654883567, num_features - 36, AUC_score = 0.9094595805699947


 80%|████████  | 20/25 [02:54<01:00, 12.15s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 84%|████████▍ | 21/25 [03:09<00:51, 12.80s/it]

F1_score - 0.9268971302323933, num_features - 36, AUC_score = 0.9048384567126725


 88%|████████▊ | 22/25 [03:23<00:40, 13.36s/it]

F1_score - 0.9323871436056093, num_features - 36, AUC_score = 0.913767028141244


 92%|█████████▏| 23/25 [03:38<00:27, 13.68s/it]

F1_score - 0.9329922027290448, num_features - 36, AUC_score = 0.9180744757124933


 96%|█████████▌| 24/25 [03:54<00:14, 14.31s/it]

F1_score - 0.9296546462113844, num_features - 36, AUC_score = 0.9093027424269583


100%|██████████| 25/25 [04:09<00:00, 14.68s/it]


Best F1_score - 0.9345060499209441, num_features - 36


## Test data with upsampling

### Find feature importance

In [8]:
# Start from a fresh, unfitted classifier for the upsampling experiment, with
# the same seed and silenced training output as before.
CatBoost = CatBoostClassifier(
    random_seed=42,
    verbose=False,
)

In [9]:
# Hold out 33% of the rows for evaluation with a fixed seed.
# The split is stratified on the target column of the SAME frame that is being
# split (`df_fe`). Stratifying on `df['Machine failure']` only works if `df`
# and `df_fe` have identical length and row order; otherwise it either raises
# or silently stratifies on misaligned labels.
X_train, X_test, y_train, y_test = train_test_split(
    df_fe.drop(columns=['Machine failure']),
    df_fe['Machine failure'],
    test_size=0.33,
    random_state=42,
    stratify=df_fe['Machine failure'],
)

# Upsampling is applied after the split and to the training part only, so the
# held-out rows stay untouched.
# NOTE(review): the meaning of the first argument (1) is defined by the
# `upsampling` helper, which is not visible here — confirm.
X_train, y_train = upsampling(1, X_train, y_train)

# CatBoost Pool wrapping the upsampled training split.
train_pool = Pool(data=X_train, label=y_train)

In [10]:
# Train on the pool built from the upsampled training split.
CatBoost.fit(train_pool)

# One row per column: its importance computed on the training pool and its
# name, ordered from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature_importance': CatBoost.get_feature_importance(train_pool),
        'feature_names': X_test.columns,
    }
).sort_values(by='feature_importance', ascending=False)

# Kept as the last expression so its return value is rendered as the cell
# output (the evaluation on the held-out split).
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9225387614267789, 'f1_score': 0.9356730254831562}

In [11]:
# Persist the ranking from the upsampled run without the index column.
# NOTE(review): the file name is spelled "imporstance"; it is kept as-is
# because other code may read this exact path.
feature_importance.to_csv(
    path_or_buf='../data/feature_imporstance.csv',
    index=False,
)

### Find the best number of features

In [12]:
# Same feature-count search as in the non-upsampled section, this time with
# the helper's `upsamp_func` flag switched on and the ranking from the
# upsampled run. How the flag is used is defined inside the helper, which is
# not visible in this notebook.
search_num_features(
    df_fe,
    feature_importance,
    upsamp_func=True,
    step=5,
)

  0%|          | 0/25 [00:00<?, ?it/s]

F1_score - 0.5030190363209421, num_features - 0, AUC_score = 0.7000358487184083


  4%|▍         | 1/25 [00:08<03:32,  8.86s/it]

F1_score - 0.8971506553910363, num_features - 1, AUC_score = 0.9194019985660513


  8%|▊         | 2/25 [00:18<03:25,  8.95s/it]

F1_score - 0.9133894516722362, num_features - 6, AUC_score = 0.9080480372826671


 12%|█▏        | 3/25 [00:28<03:24,  9.30s/it]

F1_score - 0.9187804265750446, num_features - 11, AUC_score = 0.9169766087112386


 16%|█▌        | 4/25 [00:39<03:28,  9.93s/it]

F1_score - 0.9173488073540239, num_features - 16, AUC_score = 0.90836171356874


 20%|██        | 5/25 [00:50<03:26, 10.32s/it]

F1_score - 0.9173488073540239, num_features - 16, AUC_score = 0.90836171356874


 24%|██▍       | 6/25 [01:03<03:27, 10.93s/it]

F1_score - 0.9268971302323933, num_features - 16, AUC_score = 0.9048384567126725


 28%|██▊       | 7/25 [01:15<03:24, 11.36s/it]

F1_score - 0.9267946579421356, num_features - 31, AUC_score = 0.9176039612833842


 32%|███▏      | 8/25 [01:29<03:24, 12.01s/it]

F1_score - 0.9329922027290448, num_features - 31, AUC_score = 0.9180744757124933


 36%|███▌      | 9/25 [01:42<03:20, 12.55s/it]

F1_score - 0.9247648902821317, num_features - 41, AUC_score = 0.9174471231403479


 40%|████      | 10/25 [01:56<03:15, 13.00s/it]

F1_score - 0.9267946579421356, num_features - 41, AUC_score = 0.9176039612833842


 44%|████▍     | 11/25 [02:11<03:08, 13.48s/it]

F1_score - 0.9315175832361914, num_features - 41, AUC_score = 0.9222250851407061


 48%|████▊     | 12/25 [02:26<03:01, 13.93s/it]

F1_score - 0.9288423001618658, num_features - 41, AUC_score = 0.9177607994264206


 52%|█████▏    | 13/25 [02:42<02:53, 14.47s/it]

F1_score - 0.9335861468177414, num_features - 41, AUC_score = 0.9223819232837426


 56%|█████▌    | 14/25 [02:59<02:50, 15.47s/it]

F1_score - 0.9335861468177414, num_features - 66, AUC_score = 0.9223819232837426


 60%|██████    | 15/25 [03:17<02:41, 16.15s/it]

F1_score - 0.9356730254831562, num_features - 66, AUC_score = 0.9225387614267789


 64%|██████▍   | 16/25 [03:38<02:36, 17.43s/it]

F1_score - 0.9275576538631527, num_features - 76, AUC_score = 0.9091459042839218


 68%|██████▊   | 17/25 [03:59<02:28, 18.53s/it]

F1_score - 0.9315175832361914, num_features - 76, AUC_score = 0.9222250851407061


 72%|███████▏  | 18/25 [04:18<02:10, 18.63s/it]

F1_score - 0.9377784739626844, num_features - 76, AUC_score = 0.9226955995698154


 76%|███████▌  | 19/25 [04:38<01:54, 19.10s/it]

F1_score - 0.9356730254831562, num_features - 91, AUC_score = 0.9225387614267789


 80%|████████  | 20/25 [05:00<01:40, 20.17s/it]

F1_score - 0.9309080644078345, num_features - 91, AUC_score = 0.9179176375694569


 84%|████████▍ | 21/25 [05:23<01:23, 20.89s/it]

F1_score - 0.935094971863561, num_features - 91, AUC_score = 0.9182313138555297


 88%|████████▊ | 22/25 [05:45<01:03, 21.22s/it]

F1_score - 0.9399027517255195, num_features - 91, AUC_score = 0.9228524377128516


 92%|█████████▏| 23/25 [06:06<00:42, 21.24s/it]

F1_score - 0.9366441795347409, num_features - 111, AUC_score = 0.9140807044273167


 96%|█████████▌| 24/25 [06:29<00:21, 21.69s/it]

F1_score - 0.9399027517255195, num_features - 111, AUC_score = 0.9228524377128516


100%|██████████| 25/25 [06:48<00:00, 20.87s/it]


Best F1_score - 0.9399027517255195, num_features - 111
