# Load libraries

In [1]:
import os
import sys
# Make the parent of the current working directory importable so that the
# `src` package (used by `from src.tools import *`) can be resolved.
# `os.path.dirname('src')` always evaluates to '', so the former
# `os.path.join(os.path.dirname('src'), '..')` was just '..'; state that directly.
_project_root = os.path.abspath('..')
if _project_root not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(_project_root)

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool
from src.tools import *

import warnings
# Suppress every warning for the rest of the session so that library notices
# do not clutter the cell outputs.
# NOTE(review): this is a blanket filter and also hides deprecation warnings —
# consider narrowing it to specific categories or modules.
warnings.filterwarnings("ignore")

# Read data

In [2]:
# Load the two input tables from the sibling data directory.
# Judging by the file names, `df` is the prepared dataset and `df_fe` is its
# feature-engineered version — TODO confirm against the preparation notebooks.
df = pd.read_csv(filepath_or_buffer='../data/prepared_data.csv')
df_fe = pd.read_csv(filepath_or_buffer='../data/data_fe.csv')

# Train CatBoost

## Test data without upsampling

### Find feature importance

In [3]:
# Unfitted classifier using CatBoost's default hyper-parameters;
# verbose=False turns off the per-iteration training log.
# NOTE(review): the variable is an instance but is capitalised like a class name.
CatBoost = CatBoostClassifier(verbose=False)

In [4]:
# Hold out 33% of the feature-engineered dataset for evaluation, with a fixed
# seed so the split is reproducible.
#
# The split is stratified on the target column of the frame that is actually
# being split (`df_fe`). It previously stratified on `df['Machine failure']`,
# a column taken from a different DataFrame, which is only correct when both
# CSV files contain exactly the same rows in exactly the same order.
X_train, X_test, y_train, y_test = train_test_split(
    df_fe.drop(columns=['Machine failure']),
    df_fe['Machine failure'],
    test_size=0.33,
    random_state=42,
    stratify=df_fe['Machine failure'],
)

# Wrap the training split in a CatBoost Pool (features + labels).
train_pool = Pool(data=X_train, label=y_train)

In [5]:
# Train on the pool built from the training split.
CatBoost.fit(train_pool)

# One row per input column, ordered from the highest to the lowest importance
# that CatBoost reports for the training pool.
feature_importance = pd.DataFrame(
    {
        'feature_importance': CatBoost.get_feature_importance(train_pool),
        'feature_names': X_test.columns,
    }
).sort_values(by=['feature_importance'], ascending=False)

# Kept as the last expression so the notebook displays the returned metrics
# (the recorded output shows 'auc_score' and 'f1_score').
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9048384567126725, 'f1_score': 0.9268971302323933}

In [6]:
# Persist the importance ranking of the run without upsampling (no index column).
# NOTE(review): the file name spells "imporstance"; it is kept unchanged because
# other code may read this exact path — rename in both places if desired.
feature_importance.to_csv(path_or_buf='../data/feature_imporstance_WS.csv', index=False)

### Find the best number of features

In [7]:
# Run the feature-count search from src.tools on the engineered dataset, using
# the importance ranking computed above and a step of 5.
# NOTE(review): in the recorded log the printed `num_features` repeats across
# iterations, and the final "Best" line reports 106 features although the top
# F1 score was logged next to 26 — the reporting inside `search_num_features`
# looks inconsistent; verify in src.tools.
search_num_features(df_fe, feature_importance, step=5)

  0%|          | 0/24 [00:00<?, ?it/s]

F1_score - 0.6734304006608839, num_features - 0, AUC_score = 0.6151303997132103


  4%|▍         | 1/24 [00:02<00:59,  2.61s/it]

F1_score - 0.9053976276635861, num_features - 1, AUC_score = 0.8861971231403478


  8%|▊         | 2/24 [00:05<00:57,  2.62s/it]

F1_score - 0.9053976276635861, num_features - 6, AUC_score = 0.8861971231403478


 12%|█▎        | 3/24 [00:08<00:55,  2.66s/it]

F1_score - 0.9074030316276981, num_features - 6, AUC_score = 0.8863539612833841


 17%|█▋        | 4/24 [00:11<00:57,  2.89s/it]

F1_score - 0.9045308175559443, num_features - 16, AUC_score = 0.8818896755690984


 21%|██        | 5/24 [00:15<01:00,  3.21s/it]

F1_score - 0.9345060499209441, num_features - 16, AUC_score = 0.9139238662842804


 25%|██▌       | 6/24 [00:19<01:05,  3.62s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 29%|██▉       | 7/24 [00:24<01:07,  3.95s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 33%|███▎      | 8/24 [00:29<01:07,  4.24s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 38%|███▊      | 9/24 [00:34<01:07,  4.50s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 42%|████▏     | 10/24 [00:40<01:07,  4.82s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 46%|████▌     | 11/24 [00:46<01:09,  5.33s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 50%|█████     | 12/24 [00:54<01:11,  5.98s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 54%|█████▍    | 13/24 [01:01<01:08,  6.25s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 58%|█████▊    | 14/24 [01:08<01:06,  6.63s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 62%|██████▎   | 15/24 [01:16<01:02,  6.95s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 67%|██████▋   | 16/24 [01:24<00:57,  7.15s/it]

F1_score - 0.926224010731053, num_features - 26, AUC_score = 0.9005310091414231


 71%|███████   | 17/24 [01:33<00:54,  7.85s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 75%|███████▌  | 18/24 [01:42<00:49,  8.23s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 79%|███████▉  | 19/24 [01:52<00:43,  8.73s/it]

F1_score - 0.9296546462113844, num_features - 26, AUC_score = 0.9093027424269583


 83%|████████▎ | 20/24 [02:03<00:37,  9.36s/it]

F1_score - 0.9296546462113844, num_features - 26, AUC_score = 0.9093027424269583


 88%|████████▊ | 21/24 [02:13<00:29,  9.69s/it]

F1_score - 0.935094971863561, num_features - 26, AUC_score = 0.9182313138555297


 92%|█████████▏| 22/24 [02:24<00:19,  9.87s/it]

F1_score - 0.9296546462113844, num_features - 106, AUC_score = 0.9093027424269583


 96%|█████████▌| 23/24 [02:35<00:10, 10.30s/it]

F1_score - 0.9268971302323933, num_features - 106, AUC_score = 0.9048384567126725


100%|██████████| 24/24 [02:45<00:00, 10.38s/it]


Best F1_score - 0.935094971863561, num_features - 106


## Test data with upsampling

### Find feature importance

In [8]:
# Rebind `CatBoost` to a new, unfitted classifier for the upsampling experiment;
# verbose=False turns off the per-iteration training log.
CatBoost = CatBoostClassifier(verbose=False)

In [9]:
# Hold out 33% of the feature-engineered dataset for evaluation, with a fixed
# seed so the split is reproducible.
#
# The split is stratified on the target column of the frame that is actually
# being split (`df_fe`). It previously stratified on `df['Machine failure']`,
# a column taken from a different DataFrame, which is only correct when both
# CSV files contain exactly the same rows in exactly the same order.
X_train, X_test, y_train, y_test = train_test_split(
    df_fe.drop(columns=['Machine failure']),
    df_fe['Machine failure'],
    test_size=0.33,
    random_state=42,
    stratify=df_fe['Machine failure'],
)

# Upsample the training split only; the test split stays untouched so the
# evaluation reflects the original class balance.
# NOTE(review): the meaning of the leading `1` argument is defined in
# src.tools.upsampling — confirm there.
X_train, y_train = upsampling(1, X_train, y_train)

# Wrap the (upsampled) training split in a CatBoost Pool (features + labels).
train_pool = Pool(data=X_train, label=y_train)

In [10]:
# Train on the pool built from the upsampled training split.
CatBoost.fit(train_pool)

# One row per input column, ordered from the highest to the lowest importance
# that CatBoost reports for the training pool.
feature_importance = pd.DataFrame(
    {
        'feature_importance': CatBoost.get_feature_importance(train_pool),
        'feature_names': X_test.columns,
    }
).sort_values(by=['feature_importance'], ascending=False)

# Kept as the last expression so the notebook displays the returned metrics
# (the recorded output shows 'auc_score' and 'f1_score').
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9177607994264206, 'f1_score': 0.9288423001618658}

In [11]:
# Persist the importance ranking of the upsampled run (no index column).
# NOTE(review): the file name spells "imporstance"; it is kept unchanged because
# other code may read this exact path — rename in both places if desired.
feature_importance.to_csv(path_or_buf='../data/feature_imporstance.csv', index=False)

### Find the best number of features

In [12]:
# Run the feature-count search from src.tools again, this time with
# `upsamp_func=True`, using the importance ranking of the upsampled model and
# a step of 5.
# NOTE(review): in the recorded log the top F1 score first appears next to
# `num_features - 31`, while the final "Best" line reports 61 — the feature
# count printed by `search_num_features` looks inconsistent; verify in src.tools.
search_num_features(df_fe, feature_importance, upsamp_func=True, step=5)

  0%|          | 0/24 [00:00<?, ?it/s]

F1_score - 0.5131137281674916, num_features - 0, AUC_score = 0.7229118121527156


  4%|▍         | 1/24 [00:07<02:41,  7.01s/it]

F1_score - 0.8715129252358282, num_features - 1, AUC_score = 0.8875246459939058


  8%|▊         | 2/24 [00:14<02:39,  7.23s/it]

F1_score - 0.9114354241353301, num_features - 6, AUC_score = 0.9078911991396308


 12%|█▎        | 3/24 [00:22<02:36,  7.44s/it]

F1_score - 0.9106583072100314, num_features - 11, AUC_score = 0.9035837515683814


 17%|█▋        | 4/24 [00:31<02:35,  7.78s/it]

F1_score - 0.9207580110684531, num_features - 11, AUC_score = 0.917133446854275


 21%|██        | 5/24 [00:40<02:36,  8.24s/it]

F1_score - 0.9288423001618658, num_features - 21, AUC_score = 0.9177607994264206


 25%|██▌       | 6/24 [00:49<02:30,  8.34s/it]

F1_score - 0.9309080644078345, num_features - 26, AUC_score = 0.9179176375694569


 29%|██▉       | 7/24 [00:58<02:25,  8.53s/it]

F1_score - 0.921378142257191, num_features - 31, AUC_score = 0.9086753898548127


 33%|███▎      | 8/24 [01:07<02:21,  8.86s/it]

F1_score - 0.9227527541941567, num_features - 31, AUC_score = 0.9172902849973114


 38%|███▊      | 9/24 [01:17<02:17,  9.19s/it]

F1_score - 0.9220717326480798, num_features - 31, AUC_score = 0.912982837426062


 42%|████▏     | 10/24 [01:27<02:12,  9.46s/it]

F1_score - 0.9288423001618658, num_features - 31, AUC_score = 0.9177607994264206


 46%|████▌     | 11/24 [01:38<02:06,  9.74s/it]

F1_score - 0.9261431033325127, num_features - 31, AUC_score = 0.9132965137121348


 50%|█████     | 12/24 [01:49<02:02, 10.18s/it]

F1_score - 0.9356730254831562, num_features - 31, AUC_score = 0.9225387614267789


 54%|█████▍    | 13/24 [02:01<01:56, 10.63s/it]

F1_score - 0.9240984535059902, num_features - 61, AUC_score = 0.9131396755690984


 58%|█████▊    | 14/24 [02:13<01:50, 11.06s/it]

F1_score - 0.9335861468177414, num_features - 61, AUC_score = 0.9223819232837426


 62%|██████▎   | 15/24 [02:26<01:45, 11.77s/it]

F1_score - 0.9240984535059902, num_features - 61, AUC_score = 0.9131396755690984


 67%|██████▋   | 16/24 [02:41<01:42, 12.83s/it]

F1_score - 0.9335861468177414, num_features - 61, AUC_score = 0.9223819232837426


 71%|███████   | 17/24 [02:54<01:29, 12.80s/it]

F1_score - 0.9294670846394985, num_features - 61, AUC_score = 0.9220682469976698


 75%|███████▌  | 18/24 [03:09<01:20, 13.40s/it]

F1_score - 0.9254193045350148, num_features - 61, AUC_score = 0.9217545707115969


 79%|███████▉  | 19/24 [03:21<01:05, 13.15s/it]

F1_score - 0.9356730254831562, num_features - 61, AUC_score = 0.9225387614267789


 83%|████████▎ | 20/24 [03:34<00:51, 12.94s/it]

F1_score - 0.9227281641206964, num_features - 61, AUC_score = 0.9045247804265998


 88%|████████▊ | 21/24 [03:49<00:40, 13.56s/it]

F1_score - 0.9356730254831562, num_features - 61, AUC_score = 0.9225387614267789


 92%|█████████▏| 22/24 [04:04<00:28, 14.01s/it]

F1_score - 0.9267946579421356, num_features - 61, AUC_score = 0.9176039612833842


 96%|█████████▌| 23/24 [04:18<00:13, 13.87s/it]

F1_score - 0.9220717326480798, num_features - 61, AUC_score = 0.912982837426062


100%|██████████| 24/24 [04:32<00:00, 14.14s/it]


Best F1_score - 0.9356730254831562, num_features - 61
