# Load libraries

In [1]:
import os
import sys
# os.path.dirname('src') evaluates to '', so this appends the absolute path of
# the parent of the current working directory to sys.path.
# Presumably that parent is the project root containing `src/` — confirm.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('src'), '..')))

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool
# NOTE(review): star import. check_result, search_num_features and upsampling
# are called in this notebook without being defined here, so they presumably
# come from src.tools — confirm, and consider importing them by name.
from src.tools import *

import warnings
# Suppresses all warnings from this point on.
warnings.filterwarnings("ignore")

# Read data

In [2]:
# Both CSV paths are relative to the notebook's working directory.
# NOTE(review): `df_fe` is presumably the feature-engineered ("fe") version of
# the same rows as `df` — confirm against the notebook that writes data_fe.csv.
df = pd.read_csv('../data/prepared_data.csv')
df_fe = pd.read_csv('../data/data_fe.csv')

# Train CatBoost

## Test data without upsampling

### Find feature importance

In [3]:
# Classifier built with only `verbose=False` set (training log turned off);
# every other hyperparameter is left at the library default.
CatBoost = CatBoostClassifier(verbose=False)

In [4]:
# Stratified 67/33 hold-out split with a fixed seed.
# Features and labels are both taken from `df_fe`, so the pairing of each row
# with its label cannot be broken by a row-order or row-count difference
# between two separately loaded CSV files.
X_train, X_test, y_train, y_test = train_test_split(
    df_fe.drop(columns=['Machine failure']),
    df_fe['Machine failure'],
    test_size=0.33,
    random_state=42,
    stratify=df_fe['Machine failure'],
)

# CatBoost Pool wrapping the training part only; the test part stays untouched.
train_pool = Pool(data=X_train, label=y_train)

In [5]:
CatBoost.fit(train_pool)

# Pair every column name with the importance the fitted model reports on the
# training pool, then rank from most to least important.
importance_values = CatBoost.get_feature_importance(train_pool)
feature_importance = pd.DataFrame(
    {'feature_importance': importance_values, 'feature_names': X_test.columns}
).sort_values(by=['feature_importance'], ascending=False)

# Last expression of the cell: the metrics returned for the hold-out split.
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9094595805699947, 'f1_score': 0.931770654883567}

In [6]:
# Write the ranking to CSV without the index column.
# NOTE(review): the file name spells "imporstance"; it is kept unchanged
# because other code may read this exact path.
feature_importance.to_csv('../data/feature_imporstance_WS.csv', index=False)

### Find the best number of features

In [7]:
# Sweep over candidate feature counts using the ranking computed above.
# The log below reports F1 and AUC per iteration and the best F1 at the end.
search_num_features(df_fe, feature_importance, step=5)

  0%|          | 0/24 [00:00<?, ?it/s]

F1_score - 0.6734304006608839, num_features - 0, AUC_score = 0.6151303997132103


  4%|▍         | 1/24 [00:02<01:02,  2.70s/it]

F1_score - 0.8076319061320266, num_features - 1, AUC_score = 0.7482747804265998


  8%|▊         | 2/24 [00:05<01:00,  2.73s/it]

F1_score - 0.9110560725537689, num_features - 6, AUC_score = 0.8951256945689192


 12%|█▎        | 3/24 [00:08<00:58,  2.80s/it]

F1_score - 0.9102490313731457, num_features - 11, AUC_score = 0.8908182469976698


 17%|█▋        | 4/24 [00:11<00:59,  2.98s/it]

F1_score - 0.9045308175559443, num_features - 11, AUC_score = 0.8818896755690984


 21%|██        | 5/24 [00:15<00:59,  3.13s/it]

F1_score - 0.9345060499209441, num_features - 11, AUC_score = 0.9139238662842804


 25%|██▌       | 6/24 [00:19<01:00,  3.38s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 29%|██▉       | 7/24 [00:23<01:01,  3.63s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 33%|███▎      | 8/24 [00:27<01:02,  3.88s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 38%|███▊      | 9/24 [00:33<01:05,  4.39s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 42%|████▏     | 10/24 [00:39<01:08,  4.91s/it]

F1_score - 0.926224010731053, num_features - 26, AUC_score = 0.9005310091414231


 46%|████▌     | 11/24 [00:46<01:10,  5.39s/it]

F1_score - 0.926224010731053, num_features - 26, AUC_score = 0.9005310091414231


 50%|█████     | 12/24 [00:52<01:09,  5.78s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 54%|█████▍    | 13/24 [00:59<01:07,  6.16s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 58%|█████▊    | 14/24 [01:07<01:05,  6.56s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 62%|██████▎   | 15/24 [01:14<01:01,  6.86s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 67%|██████▋   | 16/24 [01:23<00:57,  7.22s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 71%|███████   | 17/24 [01:31<00:52,  7.53s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 75%|███████▌  | 18/24 [01:39<00:47,  7.84s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 79%|███████▉  | 19/24 [01:48<00:40,  8.12s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 83%|████████▎ | 20/24 [01:58<00:34,  8.75s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 88%|████████▊ | 21/24 [02:08<00:27,  9.14s/it]

F1_score - 0.9296546462113844, num_features - 26, AUC_score = 0.9093027424269583


 92%|█████████▏| 22/24 [02:19<00:18,  9.48s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 96%|█████████▌| 23/24 [02:29<00:09,  9.71s/it]

F1_score - 0.9268971302323933, num_features - 26, AUC_score = 0.9048384567126725


100%|██████████| 24/24 [02:39<00:00,  9.86s/it]


Best F1_score - 0.9345060499209441, num_features - 26


## Test data with upsampling

### Find feature importance

In [8]:
# Rebinds `CatBoost` to a new, unfitted classifier for the upsampling run
# (training log turned off, all other hyperparameters at library defaults).
CatBoost = CatBoostClassifier(verbose=False)

In [9]:
# Stratified 67/33 hold-out split with a fixed seed.
# Features and labels are both taken from `df_fe`, so the pairing of each row
# with its label cannot be broken by a row-order or row-count difference
# between two separately loaded CSV files.
X_train, X_test, y_train, y_test = train_test_split(
    df_fe.drop(columns=['Machine failure']),
    df_fe['Machine failure'],
    test_size=0.33,
    random_state=42,
    stratify=df_fe['Machine failure'],
)

# Upsample after the split and on the training part only, so the test part
# passed to evaluation is never resampled.
X_train, y_train = upsampling(X_train, y_train)

train_pool = Pool(data=X_train, label=y_train)

In [10]:
CatBoost.fit(train_pool)

# Pair every column name with the importance the fitted model reports on the
# (upsampled) training pool, then rank from most to least important.
importance_values = CatBoost.get_feature_importance(train_pool)
feature_importance = pd.DataFrame(
    {'feature_importance': importance_values, 'feature_names': X_test.columns}
).sort_values(by=['feature_importance'], ascending=False)

# Last expression of the cell: the metrics returned for the hold-out split.
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9128259992830255, 'f1_score': 0.9200626959247649}

In [11]:
# Write the ranking to CSV without the index column.
# NOTE(review): the file name spells "imporstance"; it is kept unchanged
# because other code may read this exact path.
feature_importance.to_csv('../data/feature_imporstance.csv', index=False)

### Find the best number of features

In [12]:
# Same sweep over candidate feature counts, with upsampling switched on.
# NOTE(review): the progress bar below shows 12 iterations versus 24 in the
# run without upsampling, although step=5 in both calls — confirm in src.tools.
search_num_features(df_fe, feature_importance, upsamp_func=True, step=5)

  0%|          | 0/12 [00:00<?, ?it/s]

F1_score - 0.5134960492124424, num_features - 0, AUC_score = 0.7089756228714823


  8%|▊         | 1/12 [00:06<01:11,  6.48s/it]

F1_score - 0.9260620182828463, num_features - 1, AUC_score = 0.9260620182828464


 17%|█▋        | 2/12 [00:13<01:06,  6.61s/it]

F1_score - 0.9160967176018916, num_features - 11, AUC_score = 0.9125123229969528


 25%|██▌       | 3/12 [00:21<01:02,  6.99s/it]

F1_score - 0.9254193045350148, num_features - 11, AUC_score = 0.9217545707115969


 33%|███▎      | 4/12 [00:29<00:59,  7.44s/it]

F1_score - 0.9356730254831562, num_features - 11, AUC_score = 0.9225387614267789


 42%|████▏     | 5/12 [00:39<00:57,  8.22s/it]

F1_score - 0.9207580110684531, num_features - 41, AUC_score = 0.917133446854275


 50%|█████     | 6/12 [00:51<00:54,  9.15s/it]

F1_score - 0.9399027517255195, num_features - 41, AUC_score = 0.9228524377128516


 58%|█████▊    | 7/12 [01:02<00:49,  9.95s/it]

F1_score - 0.9282059314954052, num_features - 61, AUC_score = 0.9134533518551712


 67%|██████▋   | 8/12 [01:15<00:42, 10.63s/it]

F1_score - 0.9261431033325127, num_features - 61, AUC_score = 0.9132965137121348


 75%|███████▌  | 9/12 [01:28<00:34, 11.36s/it]

F1_score - 0.9267946579421356, num_features - 61, AUC_score = 0.9176039612833842


 83%|████████▎ | 10/12 [01:42<00:24, 12.26s/it]

F1_score - 0.9247648902821317, num_features - 61, AUC_score = 0.9174471231403479


 92%|█████████▏| 11/12 [01:57<00:13, 13.21s/it]

F1_score - 0.9220717326480798, num_features - 61, AUC_score = 0.912982837426062


100%|██████████| 12/12 [02:11<00:00, 13.44s/it]


Best F1_score - 0.9399027517255195, num_features - 61
