# Load libraries

In [1]:
import os
import sys
# Add the parent of the current working directory to sys.path so the `src`
# package imported below can be found. os.path.dirname('src') is the empty
# string, so the joined path is simply '..'.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('src'), '..')))

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool
# Wildcard import: presumably the source of check_result, search_num_features
# and upsampling used further down (none of them is defined in the visible
# cells) -- TODO confirm and consider importing them by name.
from src.tools import *

import warnings
# Installs a global "ignore" filter, so all warnings are hidden from here on.
warnings.filterwarnings("ignore")

# Read data

In [2]:
# Load the two input tables in order: `df` from prepared_data.csv and
# `df_fe` from data_fe.csv (both paths are relative to the working directory).
df, df_fe = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/prepared_data.csv', '../data/data_fe.csv')
)

# Train Catboost

## Test data without upsampling

### Find feature importance

In [3]:
# Unfitted classifier with library-default hyperparameters; verbose=False keeps
# CatBoost from printing its training log.
CatBoost = CatBoostClassifier(verbose=False)

In [4]:
# Stratified hold-out split (33% test, fixed seed). Features are every column of
# the feature-engineered frame except the target; labels are read from `df`.
# NOTE(review): the split is positional, so this relies on `df` and `df_fe`
# having the same rows in the same order -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df_fe.drop(columns='Machine failure'),
    df['Machine failure'],
    stratify=df['Machine failure'],
    random_state=42,
    test_size=0.33,
)

# Wrap the training split in a CatBoost Pool (consumed by the next cell).
train_pool = Pool(X_train, label=y_train)

In [5]:
# Train on the pooled training split.
CatBoost.fit(train_pool)

# One row per feature column with its importance computed on the training pool,
# ordered from most to least important.
feature_importance = (
    pd.DataFrame(
        {
            'feature_importance': CatBoost.get_feature_importance(train_pool),
            'feature_names': X_test.columns,
        }
    )
    .sort_values('feature_importance', ascending=False)
)

# Last expression of the cell, so the helper's return value becomes the cell output.
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9094595805699947, 'f1_score': 0.931770654883567}

In [6]:
# Persist the ranked importances (no upsampling run) as CSV, without the index.
# NOTE(review): the file name spells "imporstance"; it is left unchanged because
# whatever reads this file is not visible from this notebook.
feature_importance.to_csv('../data/feature_imporstance_WS.csv',index = False)

### Find the best number of features

In [7]:
# Call the feature-count search helper (presumably from src.tools -- confirm)
# with the feature-engineered frame, the importance ranking and step = 5.
# NOTE(review): in the recorded log the printed num_features reads
# 0, 1, 6, 11, 11, 11, 26, ... -- it looks like the best count found *before*
# each iteration rather than the count being evaluated; verify in the helper.
search_num_features(df_fe,feature_importance, step = 5)

  4%|▍         | 1/24 [00:02<01:03,  2.75s/it]

F1_score - 0.6734304006608839, num_features - 0, AUC_score = 0.6151303997132103


  8%|▊         | 2/24 [00:05<01:03,  2.90s/it]

F1_score - 0.8076319061320266, num_features - 1, AUC_score = 0.7482747804265998


 12%|█▎        | 3/24 [00:09<01:12,  3.44s/it]

F1_score - 0.9110560725537689, num_features - 6, AUC_score = 0.8951256945689192


 17%|█▋        | 4/24 [00:13<01:10,  3.52s/it]

F1_score - 0.9102490313731457, num_features - 11, AUC_score = 0.8908182469976698


 21%|██        | 5/24 [00:17<01:12,  3.80s/it]

F1_score - 0.9045308175559443, num_features - 11, AUC_score = 0.8818896755690984


 25%|██▌       | 6/24 [00:22<01:14,  4.11s/it]

F1_score - 0.9345060499209441, num_features - 11, AUC_score = 0.9139238662842804


 29%|██▉       | 7/24 [00:28<01:19,  4.69s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 33%|███▎      | 8/24 [00:34<01:23,  5.23s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 38%|███▊      | 9/24 [00:42<01:28,  5.89s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 42%|████▏     | 10/24 [00:49<01:28,  6.29s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 46%|████▌     | 11/24 [00:56<01:26,  6.69s/it]

F1_score - 0.926224010731053, num_features - 26, AUC_score = 0.9005310091414231


 50%|█████     | 12/24 [01:04<01:24,  7.06s/it]

F1_score - 0.926224010731053, num_features - 26, AUC_score = 0.9005310091414231


 54%|█████▍    | 13/24 [01:12<01:19,  7.24s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 58%|█████▊    | 14/24 [01:21<01:16,  7.67s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 62%|██████▎   | 15/24 [01:29<01:11,  7.92s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 67%|██████▋   | 16/24 [01:39<01:08,  8.51s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 71%|███████   | 17/24 [01:50<01:04,  9.16s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 75%|███████▌  | 18/24 [02:00<00:56,  9.43s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 79%|███████▉  | 19/24 [02:10<00:47,  9.58s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


 83%|████████▎ | 20/24 [02:21<00:40, 10.05s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 88%|████████▊ | 21/24 [02:32<00:31, 10.44s/it]

F1_score - 0.931770654883567, num_features - 26, AUC_score = 0.9094595805699947


 92%|█████████▏| 22/24 [02:44<00:21, 10.90s/it]

F1_score - 0.9296546462113844, num_features - 26, AUC_score = 0.9093027424269583


 96%|█████████▌| 23/24 [02:55<00:10, 10.99s/it]

F1_score - 0.9290100951057022, num_features - 26, AUC_score = 0.9049952948557088


100%|██████████| 24/24 [03:06<00:00,  7.76s/it]

F1_score - 0.9268971302323933, num_features - 26, AUC_score = 0.9048384567126725
Best F1_score - 0.9345060499209441, num_features - 26





## Test data with upsampling

### Find feature importance

In [8]:
# Rebind `CatBoost` to a new, unfitted classifier (library defaults, training
# log silenced) so the upsampling run does not reuse the previously fitted model.
CatBoost = CatBoostClassifier(verbose=False)

In [9]:
# Same stratified hold-out split as the run without upsampling (33% test, fixed
# seed): features from the feature-engineered frame, labels from `df`.
# NOTE(review): the split is positional, so this relies on `df` and `df_fe`
# having the same rows in the same order -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df_fe.drop(columns='Machine failure'),
    df['Machine failure'],
    stratify=df['Machine failure'],
    random_state=42,
    test_size=0.33,
)

# Only the training split goes through the project's `upsampling` helper
# (presumably it rebalances the classes -- confirm); the test split is not touched.
X_train, y_train = upsampling(X_train, y_train)

# Wrap the upsampled training data in a CatBoost Pool (consumed by the next cell).
train_pool = Pool(X_train, label=y_train)

In [10]:
# Train on the upsampled training pool.
CatBoost.fit(train_pool)

# One row per feature column with its importance computed on the training pool,
# ordered from most to least important. Column names are taken from X_test.
feature_importance = (
    pd.DataFrame(
        {
            'feature_importance': CatBoost.get_feature_importance(train_pool),
            'feature_names': X_test.columns,
        }
    )
    .sort_values('feature_importance', ascending=False)
)

# Last expression of the cell, so the helper's return value becomes the cell output.
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9128259992830255, 'f1_score': 0.9200626959247649}

In [11]:
# Persist the ranked importances from the upsampling run as CSV, without the index.
# NOTE(review): the file name spells "imporstance"; it is left unchanged because
# whatever reads this file is not visible from this notebook.
feature_importance.to_csv('../data/feature_imporstance.csv',index = False)

### Find the best number of features

In [12]:
# Call the feature-count search helper (presumably from src.tools -- confirm)
# with the feature-engineered frame, the importance ranking and step = 10.
# upsamp_func = True presumably makes the helper upsample its training data
# -- confirm against the helper's signature.
# NOTE(review): in the recorded log the printed num_features reads
# 0, 1, 11, 21, 21, 21, 51, ... -- it looks like the best count found *before*
# each iteration rather than the count being evaluated; verify in the helper.
search_num_features(df_fe, feature_importance,upsamp_func = True, step = 10)

  8%|▊         | 1/12 [00:03<00:40,  3.66s/it]

F1_score - 0.5209647121898241, num_features - 0, AUC_score = 0.7293421760172074


 17%|█▋        | 2/12 [00:08<00:42,  4.29s/it]

F1_score - 0.7889639386341534, num_features - 1, AUC_score = 0.7752901505646173


 25%|██▌       | 3/12 [00:13<00:43,  4.80s/it]

F1_score - 0.9148758169934641, num_features - 11, AUC_score = 0.9166629324251659


 33%|███▎      | 4/12 [00:20<00:43,  5.43s/it]

F1_score - 0.9072626441181693, num_features - 21, AUC_score = 0.9160355798530203


 42%|████▏     | 5/12 [00:27<00:43,  6.23s/it]

F1_score - 0.907577522853558, num_features - 21, AUC_score = 0.907577522853558


 50%|█████     | 6/12 [00:36<00:41,  6.90s/it]

F1_score - 0.940437986558478, num_features - 21, AUC_score = 0.9271598852841012


 58%|█████▊    | 7/12 [00:45<00:38,  7.77s/it]

F1_score - 0.9345060499209441, num_features - 51, AUC_score = 0.9139238662842804


 67%|██████▋   | 8/12 [00:55<00:33,  8.43s/it]

F1_score - 0.9180711029331965, num_features - 51, AUC_score = 0.9126691611399891


 75%|███████▌  | 9/12 [01:06<00:27,  9.24s/it]

F1_score - 0.9294670846394985, num_features - 51, AUC_score = 0.9220682469976698


 83%|████████▎ | 10/12 [01:19<00:20, 10.26s/it]

F1_score - 0.9335861468177414, num_features - 51, AUC_score = 0.9223819232837426


 92%|█████████▏| 11/12 [01:32<00:11, 11.31s/it]

F1_score - 0.9141393080936184, num_features - 51, AUC_score = 0.9123554848539164


100%|██████████| 12/12 [01:46<00:00,  8.84s/it]

F1_score - 0.931770654883567, num_features - 51, AUC_score = 0.9094595805699947
Best F1_score - 0.940437986558478, num_features - 51



