# Load libraries

In [2]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('src'), '..')))

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool
from src.tools import *

import warnings
warnings.filterwarnings("ignore")

In [3]:
def check_result(model,X_test,y_test):
  metrics_dict = {}
  fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict(X_test))
  auc_score = metrics.auc(fpr, tpr)
  f1_sc = metrics.f1_score(y_test, model.predict(X_test), average='macro')
  metrics_dict['auc_score'] = auc_score
  metrics_dict['f1_score'] = f1_sc
  return metrics_dict

In [4]:
def search_num_features(df, feature_importance, upsamp_func: Optional[Literal['ROS', 'RUS', 'SMOTE', 'OSS']] = None, p = 0.3, step = 5):
  best_score = 0
  best_num_features = 0

  for num_col in tqdm(range(1, len(feature_importance), step)):
    features = list(feature_importance.iloc[:num_col,:]['feature_names'])
    data = df[features + ['Machine failure']]
    X_train, X_test, y_train, y_test = train_test_split(data.drop(columns = ['Machine failure']),
                                                        data['Machine failure'],
                                                        test_size=0.33,
                                                        random_state=42,
                                                        stratify = df['Machine failure'])
    
    if upsamp_func != 'ROS':
      X_train, y_train = DataSampler.ROS(X_train, y_train, p)
    elif upsamp_func != 'RUS':
      X_train, y_train = DataSampler.RUS(X_train, y_train, p)
    elif upsamp_func != 'SMOTE':
      X_train, y_train = DataSampler.SMOTE(X_train, y_train, p)
    elif upsamp_func != 'OSS':
      X_train, y_train = DataSampler.OSS(X_train, y_train)
    
    train_pool = Pool(data=X_train, label=y_train)
    CatBoost = CatBoostClassifier(verbose=False,random_seed=42)
    CatBoost.fit(train_pool)
    metrics_dict = check_result(CatBoost, X_test, y_test)
    print(f'F1_score - {metrics_dict["f1_score"]}, num_features - {best_num_features}, AUC_score = {metrics_dict["auc_score"]}')
    if metrics_dict['f1_score'] > best_score:
      best_score = metrics_dict['f1_score']
      best_num_features = num_col
  print(f'Best F1_score - {best_score}, num_features - {best_num_features}')


# Read data

In [5]:
df = pd.read_csv('../data/prepared_data.csv')
df_fe = pd.read_csv('../data/data_fe.csv')

# Train Catboost

## Test data without upsampling

### Find feature importance

In [5]:
CatBoost = CatBoostClassifier(verbose=False,random_seed=42)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_fe.drop(columns = ['Machine failure']),
                                                    df_fe['Machine failure'],
                                                    test_size=0.33,
                                                    random_state=42,
                                                    stratify = df['Machine failure'])

train_pool = Pool(data=X_train, label=y_train)

In [7]:
CatBoost.fit(train_pool)

feature_importance = pd.DataFrame({'feature_importance': CatBoost.get_feature_importance(train_pool), 
              'feature_names': X_test.columns}).sort_values(by=['feature_importance'], 
                                                       ascending=False)
              
check_result(CatBoost, X_test, y_test )

{'auc_score': 0.9003741709983868, 'f1_score': 0.9241142395957508}

In [8]:
feature_importance.to_csv('../data/feature_imporstance/feature_imporstance_WS.csv', index = False)

### Find best amount of features

In [9]:
search_num_features(df_fe,feature_importance, step = 5)

  0%|          | 0/25 [00:00<?, ?it/s]

F1_score - 0.6734304006608839, num_features - 0, AUC_score = 0.6151303997132103


  4%|▍         | 1/25 [00:04<01:50,  4.61s/it]

F1_score - 0.7779974810761374, num_features - 1, AUC_score = 0.7291629324251659


  8%|▊         | 2/25 [00:09<01:46,  4.62s/it]

F1_score - 0.9065504135926671, num_features - 6, AUC_score = 0.8820465137121347


 12%|█▏        | 3/25 [00:14<01:42,  4.66s/it]

F1_score - 0.9036473988915333, num_features - 11, AUC_score = 0.8775822279978491


 16%|█▌        | 4/25 [00:19<01:41,  4.82s/it]

F1_score - 0.8996349620459928, num_features - 11, AUC_score = 0.8772685517117763


 20%|██        | 5/25 [00:24<01:36,  4.82s/it]

F1_score - 0.9045308175559443, num_features - 11, AUC_score = 0.8818896755690984


 24%|██▍       | 6/25 [00:28<01:27,  4.58s/it]

F1_score - 0.9290100951057022, num_features - 11, AUC_score = 0.9049952948557088


 28%|██▊       | 7/25 [00:32<01:22,  4.57s/it]

F1_score - 0.9345060499209441, num_features - 31, AUC_score = 0.9139238662842804


 32%|███▏      | 8/25 [00:37<01:21,  4.78s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 36%|███▌      | 9/25 [00:43<01:22,  5.15s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 40%|████      | 10/25 [00:50<01:23,  5.58s/it]

F1_score - 0.931770654883567, num_features - 36, AUC_score = 0.9094595805699947


 44%|████▍     | 11/25 [00:57<01:25,  6.13s/it]

F1_score - 0.926224010731053, num_features - 36, AUC_score = 0.9005310091414231


 48%|████▊     | 12/25 [01:05<01:25,  6.58s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 52%|█████▏    | 13/25 [01:13<01:22,  6.87s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 56%|█████▌    | 14/25 [01:20<01:16,  6.97s/it]

F1_score - 0.9290100951057022, num_features - 36, AUC_score = 0.9049952948557088


 60%|██████    | 15/25 [01:27<01:10,  7.08s/it]

F1_score - 0.931770654883567, num_features - 36, AUC_score = 0.9094595805699947


 64%|██████▍   | 16/25 [01:35<01:05,  7.23s/it]

F1_score - 0.9296546462113844, num_features - 36, AUC_score = 0.9093027424269583


 68%|██████▊   | 17/25 [01:42<00:58,  7.33s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 72%|███████▏  | 18/25 [01:50<00:52,  7.48s/it]

F1_score - 0.931770654883567, num_features - 36, AUC_score = 0.9094595805699947


 76%|███████▌  | 19/25 [01:59<00:47,  7.84s/it]

F1_score - 0.931770654883567, num_features - 36, AUC_score = 0.9094595805699947


 80%|████████  | 20/25 [02:08<00:41,  8.39s/it]

F1_score - 0.9345060499209441, num_features - 36, AUC_score = 0.9139238662842804


 84%|████████▍ | 21/25 [02:19<00:35,  8.94s/it]

F1_score - 0.9268971302323933, num_features - 36, AUC_score = 0.9048384567126725


 88%|████████▊ | 22/25 [02:28<00:27,  9.08s/it]

F1_score - 0.9323871436056093, num_features - 36, AUC_score = 0.913767028141244


 92%|█████████▏| 23/25 [02:37<00:18,  9.16s/it]

F1_score - 0.9329922027290448, num_features - 36, AUC_score = 0.9180744757124933


 96%|█████████▌| 24/25 [02:47<00:09,  9.29s/it]

F1_score - 0.9296546462113844, num_features - 36, AUC_score = 0.9093027424269583


100%|██████████| 25/25 [02:57<00:00,  9.38s/it]


Best F1_score - 0.9345060499209441, num_features - 36


## Test data with upsampling

### Find feature importance

In [10]:
CatBoost = CatBoostClassifier(verbose=False,random_seed=42)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_fe.drop(columns = ['Machine failure']),
                                                    df_fe['Machine failure'],
                                                    test_size=0.33,
                                                    random_state=42,
                                                    stratify = df['Machine failure'])
X_train, y_train = upsampling(1,X_train, y_train)

train_pool = Pool(data=X_train, label=y_train)

In [12]:
CatBoost.fit(train_pool)

feature_importance = pd.DataFrame({'feature_importance': CatBoost.get_feature_importance(train_pool), 
              'feature_names': X_test.columns}).sort_values(by=['feature_importance'], 
                                                       ascending=False)
              
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9225387614267789, 'f1_score': 0.9356730254831562}

In [13]:
feature_importance.to_csv('../data/feature_imporstance/feature_imporstance.csv',index = False)

### Find best amount of features

In [14]:
search_num_features(df_fe, feature_importance, upsamp_func = True, step = 5)

  0%|          | 0/25 [00:00<?, ?it/s]

F1_score - 0.5030190363209421, num_features - 0, AUC_score = 0.7000358487184083


  4%|▍         | 1/25 [00:06<02:25,  6.08s/it]

F1_score - 0.8971506553910363, num_features - 1, AUC_score = 0.9194019985660513


  8%|▊         | 2/25 [00:12<02:23,  6.22s/it]

F1_score - 0.9133894516722362, num_features - 6, AUC_score = 0.9080480372826671


 12%|█▏        | 3/25 [00:18<02:17,  6.26s/it]

F1_score - 0.9187804265750446, num_features - 11, AUC_score = 0.9169766087112386


 16%|█▌        | 4/25 [00:26<02:16,  6.51s/it]

F1_score - 0.9173488073540239, num_features - 16, AUC_score = 0.90836171356874


 20%|██        | 5/25 [00:33<02:15,  6.80s/it]

F1_score - 0.9173488073540239, num_features - 16, AUC_score = 0.90836171356874


 24%|██▍       | 6/25 [00:41<02:14,  7.10s/it]

F1_score - 0.9268971302323933, num_features - 16, AUC_score = 0.9048384567126725


 28%|██▊       | 7/25 [00:49<02:12,  7.38s/it]

F1_score - 0.9267946579421356, num_features - 31, AUC_score = 0.9176039612833842


 32%|███▏      | 8/25 [00:58<02:11,  7.76s/it]

F1_score - 0.9329922027290448, num_features - 31, AUC_score = 0.9180744757124933


 36%|███▌      | 9/25 [01:07<02:10,  8.19s/it]

F1_score - 0.9247648902821317, num_features - 41, AUC_score = 0.9174471231403479


 40%|████      | 10/25 [01:16<02:08,  8.54s/it]

F1_score - 0.9267946579421356, num_features - 41, AUC_score = 0.9176039612833842


 44%|████▍     | 11/25 [01:26<02:04,  8.86s/it]

F1_score - 0.9315175832361914, num_features - 41, AUC_score = 0.9222250851407061


 48%|████▊     | 12/25 [01:35<01:58,  9.13s/it]

F1_score - 0.9288423001618658, num_features - 41, AUC_score = 0.9177607994264206


 52%|█████▏    | 13/25 [01:46<01:53,  9.49s/it]

F1_score - 0.9335861468177414, num_features - 41, AUC_score = 0.9223819232837426


 56%|█████▌    | 14/25 [01:57<01:49,  9.98s/it]

F1_score - 0.9335861468177414, num_features - 66, AUC_score = 0.9223819232837426


 60%|██████    | 15/25 [02:09<01:45, 10.51s/it]

F1_score - 0.9356730254831562, num_features - 66, AUC_score = 0.9225387614267789


 64%|██████▍   | 16/25 [02:21<01:39, 11.06s/it]

F1_score - 0.9275576538631527, num_features - 76, AUC_score = 0.9091459042839218


 68%|██████▊   | 17/25 [02:34<01:33, 11.68s/it]

F1_score - 0.9315175832361914, num_features - 76, AUC_score = 0.9222250851407061


 72%|███████▏  | 18/25 [02:47<01:24, 12.10s/it]

F1_score - 0.9377784739626844, num_features - 76, AUC_score = 0.9226955995698154


 76%|███████▌  | 19/25 [03:01<01:14, 12.49s/it]

F1_score - 0.9356730254831562, num_features - 91, AUC_score = 0.9225387614267789


 80%|████████  | 20/25 [03:14<01:03, 12.75s/it]

F1_score - 0.9309080644078345, num_features - 91, AUC_score = 0.9179176375694569


 84%|████████▍ | 21/25 [03:27<00:51, 12.94s/it]

F1_score - 0.935094971863561, num_features - 91, AUC_score = 0.9182313138555297


 88%|████████▊ | 22/25 [03:41<00:39, 13.23s/it]

F1_score - 0.9399027517255195, num_features - 91, AUC_score = 0.9228524377128516


 92%|█████████▏| 23/25 [03:56<00:27, 13.56s/it]

F1_score - 0.9366441795347409, num_features - 111, AUC_score = 0.9140807044273167


 96%|█████████▌| 24/25 [04:09<00:13, 13.66s/it]

F1_score - 0.9399027517255195, num_features - 111, AUC_score = 0.9228524377128516


100%|██████████| 25/25 [04:24<00:00, 13.99s/it]


Best F1_score - 0.9399027517255195, num_features - 111
