# Load libraries

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('src'), '..')))

import pandas as pd
from sklearn import metrics
from tqdm import tqdm
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool
from src.sampling_methods.sampler import DataSampler
from typing import Optional, Literal

import warnings
warnings.filterwarnings("ignore")

In [2]:
def check_result(model,X_test,y_test):
  metrics_dict = {}
  fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict(X_test))
  auc_score = metrics.auc(fpr, tpr)
  f1_sc = metrics.f1_score(y_test, model.predict(X_test), average='macro')
  metrics_dict['auc_score'] = auc_score
  metrics_dict['f1_score'] = f1_sc
  return metrics_dict

In [3]:
def search_num_features(df, feature_importance, upsamp_func: Optional[Literal['ROS', 'RUS', 'SMOTE', 'OSS']] = None, p = 0.3, step = 5):
  best_score = 0
  best_num_features = 0

  for num_col in tqdm(range(1, len(feature_importance), step)):
    features = list(feature_importance.iloc[:num_col,:]['feature_names'])
    data = df[features + ['Machine failure']]
    X_train, X_test, y_train, y_test = train_test_split(data.drop(columns = ['Machine failure']),
                                                        data['Machine failure'],
                                                        test_size=0.33,
                                                        random_state=42,
                                                        stratify = df['Machine failure'])
    
    if upsamp_func != 'ROS':
      X_train, y_train = DataSampler.ROS(X_train, y_train, p, separation = True)
    elif upsamp_func != 'RUS':
      X_train, y_train = DataSampler.RUS(X_train, y_train, p, separation = True)
    elif upsamp_func != 'SMOTE':
      X_train, y_train = DataSampler.SMOTE(X_train, y_train, p, separation = True)
    elif upsamp_func != 'OSS':
      X_train, y_train = DataSampler.OSS(X_train, y_train, separation = True)
    
    train_pool = Pool(data=X_train, label=y_train)
    CatBoost = CatBoostClassifier(verbose=False,random_seed=42)
    CatBoost.fit(train_pool)
    metrics_dict = check_result(CatBoost, X_test, y_test)
    print(f'F1_score - {metrics_dict["f1_score"]}, num_features - {best_num_features}, AUC_score = {metrics_dict["auc_score"]}')
    if metrics_dict['f1_score'] > best_score:
      best_score = metrics_dict['f1_score']
      best_num_features = num_col
  print(f'Best F1_score - {best_score}, num_features - {best_num_features}')


# Read data

In [4]:
df = pd.read_csv('../data/prepared_data.csv')
df_fe = pd.read_csv('../data/data_fe.csv')

# Train Catboost

## Test data without upsampling

### Find feature importance

In [5]:
CatBoost = CatBoostClassifier(verbose=False,random_seed=42)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_fe.drop(columns = ['Machine failure']),
                                                    df_fe['Machine failure'],
                                                    test_size=0.33,
                                                    random_state=42,
                                                    stratify = df['Machine failure'])

train_pool = Pool(data=X_train, label=y_train)

In [7]:
CatBoost.fit(train_pool)

feature_importance = pd.DataFrame({'feature_importance': CatBoost.get_feature_importance(train_pool), 
              'feature_names': X_test.columns}).sort_values(by=['feature_importance'], 
                                                       ascending=False)
              
check_result(CatBoost, X_test, y_test )

{'auc_score': 0.9003741709983868, 'f1_score': 0.9241142395957508}

In [8]:
feature_importance.to_csv('../data/feature_importance/feature_importance.csv', index = False)

### Find best amount of features

In [9]:
search_num_features(df_fe,feature_importance, step = 5)

  4%|▍         | 1/25 [00:02<00:55,  2.33s/it]

F1_score - 0.6549366900395834, num_features - 0, AUC_score = 0.6799605664097508


  8%|▊         | 2/25 [00:04<00:55,  2.42s/it]

F1_score - 0.790241794004729, num_features - 1, AUC_score = 0.7877419788492562


 12%|█▏        | 3/25 [00:07<00:55,  2.52s/it]

F1_score - 0.9098669135383634, num_features - 6, AUC_score = 0.8992763039971321


 16%|█▌        | 4/25 [00:10<00:55,  2.66s/it]

F1_score - 0.9227281641206964, num_features - 11, AUC_score = 0.9045247804265998


 20%|██        | 5/25 [00:13<00:56,  2.81s/it]

F1_score - 0.9153605015673982, num_features - 16, AUC_score = 0.9082048754257035


 24%|██▍       | 6/25 [00:16<00:57,  3.00s/it]

F1_score - 0.9220717326480798, num_features - 16, AUC_score = 0.912982837426062


 28%|██▊       | 7/25 [00:20<00:58,  3.25s/it]

F1_score - 0.9302871920016027, num_features - 16, AUC_score = 0.9136101899982075


 32%|███▏      | 8/25 [00:24<00:59,  3.53s/it]

F1_score - 0.9420461230905222, num_features - 31, AUC_score = 0.9230092758558881


 36%|███▌      | 9/25 [00:28<00:59,  3.74s/it]

F1_score - 0.9420461230905222, num_features - 36, AUC_score = 0.9230092758558881


 40%|████      | 10/25 [00:33<00:59,  3.97s/it]

F1_score - 0.9377784739626844, num_features - 36, AUC_score = 0.9226955995698154


 44%|████▍     | 11/25 [00:37<00:57,  4.14s/it]

F1_score - 0.9468756128329787, num_features - 36, AUC_score = 0.9276303997132103


 48%|████▊     | 12/25 [00:42<00:55,  4.28s/it]

F1_score - 0.935094971863561, num_features - 51, AUC_score = 0.9182313138555297


 52%|█████▏    | 13/25 [00:47<00:54,  4.52s/it]

F1_score - 0.935094971863561, num_features - 51, AUC_score = 0.9182313138555297


 56%|█████▌    | 14/25 [00:53<00:55,  5.01s/it]

F1_score - 0.953911664759183, num_features - 51, AUC_score = 0.9324083617135689


 60%|██████    | 15/25 [01:00<00:56,  5.61s/it]

F1_score - 0.9388018061256055, num_features - 66, AUC_score = 0.9142375425703531


 64%|██████▍   | 16/25 [01:07<00:54,  6.06s/it]

F1_score - 0.9431766711411607, num_features - 66, AUC_score = 0.9145512188564259


 68%|██████▊   | 17/25 [01:14<00:51,  6.38s/it]

F1_score - 0.9329922027290448, num_features - 66, AUC_score = 0.9180744757124933


 72%|███████▏  | 18/25 [01:22<00:47,  6.74s/it]

F1_score - 0.9323871436056093, num_features - 66, AUC_score = 0.913767028141244


 76%|███████▌  | 19/25 [01:29<00:41,  6.97s/it]

F1_score - 0.9388018061256055, num_features - 66, AUC_score = 0.9142375425703531


 80%|████████  | 20/25 [01:37<00:36,  7.23s/it]

F1_score - 0.9399027517255195, num_features - 66, AUC_score = 0.9228524377128516


 84%|████████▍ | 21/25 [01:45<00:29,  7.48s/it]

F1_score - 0.9399027517255195, num_features - 66, AUC_score = 0.9228524377128516


 88%|████████▊ | 22/25 [01:53<00:22,  7.66s/it]

F1_score - 0.9377784739626844, num_features - 66, AUC_score = 0.9226955995698154


 92%|█████████▏| 23/25 [02:01<00:15,  7.76s/it]

F1_score - 0.9393574536305039, num_features - 66, AUC_score = 0.9185449901416025


 96%|█████████▌| 24/25 [02:10<00:07,  7.98s/it]

F1_score - 0.9399027517255195, num_features - 66, AUC_score = 0.9228524377128516


100%|██████████| 25/25 [02:19<00:00,  5.57s/it]

F1_score - 0.9442088573400635, num_features - 66, AUC_score = 0.9231661139989245
Best F1_score - 0.953911664759183, num_features - 66





## Test data with upsampling

### Find feature importance

In [10]:
CatBoost = CatBoostClassifier(verbose=False,random_seed=42)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df_fe.drop(columns = ['Machine failure']),
                                                    df_fe['Machine failure'],
                                                    test_size=0.33,
                                                    random_state=42,
                                                    stratify = df['Machine failure'])
X_train, y_train = DataSampler.ROS(X_train, y_train, 1, separation = True)

train_pool = Pool(data=X_train, label=y_train)

In [13]:
CatBoost.fit(train_pool)

feature_importance = pd.DataFrame({'feature_importance': CatBoost.get_feature_importance(train_pool), 
              'feature_names': X_test.columns}).sort_values(by=['feature_importance'], 
                                                       ascending=False)
              
check_result(CatBoost, X_test, y_test)

{'auc_score': 0.9182313138555297, 'f1_score': 0.935094971863561}

In [14]:
feature_importance.to_csv('../data/feature_importance/feature_importance_ROS.csv', index = False)

### Find best amount of features

In [15]:
search_num_features(df_fe, feature_importance, upsamp_func = 'ROS', step = 5)

  4%|▍         | 1/25 [00:02<00:51,  2.14s/it]

F1_score - 0.6721066267795615, num_features - 0, AUC_score = 0.6149735615701739


  8%|▊         | 2/25 [00:04<00:54,  2.36s/it]

F1_score - 0.8975533368805821, num_features - 1, AUC_score = 0.8855697705682022


 12%|█▏        | 3/25 [00:07<00:58,  2.66s/it]

F1_score - 0.9016320143080707, num_features - 6, AUC_score = 0.8774253898548127


 16%|█▌        | 4/25 [00:10<00:55,  2.64s/it]

F1_score - 0.9007172427072301, num_features - 11, AUC_score = 0.8731179422835634


 20%|██        | 5/25 [00:13<00:56,  2.81s/it]

F1_score - 0.9079029567659125, num_features - 11, AUC_score = 0.8991194658540959


 24%|██▍       | 6/25 [00:16<00:54,  2.89s/it]

F1_score - 0.9206716322776858, num_features - 21, AUC_score = 0.9043679422835634


 28%|██▊       | 7/25 [00:20<00:56,  3.13s/it]

F1_score - 0.9062482926918103, num_features - 26, AUC_score = 0.890504570711597


 32%|███▏      | 8/25 [00:24<00:57,  3.40s/it]

F1_score - 0.9345060499209441, num_features - 26, AUC_score = 0.9139238662842804


 36%|███▌      | 9/25 [00:28<00:57,  3.62s/it]

F1_score - 0.9186333890281259, num_features - 36, AUC_score = 0.904211104140527


 40%|████      | 10/25 [00:32<00:58,  3.87s/it]

F1_score - 0.9302871920016027, num_features - 36, AUC_score = 0.9136101899982075


 44%|████▍     | 11/25 [00:37<00:57,  4.08s/it]

F1_score - 0.9393574536305039, num_features - 36, AUC_score = 0.9185449901416025


 48%|████▊     | 12/25 [00:42<00:56,  4.35s/it]

F1_score - 0.9171500809300457, num_features - 51, AUC_score = 0.8955962089980284


 52%|█████▏    | 13/25 [00:47<00:54,  4.58s/it]

F1_score - 0.9248032425018249, num_features - 51, AUC_score = 0.9046816185696361


 56%|█████▌    | 14/25 [00:52<00:51,  4.66s/it]

F1_score - 0.9309080644078345, num_features - 51, AUC_score = 0.9179176375694569


 60%|██████    | 15/25 [00:57<00:48,  4.82s/it]

F1_score - 0.9283531940475505, num_features - 51, AUC_score = 0.9006878472844596


 64%|██████▍   | 16/25 [01:02<00:44,  4.98s/it]

F1_score - 0.919951838792265, num_features - 51, AUC_score = 0.900060494712314


 68%|██████▊   | 17/25 [01:08<00:40,  5.12s/it]

F1_score - 0.9220236055812194, num_features - 51, AUC_score = 0.9002173328553504


 72%|███████▏  | 18/25 [01:13<00:36,  5.28s/it]

F1_score - 0.9409792085848425, num_features - 51, AUC_score = 0.9143943807133895


 76%|███████▌  | 19/25 [01:19<00:33,  5.56s/it]

F1_score - 0.9282059314954052, num_features - 86, AUC_score = 0.9134533518551712


 80%|████████  | 20/25 [01:27<00:31,  6.30s/it]

F1_score - 0.9302871920016027, num_features - 86, AUC_score = 0.9136101899982075


 84%|████████▍ | 21/25 [01:36<00:27,  6.99s/it]

F1_score - 0.9309080644078345, num_features - 86, AUC_score = 0.9179176375694569


 88%|████████▊ | 22/25 [01:45<00:22,  7.48s/it]

F1_score - 0.933905950615654, num_features - 86, AUC_score = 0.909616418713031


 92%|█████████▏| 23/25 [01:53<00:15,  7.68s/it]

F1_score - 0.9296546462113844, num_features - 86, AUC_score = 0.9093027424269583


 96%|█████████▌| 24/25 [02:01<00:07,  7.74s/it]

F1_score - 0.926224010731053, num_features - 86, AUC_score = 0.9005310091414231


100%|██████████| 25/25 [02:09<00:00,  5.17s/it]

F1_score - 0.9372166333480656, num_features - 86, AUC_score = 0.9183881519985662
Best F1_score - 0.9409792085848425, num_features - 86



