In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import optuna
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import pickle

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e3/train.csv")
train_original = pd.read_csv("/kaggle/input/steel-fault/faults.csv")

train.drop(columns=['id'], axis=1, inplace=True)
target_cols = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
train = train[train[target_cols].sum(axis=1) == 1]

In [3]:
test = pd.concat([train_original, train], axis=0)

In [4]:
def feature_generator(data):

    "A function to generate additional features"

    epsilon = 1e-6  # A small constant to avoid division by zero or taking the logarithm of zero
    # Location Features
    data['X_Distance'] = data['X_Maximum'] - data['X_Minimum']
    data['Y_Distance'] = data['Y_Maximum'] - data['Y_Minimum']

    # Density Feature
    data['Density'] = data['Pixels_Areas'] / (data['X_Perimeter'] + data['Y_Perimeter'])

    # Relative Perimeter Feature
    data['Relative_Perimeter'] = data['X_Perimeter'] / (data['X_Perimeter'] + data['Y_Perimeter'] + epsilon)

    # Circularity Feature
    data['Circularity'] = data['Pixels_Areas'] / (data['X_Perimeter'] ** 2)

    # Symmetry Index Feature
    data['Symmetry_Index'] = np.abs(data['X_Distance'] - data['Y_Distance']) / (data['X_Distance'] + data['Y_Distance'] + epsilon)
    
    # Color Contrast Feature
    data['Color_Contrast'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']

    # Combined Geometric Index Feature
    data['Combined_Geometric_Index'] = data['Edges_Index'] * data['Square_Index']

    # Interaction Term Feature
    data['X_Distance*Pixels_Areas'] = data['X_Distance'] * data['Pixels_Areas']
    
    # Additional Features
    data['sin_orientation'] = np.sin(data['Orientation_Index'])
    data['Edges_Index2'] = np.exp(data['Edges_Index'] + epsilon)
    data['X_Maximum2'] = np.sin(data['X_Maximum'])
    data['Y_Minimum2'] = np.sin(data['Y_Minimum'])
    data['Aspect_Ratio_Pixels'] = np.where(data['Y_Perimeter'] == 0, 0, data['X_Perimeter'] / data['Y_Perimeter'])
    data['Aspect_Ratio'] = np.where(data['Y_Distance'] == 0, 0, data['X_Distance'] / data['Y_Distance'])

    # Average Luminosity Feature
    data['Average_Luminosity'] = (data['Sum_of_Luminosity'] + data['Minimum_of_Luminosity']) / 2
    
    # Normalized Steel Thickness Feature
    data['Normalized_Steel_Thickness'] = (data['Steel_Plate_Thickness'] - data['Steel_Plate_Thickness'].min()) / (data['Steel_Plate_Thickness'].max() - data['Steel_Plate_Thickness'].min())

    # Logarithmic Features
    data['Log_Perimeter'] = np.log(data['X_Perimeter'] + data['Y_Perimeter'] + epsilon)
    data['Log_Luminosity'] = np.log(data['Sum_of_Luminosity'] + epsilon)
    data['Log_Aspect_Ratio'] = np.log(data['Aspect_Ratio'] ** 2 + epsilon)

    # Statistical Features
    data['Combined_Index'] = data['Orientation_Index'] * data['Luminosity_Index']
    data['Sigmoid_Areas'] = 1 / (1 + np.exp(-data['LogOfAreas'] + epsilon))

    return data

test = feature_generator(test)

In [5]:
# Initialize the scaler
scaler = RobustScaler()

# Identify discrete features
discrete_features = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# Identify continuous features
continuous_features = [col for col in test.columns if col not in discrete_features + target_cols]

# Fit and transform the scaler on training data
test[continuous_features] = scaler.fit_transform(test[continuous_features])

In [6]:
X = test.drop(columns=target_cols, axis=1)
y1 = test['Pastry']
y2 = test['Z_Scratch']
y3 = test['K_Scatch']
y4 = test['Stains']
y5 = test['Dirtiness']
y6 = test['Bumps']
y7 = test['Other_Faults']

In [7]:
X_train_Pastry, X_test_pastry, y_train_Pastry, y_test_pastry = train_test_split(X, y1, test_size=0.3, random_state=24)
X_train_Z_Scratch, X_test_Z_Scratch, y_train_Z_Scratch, y_test_Z_Scratch = train_test_split(X, y2, test_size=0.3, random_state=24)
X_train_K_Scatch, X_test_K_Scatch, y_train_K_Scatch, y_test_K_Scatch = train_test_split(X, y3, test_size=0.3, random_state=24)
X_train_Stains, X_test_Stains, y_train_Stains, y_test_Stains = train_test_split(X, y4, test_size=0.3, random_state=24)
X_train_Dirtiness, X_test_Dirtiness, y_train_Dirtiness, y_test_Dirtiness = train_test_split(X, y5, test_size=0.3, random_state=24)
X_train_Bumps, X_test_Bumps, y_train_Bumps, y_test_Bumps = train_test_split(X, y6, test_size=0.3, random_state=24)
X_train_Other_Faults, X_test_Other_Faults, y_train_Other_Faults, y_test_Other_Faults = train_test_split(X, y7, test_size=0.3, random_state=24)

In [8]:
# Assuming target_splits is the dictionary containing splits for each target
target_splits = {
    'Pastry': (X_train_Pastry, X_test_pastry, y_train_Pastry, y_test_pastry),
    'Z_Scratch': (X_train_Z_Scratch, X_test_Z_Scratch, y_train_Z_Scratch, y_test_Z_Scratch),
    'K_Scatch': (X_train_K_Scatch, X_test_K_Scatch, y_train_K_Scatch, y_test_K_Scatch),
    'Stains': (X_train_Stains, X_test_Stains, y_train_Stains, y_test_Stains),
    'Dirtiness': (X_train_Dirtiness, X_test_Dirtiness, y_train_Dirtiness, y_test_Dirtiness),
    'Bumps': (X_train_Bumps, X_test_Bumps, y_train_Bumps, y_test_Bumps),
    'Other_Faults': (X_train_Other_Faults, X_test_Other_Faults, y_train_Other_Faults, y_test_Other_Faults)
}

In [9]:
# Define the objective function
def objective_cat(trial, X_train, X_test, y_train, y_test):
    # Define the search space for CatBoost hyperparameters
    cat_params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 1, 10),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.05, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100)
    }
    model_cat = CatBoostClassifier(**cat_params)
    model_cat.fit(X_train, y_train, verbose=False)
    predictions_cat = model_cat.predict(X_test)
    accuracy_cat = roc_auc_score(y_test, predictions_cat)
    return accuracy_cat

# Dictionary to store best models for each target
best_models_cat = {}

# Loop over each target
for target, (X_train, X_test, y_train, y_test) in target_splits.items():
    print(f"Tuning catBoost for target: {target}")
    study_cat = optuna.create_study(direction='maximize')
    study_cat.optimize(lambda trial: objective_cat(trial, X_train, X_test, y_train, y_test), n_trials=30)
    print('Best hyperparameters:', study_cat.best_params)
    print('Best accuracy:', study_cat.best_value)

    # Define the best hyperparameters obtained from Optuna
    best_params_cat = study_cat.best_params

    # Instantiate CatBoostClassifier with the best hyperparameters
    best_cat_model = CatBoostClassifier(**best_params_cat, silent=True)

    # Fit the model on the training data
    best_cat_model.fit(X_train, y_train)

    # Save the model to a file
    model_filename = f"best_cat_model_{target}.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(best_cat_model, f)

    # Store the model in the dictionary
    best_models_cat[target] = best_cat_model

[I 2024-03-22 07:19:17,466] A new study created in memory with name: no-name-3f07d658-5e47-4260-828d-866cf2e8c62d


Tuning catBoost for target: Pastry


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:19:22,984] Trial 0 finished with value: 0.5357799899193548 and parameters: {'iterations': 812, 'learning_rate': 0.010717023490815806, 'depth': 4, 'colsample_bylevel': 0.9629101163702997, 'min_data_in_leaf': 90}. Best is trial 0 with value: 0.5357799899193548.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:19:23,588] Trial 1 finished with value: 0.5 and parameters: {'iterations': 176, 'learning_rate': 0.024457246970361116, 'depth': 1, 'colsample_bylevel': 0.6209023708827925, 'min_data_in_leaf': 25}. Best is trial 0 with value: 0.5357799899193548.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:19:36,346] Trial 2 finished with value: 0.5307488315615836 and parameters: {'iterations': 363, 'learning_rate': 0.009467406752512167, 'depth': 8, 'colsample_bylevel': 0.7322663480425987, 'min_data_in_leaf': 19}. Best is tr

Best hyperparameters: {'iterations': 802, 'learning_rate': 0.05678722412985588, 'depth': 4, 'colsample_bylevel': 0.9983570273594571, 'min_data_in_leaf': 44}
Best accuracy: 0.5718362429740959


[I 2024-03-22 07:23:32,444] A new study created in memory with name: no-name-96d4d9a7-80f5-4e47-af06-7a5bdfa76910


Tuning catBoost for target: Z_Scratch


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:23:33,025] Trial 0 finished with value: 0.5 and parameters: {'iterations': 147, 'learning_rate': 0.0012776208670579382, 'depth': 2, 'colsample_bylevel': 0.15834735195771082, 'min_data_in_leaf': 82}. Best is trial 0 with value: 0.5.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:24:00,353] Trial 1 finished with value: 0.7680838520853364 and parameters: {'iterations': 388, 'learning_rate': 0.009370012841881386, 'depth': 10, 'colsample_bylevel': 0.8164083365119765, 'min_data_in_leaf': 59}. Best is trial 1 with value: 0.7680838520853364.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:24:19,757] Trial 2 finished with value: 0.7867916516842758 and parameters: {'iterations': 874, 'learning_rate': 0.013132491661405808, 'depth': 7, 'colsample_bylevel': 0.8700355075699263, 'min_data_in_leaf': 64}. Best is trial 2 with v

Best hyperparameters: {'iterations': 736, 'learning_rate': 0.022923681272539378, 'depth': 7, 'colsample_bylevel': 0.6803981283970291, 'min_data_in_leaf': 31}
Best accuracy: 0.7893830740210867


[I 2024-03-22 07:29:14,353] A new study created in memory with name: no-name-64b4b417-4982-43c0-97df-61d6ebcd0c35


Tuning catBoost for target: K_Scatch


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:29:20,581] Trial 0 finished with value: 0.9452923826977182 and parameters: {'iterations': 112, 'learning_rate': 0.010483307945068068, 'depth': 9, 'colsample_bylevel': 0.6422751869012349, 'min_data_in_leaf': 13}. Best is trial 0 with value: 0.9452923826977182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:29:21,659] Trial 1 finished with value: 0.9458703059923742 and parameters: {'iterations': 313, 'learning_rate': 0.013843168636461118, 'depth': 2, 'colsample_bylevel': 0.2606045200797125, 'min_data_in_leaf': 65}. Best is trial 1 with value: 0.9458703059923742.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:29:30,957] Trial 2 finished with value: 0.9463104468374447 and parameters: {'iterations': 497, 'learning_rate': 0.011362406585966446, 'depth': 9, 'colsample_bylevel': 0.0891817848639715, 'min_data_in_leaf': 

Best hyperparameters: {'iterations': 270, 'learning_rate': 0.034010414270675714, 'depth': 3, 'colsample_bylevel': 0.959531000297769, 'min_data_in_leaf': 28}
Best accuracy: 0.949492218884766


[I 2024-03-22 07:32:05,431] A new study created in memory with name: no-name-9bb719dd-8811-4383-b8b3-f99e1e8d4d99


Tuning catBoost for target: Stains


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:32:09,630] Trial 0 finished with value: 0.8473476641872361 and parameters: {'iterations': 295, 'learning_rate': 0.004049051441739557, 'depth': 7, 'colsample_bylevel': 0.22081025667617482, 'min_data_in_leaf': 55}. Best is trial 0 with value: 0.8473476641872361.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:32:16,899] Trial 1 finished with value: 0.8759024999863567 and parameters: {'iterations': 265, 'learning_rate': 0.045040360180877724, 'depth': 8, 'colsample_bylevel': 0.3918093247380377, 'min_data_in_leaf': 72}. Best is trial 1 with value: 0.8759024999863567.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:32:27,920] Trial 2 finished with value: 0.8659956923759785 and parameters: {'iterations': 478, 'learning_rate': 0.003695479844331286, 'depth': 8, 'colsample_bylevel': 0.27964981255511645, 'min_data_in_leaf'

Best hyperparameters: {'iterations': 152, 'learning_rate': 0.05863176526516703, 'depth': 7, 'colsample_bylevel': 0.482063332503265, 'min_data_in_leaf': 29}
Best accuracy: 0.8842207802838884


[I 2024-03-22 07:36:13,265] A new study created in memory with name: no-name-83ed13be-e5d3-4205-a569-ff1e0335af90


Tuning catBoost for target: Dirtiness


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:36:21,191] Trial 0 finished with value: 0.5765795558966386 and parameters: {'iterations': 696, 'learning_rate': 0.015374548195333045, 'depth': 6, 'colsample_bylevel': 0.9741357312495873, 'min_data_in_leaf': 12}. Best is trial 0 with value: 0.5765795558966386.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:36:22,981] Trial 1 finished with value: 0.5592900202966747 and parameters: {'iterations': 394, 'learning_rate': 0.015783859058028815, 'depth': 3, 'colsample_bylevel': 0.35423544690202685, 'min_data_in_leaf': 48}. Best is trial 0 with value: 0.5765795558966386.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:36:24,764] Trial 2 finished with value: 0.5764109218325575 and parameters: {'iterations': 237, 'learning_rate': 0.08217874214312754, 'depth': 4, 'colsample_bylevel': 0.5945363370878708, 'min_data_in_leaf': 

Best hyperparameters: {'iterations': 644, 'learning_rate': 0.038431940012130274, 'depth': 5, 'colsample_bylevel': 0.9896581623124204, 'min_data_in_leaf': 7}
Best accuracy: 0.5850556896325393


[I 2024-03-22 07:39:41,474] A new study created in memory with name: no-name-20026ca3-ef43-4666-8d1b-4fde83fa35a5


Tuning catBoost for target: Bumps


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:39:45,993] Trial 0 finished with value: 0.5318491738880237 and parameters: {'iterations': 939, 'learning_rate': 0.0010822987759161308, 'depth': 5, 'colsample_bylevel': 0.1519299993482784, 'min_data_in_leaf': 44}. Best is trial 0 with value: 0.5318491738880237.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:39:47,201] Trial 1 finished with value: 0.5546063322823146 and parameters: {'iterations': 462, 'learning_rate': 0.01789602073516144, 'depth': 1, 'colsample_bylevel': 0.23611890526039808, 'min_data_in_leaf': 68}. Best is trial 1 with value: 0.5546063322823146.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:40:32,479] Trial 2 finished with value: 0.6162447818397471 and parameters: {'iterations': 672, 'learning_rate': 0.00230508807976584, 'depth': 10, 'colsample_bylevel': 0.8517489472279165, 'min_data_in_leaf':

Best hyperparameters: {'iterations': 787, 'learning_rate': 0.09489892690258873, 'depth': 4, 'colsample_bylevel': 0.37248411701882106, 'min_data_in_leaf': 8}
Best accuracy: 0.6614115026753844


[I 2024-03-22 07:43:42,237] A new study created in memory with name: no-name-c174dce1-7437-4d03-9187-f2dac78b1384


Tuning catBoost for target: Other_Faults


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:43:45,452] Trial 0 finished with value: 0.6020451081621548 and parameters: {'iterations': 259, 'learning_rate': 0.01785546878627443, 'depth': 6, 'colsample_bylevel': 0.8837215747618214, 'min_data_in_leaf': 50}. Best is trial 0 with value: 0.6020451081621548.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:44:00,226] Trial 1 finished with value: 0.617414235142156 and parameters: {'iterations': 186, 'learning_rate': 0.02520329890706095, 'depth': 10, 'colsample_bylevel': 0.9366187396126514, 'min_data_in_leaf': 58}. Best is trial 1 with value: 0.617414235142156.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2024-03-22 07:44:00,634] Trial 2 finished with value: 0.5 and parameters: {'iterations': 127, 'learning_rate': 0.001447459934858467, 'depth': 1, 'colsample_bylevel': 0.3379315063655902, 'min_data_in_leaf': 46}. Best is trial

Best hyperparameters: {'iterations': 909, 'learning_rate': 0.054022829202738445, 'depth': 7, 'colsample_bylevel': 0.8443558887299824, 'min_data_in_leaf': 27}
Best accuracy: 0.6227254261337908
