In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier 
import optuna
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import pickle

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e3/train.csv")
train_original = pd.read_csv("/kaggle/input/original/faults.csv")

train.drop(columns=['id'], axis=1, inplace=True)
target_cols = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
train = train[train[target_cols].sum(axis=1) == 1]

In [3]:
test = pd.concat([train_original, train], axis=0)

In [4]:
def feature_generator(data):

    "A function to generate additional features"

    epsilon = 1e-6  # A small constant to avoid division by zero or taking the logarithm of zero
    # Location Features
    data['X_Distance'] = data['X_Maximum'] - data['X_Minimum']
    data['Y_Distance'] = data['Y_Maximum'] - data['Y_Minimum']

    # Density Feature
    data['Density'] = data['Pixels_Areas'] / (data['X_Perimeter'] + data['Y_Perimeter'])

    # Relative Perimeter Feature
    data['Relative_Perimeter'] = data['X_Perimeter'] / (data['X_Perimeter'] + data['Y_Perimeter'] + epsilon)

    # Circularity Feature
    data['Circularity'] = data['Pixels_Areas'] / (data['X_Perimeter'] ** 2)

    # Symmetry Index Feature
    data['Symmetry_Index'] = np.abs(data['X_Distance'] - data['Y_Distance']) / (data['X_Distance'] + data['Y_Distance'] + epsilon)
    
    # Color Contrast Feature
    data['Color_Contrast'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']

    # Combined Geometric Index Feature
    data['Combined_Geometric_Index'] = data['Edges_Index'] * data['Square_Index']

    # Interaction Term Feature
    data['X_Distance*Pixels_Areas'] = data['X_Distance'] * data['Pixels_Areas']
    
    # Additional Features
    data['sin_orientation'] = np.sin(data['Orientation_Index'])
    data['Edges_Index2'] = np.exp(data['Edges_Index'] + epsilon)
    data['X_Maximum2'] = np.sin(data['X_Maximum'])
    data['Y_Minimum2'] = np.sin(data['Y_Minimum'])
    data['Aspect_Ratio_Pixels'] = np.where(data['Y_Perimeter'] == 0, 0, data['X_Perimeter'] / data['Y_Perimeter'])
    data['Aspect_Ratio'] = np.where(data['Y_Distance'] == 0, 0, data['X_Distance'] / data['Y_Distance'])

    # Average Luminosity Feature
    data['Average_Luminosity'] = (data['Sum_of_Luminosity'] + data['Minimum_of_Luminosity']) / 2
    
    # Normalized Steel Thickness Feature
    data['Normalized_Steel_Thickness'] = (data['Steel_Plate_Thickness'] - data['Steel_Plate_Thickness'].min()) / (data['Steel_Plate_Thickness'].max() - data['Steel_Plate_Thickness'].min())

    # Logarithmic Features
    data['Log_Perimeter'] = np.log(data['X_Perimeter'] + data['Y_Perimeter'] + epsilon)
    data['Log_Luminosity'] = np.log(data['Sum_of_Luminosity'] + epsilon)
    data['Log_Aspect_Ratio'] = np.log(data['Aspect_Ratio'] ** 2 + epsilon)

    # Statistical Features
    data['Combined_Index'] = data['Orientation_Index'] * data['Luminosity_Index']
    data['Sigmoid_Areas'] = 1 / (1 + np.exp(-data['LogOfAreas'] + epsilon))

    return data

test = feature_generator(test)

In [5]:
# Initialize the scaler
scaler = RobustScaler()

# Identify discrete features
discrete_features = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# Identify continuous features
continuous_features = [col for col in test.columns if col not in discrete_features + target_cols]

# Fit and transform the scaler on training data
test[continuous_features] = scaler.fit_transform(test[continuous_features])

In [6]:
X = test.drop(columns=target_cols, axis=1)
y1 = test['Pastry']
y2 = test['Z_Scratch']
y3 = test['K_Scatch']
y4 = test['Stains']
y5 = test['Dirtiness']
y6 = test['Bumps']
y7 = test['Other_Faults']

In [7]:
X_train_Pastry, X_test_pastry, y_train_Pastry, y_test_pastry = train_test_split(X, y1, test_size=0.3, random_state=24)
X_train_Z_Scratch, X_test_Z_Scratch, y_train_Z_Scratch, y_test_Z_Scratch = train_test_split(X, y2, test_size=0.3, random_state=24)
X_train_K_Scatch, X_test_K_Scatch, y_train_K_Scatch, y_test_K_Scatch = train_test_split(X, y3, test_size=0.3, random_state=24)
X_train_Stains, X_test_Stains, y_train_Stains, y_test_Stains = train_test_split(X, y4, test_size=0.3, random_state=24)
X_train_Dirtiness, X_test_Dirtiness, y_train_Dirtiness, y_test_Dirtiness = train_test_split(X, y5, test_size=0.3, random_state=24)
X_train_Bumps, X_test_Bumps, y_train_Bumps, y_test_Bumps = train_test_split(X, y6, test_size=0.3, random_state=24)
X_train_Other_Faults, X_test_Other_Faults, y_train_Other_Faults, y_test_Other_Faults = train_test_split(X, y7, test_size=0.3, random_state=24)

In [8]:
# Assuming target_splits is the dictionary containing splits for each target
target_splits = {
    'Pastry': (X_train_Pastry, X_test_pastry, y_train_Pastry, y_test_pastry),
    'Z_Scratch': (X_train_Z_Scratch, X_test_Z_Scratch, y_train_Z_Scratch, y_test_Z_Scratch),
    'K_Scatch': (X_train_K_Scatch, X_test_K_Scatch, y_train_K_Scatch, y_test_K_Scatch),
    'Stains': (X_train_Stains, X_test_Stains, y_train_Stains, y_test_Stains),
    'Dirtiness': (X_train_Dirtiness, X_test_Dirtiness, y_train_Dirtiness, y_test_Dirtiness),
    'Bumps': (X_train_Bumps, X_test_Bumps, y_train_Bumps, y_test_Bumps),
    'Other_Faults': (X_train_Other_Faults, X_test_Other_Faults, y_train_Other_Faults, y_test_Other_Faults)
}

In [9]:
# Define the objective function for XGBoost
def objective_xgb(trial, X_train, X_test, y_train, y_test):
    params = {
        "objective": "binary:logistic",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "eval_metric": 'auc',
    }

    model_xgb = XGBClassifier(**params)
    model_xgb.fit(X_train, y_train, verbose=False)
    predictions_xgb = model_xgb.predict(X_test)
    accuracy_xgb = roc_auc_score(y_test, predictions_xgb)
    return accuracy_xgb

# Dictionary to store best models for each target
best_models = {}

# Loop over each target
for target, (X_train, X_test, y_train, y_test) in target_splits.items():
    print(f"Tuning XGBoost for target: {target}")
    study_xgb = optuna.create_study(direction='maximize')
    study_xgb.optimize(lambda trial: objective_xgb(trial, X_train, X_test, y_train, y_test), n_trials=30)
    print('Best hyperparameters:', study_xgb.best_params)
    print('Best accuracy:', study_xgb.best_value)

    # Define the best hyperparameters obtained from Optuna
    best_params_xgb = study_xgb.best_params

    # Instantiate XGBClassifier with the best hyperparameters
    best_xgb_model = XGBClassifier(**best_params_xgb, silent=True)

    # Fit the model on the training data
    best_xgb_model.fit(X_train, y_train)

    # Save the model to a file
    model_filename = f"best_xmodel_{target}.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(best_xgb_model, f)

    # Store the model in the dictionary
    best_models[target] = best_xgb_model

[I 2024-03-22 07:19:03,682] A new study created in memory with name: no-name-99d032cd-9e15-4926-8f3e-2ae5adac33a3


Tuning XGBoost for target: Pastry


[I 2024-03-22 07:19:07,197] Trial 0 finished with value: 0.5280755513807429 and parameters: {'learning_rate': 0.004125116120145079, 'max_depth': 7, 'subsample': 0.1362532293324489, 'min_child_weight': 20}. Best is trial 0 with value: 0.5280755513807429.
[I 2024-03-22 07:19:13,413] Trial 1 finished with value: 0.5291408922898337 and parameters: {'learning_rate': 0.0024678343302905236, 'max_depth': 7, 'subsample': 0.9078645676541163, 'min_child_weight': 18}. Best is trial 1 with value: 0.5291408922898337.
[I 2024-03-22 07:19:16,850] Trial 2 finished with value: 0.577932742240958 and parameters: {'learning_rate': 0.03458306058458436, 'max_depth': 4, 'subsample': 0.7276464219661168, 'min_child_weight': 18}. Best is trial 2 with value: 0.577932742240958.
[I 2024-03-22 07:19:19,679] Trial 3 finished with value: 0.5366876298264908 and parameters: {'learning_rate': 0.011481501755318934, 'max_depth': 8, 'subsample': 0.08283174143624582, 'min_child_weight': 18}. Best is trial 2 with value: 0.577

Best hyperparameters: {'learning_rate': 0.07237673586512304, 'max_depth': 10, 'subsample': 0.36163904642587036, 'min_child_weight': 14}
Best accuracy: 0.5939239293132942


[I 2024-03-22 07:21:23,655] A new study created in memory with name: no-name-5782335d-d8e7-49f0-abda-241a6c917e4e


Tuning XGBoost for target: Z_Scratch


[I 2024-03-22 07:21:27,707] Trial 0 finished with value: 0.7682358011562581 and parameters: {'learning_rate': 0.018413402851699887, 'max_depth': 5, 'subsample': 0.41987642834499844, 'min_child_weight': 13}. Best is trial 0 with value: 0.7682358011562581.
[I 2024-03-22 07:21:31,177] Trial 1 finished with value: 0.760221603375097 and parameters: {'learning_rate': 0.0612320049181548, 'max_depth': 7, 'subsample': 0.22170396738243958, 'min_child_weight': 17}. Best is trial 0 with value: 0.7682358011562581.
[I 2024-03-22 07:21:36,396] Trial 2 finished with value: 0.753391120733844 and parameters: {'learning_rate': 0.09667678170167153, 'max_depth': 7, 'subsample': 0.5582092042635219, 'min_child_weight': 7}. Best is trial 0 with value: 0.7682358011562581.
[I 2024-03-22 07:21:41,902] Trial 3 finished with value: 0.7819367194811884 and parameters: {'learning_rate': 0.020338390121824063, 'max_depth': 7, 'subsample': 0.8621380113341255, 'min_child_weight': 9}. Best is trial 3 with value: 0.7819367

Best hyperparameters: {'learning_rate': 0.004858940512730382, 'max_depth': 8, 'subsample': 0.8977095596627609, 'min_child_weight': 1}
Best accuracy: 0.7902388766904599


[I 2024-03-22 07:24:22,148] A new study created in memory with name: no-name-b960f257-b214-4dc9-bdd5-c7d73e45a4a4


Tuning XGBoost for target: K_Scatch


[I 2024-03-22 07:24:25,646] Trial 0 finished with value: 0.9395335553940454 and parameters: {'learning_rate': 0.004014226223959587, 'max_depth': 8, 'subsample': 0.1020491225112092, 'min_child_weight': 8}. Best is trial 0 with value: 0.9395335553940454.
[I 2024-03-22 07:24:29,651] Trial 1 finished with value: 0.9405988186729731 and parameters: {'learning_rate': 0.06572350107926127, 'max_depth': 5, 'subsample': 0.6820744097497977, 'min_child_weight': 11}. Best is trial 1 with value: 0.9405988186729731.
[I 2024-03-22 07:24:33,139] Trial 2 finished with value: 0.9255128381658628 and parameters: {'learning_rate': 0.0012004244780604095, 'max_depth': 9, 'subsample': 0.22102203340111798, 'min_child_weight': 20}. Best is trial 1 with value: 0.9405988186729731.
[I 2024-03-22 07:24:40,708] Trial 3 finished with value: 0.9459073023101316 and parameters: {'learning_rate': 0.006837855453286933, 'max_depth': 9, 'subsample': 0.8291868808367941, 'min_child_weight': 6}. Best is trial 3 with value: 0.945

Best hyperparameters: {'learning_rate': 0.0034113578507053064, 'max_depth': 4, 'subsample': 0.7211187069711512, 'min_child_weight': 16}
Best accuracy: 0.9472915146594137


[I 2024-03-22 07:25:59,299] A new study created in memory with name: no-name-6ac67e5a-0be0-4c05-a109-8d38c7c835cc


Tuning XGBoost for target: Stains


[I 2024-03-22 07:26:02,582] Trial 0 finished with value: 0.8790982003663663 and parameters: {'learning_rate': 0.005426838963096274, 'max_depth': 5, 'subsample': 0.6887156476828166, 'min_child_weight': 20}. Best is trial 0 with value: 0.8790982003663663.
[I 2024-03-22 07:26:04,939] Trial 1 finished with value: 0.8790136123101999 and parameters: {'learning_rate': 0.015412276599219437, 'max_depth': 2, 'subsample': 0.4855074208062172, 'min_child_weight': 1}. Best is trial 0 with value: 0.8790982003663663.
[I 2024-03-22 07:26:11,559] Trial 2 finished with value: 0.8708645081250012 and parameters: {'learning_rate': 0.0032114765323886193, 'max_depth': 10, 'subsample': 0.7325533078918277, 'min_child_weight': 1}. Best is trial 0 with value: 0.8790982003663663.
[I 2024-03-22 07:26:15,586] Trial 3 finished with value: 0.8314723960976709 and parameters: {'learning_rate': 0.001757739719109201, 'max_depth': 9, 'subsample': 0.5544594256776342, 'min_child_weight': 13}. Best is trial 0 with value: 0.87

Best hyperparameters: {'learning_rate': 0.09696991752037765, 'max_depth': 1, 'subsample': 0.8505546385401402, 'min_child_weight': 9}
Best accuracy: 0.8897663004822429


[I 2024-03-22 07:27:20,951] A new study created in memory with name: no-name-4d875871-949f-4b41-899c-16fac722e134


Tuning XGBoost for target: Dirtiness


[I 2024-03-22 07:27:23,210] Trial 0 finished with value: 0.5881340186406278 and parameters: {'learning_rate': 0.04994482828982846, 'max_depth': 2, 'subsample': 0.4547860218523173, 'min_child_weight': 9}. Best is trial 0 with value: 0.5881340186406278.
[I 2024-03-22 07:27:28,042] Trial 1 finished with value: 0.5 and parameters: {'learning_rate': 0.001392121846971164, 'max_depth': 7, 'subsample': 0.49460216926631934, 'min_child_weight': 12}. Best is trial 0 with value: 0.5881340186406278.
[I 2024-03-22 07:27:33,630] Trial 2 finished with value: 0.5881340186406278 and parameters: {'learning_rate': 0.011092360985342995, 'max_depth': 7, 'subsample': 0.757076545965407, 'min_child_weight': 5}. Best is trial 0 with value: 0.5881340186406278.
[I 2024-03-22 07:27:36,525] Trial 3 finished with value: 0.5869535801920611 and parameters: {'learning_rate': 0.09926510588270429, 'max_depth': 3, 'subsample': 0.6619446389954492, 'min_child_weight': 3}. Best is trial 0 with value: 0.5881340186406278.
[I 2

Best hyperparameters: {'learning_rate': 0.007054717716395659, 'max_depth': 8, 'subsample': 0.6092898391486851, 'min_child_weight': 5}
Best accuracy: 0.5942063596247639


[I 2024-03-22 07:29:49,018] A new study created in memory with name: no-name-13933f13-69c2-4836-9b4e-bc77e319573f


Tuning XGBoost for target: Bumps


[I 2024-03-22 07:29:52,245] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 0.0010711503182031915, 'max_depth': 2, 'subsample': 0.44853657395525687, 'min_child_weight': 12}. Best is trial 0 with value: 0.5.
[I 2024-03-22 07:29:56,068] Trial 1 finished with value: 0.5028533681140334 and parameters: {'learning_rate': 0.0013541162127093619, 'max_depth': 4, 'subsample': 0.3424058150835391, 'min_child_weight': 8}. Best is trial 1 with value: 0.5028533681140334.
[I 2024-03-22 07:30:00,809] Trial 2 finished with value: 0.6445928111444567 and parameters: {'learning_rate': 0.005095046575119053, 'max_depth': 6, 'subsample': 0.266934326704405, 'min_child_weight': 19}. Best is trial 2 with value: 0.6445928111444567.
[I 2024-03-22 07:30:03,156] Trial 3 finished with value: 0.5689707473488861 and parameters: {'learning_rate': 0.003910084848309026, 'max_depth': 2, 'subsample': 0.3256826379227937, 'min_child_weight': 1}. Best is trial 2 with value: 0.6445928111444567.
[I 2024-03-22 

Best hyperparameters: {'learning_rate': 0.015983190981867654, 'max_depth': 5, 'subsample': 0.7488410377541124, 'min_child_weight': 10}
Best accuracy: 0.666258359257055


[I 2024-03-22 07:32:15,226] A new study created in memory with name: no-name-2b550f96-6df5-4ff2-9ac5-9529dac89103


Tuning XGBoost for target: Other_Faults


[I 2024-03-22 07:32:18,069] Trial 0 finished with value: 0.6172119919897832 and parameters: {'learning_rate': 0.05124428588142677, 'max_depth': 3, 'subsample': 0.34592442513901644, 'min_child_weight': 12}. Best is trial 0 with value: 0.6172119919897832.
[I 2024-03-22 07:32:26,074] Trial 1 finished with value: 0.5995075123421839 and parameters: {'learning_rate': 0.07300648736178333, 'max_depth': 10, 'subsample': 0.19668093565286304, 'min_child_weight': 6}. Best is trial 0 with value: 0.6172119919897832.
[I 2024-03-22 07:32:32,828] Trial 2 finished with value: 0.5824688060961578 and parameters: {'learning_rate': 0.0017559953435704163, 'max_depth': 6, 'subsample': 0.3408136605918744, 'min_child_weight': 7}. Best is trial 0 with value: 0.6172119919897832.
[I 2024-03-22 07:32:36,978] Trial 3 finished with value: 0.5845469882766013 and parameters: {'learning_rate': 0.0022144766974570864, 'max_depth': 5, 'subsample': 0.11635760990160135, 'min_child_weight': 14}. Best is trial 0 with value: 0.

Best hyperparameters: {'learning_rate': 0.03677057309198029, 'max_depth': 6, 'subsample': 0.9444080669829852, 'min_child_weight': 9}
Best accuracy: 0.6287396164401865
