In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

In [None]:
# Names of the seven binary fault-label columns
target_cols = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']

# Original faults table, loaded as-is
train_original = pd.read_csv("/kaggle/input/faults/faults.csv")

# Competition training table, without its 'id' column
train = pd.read_csv("/kaggle/input/playground-series-s4e3/train.csv")
train = train.drop(columns=['id'])

# Keep only the rows that carry exactly one fault label
labels_per_row = train[target_cols].sum(axis=1)
train = train[labels_per_row == 1]

In [None]:
# Stack the original faults table on top of the filtered competition rows.
# Despite its name, `test` is the combined frame used for modelling below.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of the two source frames are carried over and repeat each other.
test = pd.concat([train_original, train], axis=0, ignore_index=True)

In [None]:
def feature_generator(data):
    """Add engineered geometric, luminosity and interaction features to ``data``.

    The new columns are assigned onto ``data`` itself (the frame is modified in
    place) and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns read below: ``X_Minimum``, ``X_Maximum``,
        ``Y_Minimum``, ``Y_Maximum``, ``Pixels_Areas``, ``X_Perimeter``,
        ``Y_Perimeter``, ``Minimum_of_Luminosity``, ``Maximum_of_Luminosity``,
        ``Sum_of_Luminosity``, ``Edges_Index``, ``Square_Index``,
        ``Orientation_Index``, ``Luminosity_Index``, ``Steel_Plate_Thickness``
        and ``LogOfAreas``.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the feature columns added.
    """
    epsilon = 1e-6  # A small constant to avoid division by zero or taking the logarithm of zero

    # Location Features
    data['X_Distance'] = data['X_Maximum'] - data['X_Minimum']
    data['Y_Distance'] = data['Y_Maximum'] - data['Y_Minimum']

    perimeter_sum = data['X_Perimeter'] + data['Y_Perimeter']

    # Density Feature (epsilon keeps a zero total perimeter from producing inf/NaN)
    data['Density'] = data['Pixels_Areas'] / (perimeter_sum + epsilon)

    # Relative Perimeter Feature
    data['Relative_Perimeter'] = data['X_Perimeter'] / (perimeter_sum + epsilon)

    # Circularity Feature (epsilon keeps a zero X perimeter from producing inf/NaN)
    data['Circularity'] = data['Pixels_Areas'] / (data['X_Perimeter'] ** 2 + epsilon)

    # Symmetry Index Feature
    data['Symmetry_Index'] = np.abs(data['X_Distance'] - data['Y_Distance']) / (data['X_Distance'] + data['Y_Distance'] + epsilon)

    # Color Contrast Feature
    data['Color_Contrast'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']

    # Combined Geometric Index Feature
    data['Combined_Geometric_Index'] = data['Edges_Index'] * data['Square_Index']

    # Interaction Term Feature
    data['X_Distance*Pixels_Areas'] = data['X_Distance'] * data['Pixels_Areas']

    # Additional Features
    data['sin_orientation'] = np.sin(data['Orientation_Index'])
    data['Edges_Index2'] = np.exp(data['Edges_Index'] + epsilon)
    data['X_Maximum2'] = np.sin(data['X_Maximum'])
    data['Y_Minimum2'] = np.sin(data['Y_Minimum'])
    # Ratios are defined as 0 where the denominator is exactly zero
    data['Aspect_Ratio_Pixels'] = np.where(data['Y_Perimeter'] == 0, 0, data['X_Perimeter'] / data['Y_Perimeter'])
    data['Aspect_Ratio'] = np.where(data['Y_Distance'] == 0, 0, data['X_Distance'] / data['Y_Distance'])

    # Average Luminosity Feature
    # NOTE(review): this averages Sum_of_Luminosity with the minimum; given the
    # feature name, Maximum_of_Luminosity may have been intended - confirm.
    data['Average_Luminosity'] = (data['Sum_of_Luminosity'] + data['Minimum_of_Luminosity']) / 2

    # Normalized Steel Thickness Feature (min-max scaling over this frame)
    thickness = data['Steel_Plate_Thickness']
    thickness_min = thickness.min()
    thickness_range = thickness.max() - thickness_min
    if thickness_range == 0:
        # All rows share one thickness: min-max scaling would be 0/0, so use 0
        data['Normalized_Steel_Thickness'] = 0.0
    else:
        data['Normalized_Steel_Thickness'] = (thickness - thickness_min) / thickness_range

    # Logarithmic Features
    data['Log_Perimeter'] = np.log(perimeter_sum + epsilon)
    data['Log_Luminosity'] = np.log(data['Sum_of_Luminosity'] + epsilon)
    data['Log_Aspect_Ratio'] = np.log(data['Aspect_Ratio'] ** 2 + epsilon)

    # Statistical Features
    data['Combined_Index'] = data['Orientation_Index'] * data['Luminosity_Index']
    data['Sigmoid_Areas'] = 1 / (1 + np.exp(-data['LogOfAreas'] + epsilon))

    return data

# Add the engineered feature columns to the combined frame; feature_generator
# assigns them onto the frame it receives and returns that same frame.
test = feature_generator(test)

In [None]:
# Columns treated as discrete; these are left unscaled
discrete_features = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# Every column that is neither discrete nor a target is treated as continuous
non_continuous = discrete_features + target_cols
continuous_features = [name for name in test.columns if name not in non_continuous]

# Fit a RobustScaler on the continuous columns of `test` and overwrite them
# with the scaled values.
# NOTE(review): the fit happens before the train/test split made later in the
# notebook, so hold-out rows contribute to the scaling statistics.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(test[continuous_features])
test[continuous_features] = scaled_values

In [None]:
# Display the list of target column names
target_cols

In [None]:
# Feature matrix: every column except the fault-label columns
X = test.drop(columns=target_cols)

# One label series per fault type, in the order y1..y7
y1, y2, y3, y4, y5, y6, y7 = (
    test[label]
    for label in ('Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults')
)

In [None]:
# Shared split settings: 30% hold-out and a fixed seed. The seed must be passed
# as the keyword `random_state=24`; a bare `random_state24` after a keyword
# argument is a syntax error.
split_options = {"test_size": 0.3, "random_state": 24}

# One train/hold-out split per target, all over the same feature matrix X
X_train_Pastry, X_test_pastry, y_train_Pastry, y_test_pastry = train_test_split(X, y1, **split_options)
X_train_Z_Scratch, X_test_Z_Scratch, y_train_Z_Scratch, y_test_Z_Scratch = train_test_split(X, y2, **split_options)
X_train_K_Scatch, X_test_K_Scatch, y_train_K_Scatch, y_test_K_Scatch = train_test_split(X, y3, **split_options)
X_train_Stains, X_test_Stains, y_train_Stains, y_test_Stains = train_test_split(X, y4, **split_options)
X_train_Dirtiness, X_test_Dirtiness, y_train_Dirtiness, y_test_Dirtiness = train_test_split(X, y5, **split_options)
X_train_Bumps, X_test_Bumps, y_train_Bumps, y_test_Bumps = train_test_split(X, y6, **split_options)
X_train_Other_Faults, X_test_Other_Faults, y_train_Other_Faults, y_test_Other_Faults = train_test_split(X, y7, **split_options)

In [None]:
# Map each target name to its (X_train, X_test, y_train, y_test) split
target_splits = dict([
    ('Pastry', (X_train_Pastry, X_test_pastry, y_train_Pastry, y_test_pastry)),
    ('Z_Scratch', (X_train_Z_Scratch, X_test_Z_Scratch, y_train_Z_Scratch, y_test_Z_Scratch)),
    ('K_Scatch', (X_train_K_Scatch, X_test_K_Scatch, y_train_K_Scatch, y_test_K_Scatch)),
    ('Stains', (X_train_Stains, X_test_Stains, y_train_Stains, y_test_Stains)),
    ('Dirtiness', (X_train_Dirtiness, X_test_Dirtiness, y_train_Dirtiness, y_test_Dirtiness)),
    ('Bumps', (X_train_Bumps, X_test_Bumps, y_train_Bumps, y_test_Bumps)),
    ('Other_Faults', (X_train_Other_Faults, X_test_Other_Faults, y_train_Other_Faults, y_test_Other_Faults)),
])

# Optuna for XGBoost

In [None]:
# Define the objective function for XGBoost
def objective_xgb(trial, X_train, X_test, y_train, y_test):
    """Fit one XGBoost candidate and return its ROC AUC on the hold-out split.

    Parameters
    ----------
    trial
        Object providing ``suggest_float`` / ``suggest_int`` (an Optuna trial
        in this notebook); used to sample the tuned hyperparameters.
    X_train, y_train
        Features and binary labels the model is fitted on.
    X_test, y_test
        Features and binary labels the model is scored on.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on ``X_test``.
    """
    params = {
        "objective": "binary:logistic",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "eval_metric": 'auc',
    }

    model_xgb = XGBClassifier(**params)
    model_xgb.fit(X_train, y_train, verbose=False)

    # ROC AUC is a ranking metric, so score it on positive-class probabilities;
    # hard 0/1 labels from predict() reduce the curve to a single threshold.
    positive_proba_xgb = model_xgb.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_proba_xgb)

# Settings that objective_xgb keeps constant while tuning. study.best_params
# only holds the values sampled through `trial`, so these are supplied again
# when the final model is built; otherwise it would fall back to defaults.
fixed_params_xgb = {
    "objective": "binary:logistic",
    "n_estimators": 1000,
    "verbosity": 0,
    "eval_metric": 'auc',
}

# Dictionary to store best models for each target
best_models = {}

# Loop over each target
for target, (X_train, X_test, y_train, y_test) in target_splits.items():
    print(f"Tuning XGBoost for target: {target}")
    study_xgb = optuna.create_study(direction='maximize')
    study_xgb.optimize(lambda trial: objective_xgb(trial, X_train, X_test, y_train, y_test), n_trials=30)
    print('Best hyperparameters:', study_xgb.best_params)
    print('Best accuracy:', study_xgb.best_value)

    # Best sampled hyperparameters found by the study
    best_params_xgb = study_xgb.best_params

    # Rebuild the classifier from the fixed settings plus the tuned values.
    # XGBClassifier is the name imported at the top of the notebook (there is
    # no `xgb` module alias), and verbosity=0 replaces the old `silent` flag.
    best_xgb_model = XGBClassifier(**fixed_params_xgb, **best_params_xgb)

    # Fit the model on the training data
    best_xgb_model.fit(X_train, y_train)

    # Save the model to a file
    model_filename = f"best_model_{target}.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(best_xgb_model, f)

    # Store the model in the dictionary
    best_models[target] = best_xgb_model

## Optuna for LightGBM

In [None]:
def objective_lgbm(trial, X_train, X_test, y_train, y_test):
    """Fit one LightGBM candidate and return its ROC AUC on the hold-out split.

    The value is meant to be maximized (the studies in this notebook are
    created with ``direction='maximize'``).

    Parameters
    ----------
    trial
        Object providing ``suggest_float`` / ``suggest_int`` (an Optuna trial
        in this notebook); used to sample the tuned hyperparameters.
    X_train, y_train
        Features and binary labels the model is fitted on.
    X_test, y_test
        Features and binary labels the model is scored on.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on ``X_test``.
    """
    # No "num_class" entry: it only applies to multiclass objectives and
    # conflicts with objective="binary".
    lgbm_param = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    model_lgbm = LGBMClassifier(**lgbm_param)
    # Logging is already silenced through "verbosity" above; fit() is called
    # without a `verbose` argument because recent LightGBM releases reject it.
    model_lgbm.fit(X_train, y_train)

    # ROC AUC is a ranking metric, so score it on positive-class probabilities
    # rather than on the hard 0/1 labels returned by predict().
    positive_proba_lgbm = model_lgbm.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_proba_lgbm)


# Dictionary to store best models for each target
best_models_lgbm = {}

# Loop over each target
for target, (X_train, X_test, y_train, y_test) in target_splits.items():
    print(f"Tuning LGBM for target: {target}")
    study_lgbm = optuna.create_study(direction='maximize')
    study_lgbm.optimize(lambda trial: objective_lgbm(trial, X_train, X_test, y_train, y_test), n_trials=30)
    print('Best hyperparameters:', study_lgbm.best_params)
    print('Best accuracy:', study_lgbm.best_value)

    # Best sampled hyperparameters found by the study
    best_params_lgbm = study_lgbm.best_params

    # Instantiate LGBMClassifier with the best hyperparameters. Logging is
    # silenced with verbosity=-1; the older `silent` argument is no longer a
    # constructor parameter in current LightGBM releases.
    best_lgbm_model = LGBMClassifier(**best_params_lgbm, verbosity=-1)

    # Fit the model on the training data
    best_lgbm_model.fit(X_train, y_train)

    # Save the model to a file
    model_filename = f"best_lightgbm_model_{target}.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(best_lgbm_model, f)

    # Store the model in the dictionary
    best_models_lgbm[target] = best_lgbm_model


## Optuna for CatBoost

In [None]:
# Define the objective function for CatBoost
def objective_cat(trial, X_train, X_test, y_train, y_test):
    """Fit one CatBoost candidate and return its ROC AUC on the hold-out split.

    Parameters
    ----------
    trial
        Object providing ``suggest_float`` / ``suggest_int`` (an Optuna trial
        in this notebook); used to sample the tuned hyperparameters.
    X_train, y_train
        Features and binary labels the model is fitted on.
    X_test, y_test
        Features and binary labels the model is scored on.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on ``X_test``.
    """
    # Define the search space for CatBoost hyperparameters.
    # learning_rate uses suggest_float(..., log=True), the replacement for the
    # deprecated suggest_loguniform.
    # NOTE(review): CatBoost documents min_data_in_leaf for the Lossguide and
    # Depthwise grow policies - confirm it has an effect with the default policy.
    cat_params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 1, 10),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.05, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100)
    }
    model_cat = CatBoostClassifier(**cat_params)
    model_cat.fit(X_train, y_train, verbose=False)

    # ROC AUC is a ranking metric, so score it on positive-class probabilities
    # rather than on the hard labels returned by predict().
    positive_proba_cat = model_cat.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_proba_cat)

# Fitted CatBoost model per target, keyed by target name
best_models_cat = {}

# Tune, refit and persist one model for every target
for target, split in target_splits.items():
    X_train, X_test, y_train, y_test = split
    print(f"Tuning catBoost for target: {target}")

    # Run a 30-trial study that maximizes the objective's return value
    study_cat = optuna.create_study(direction='maximize')
    study_cat.optimize(
        lambda trial: objective_cat(trial, X_train, X_test, y_train, y_test),
        n_trials=30,
    )

    # Report the best hyperparameters and the score they reached
    best_params_cat = study_cat.best_params
    print('Best hyperparameters:', best_params_cat)
    print('Best accuracy:', study_cat.best_value)

    # Build a CatBoostClassifier from the best hyperparameters and fit it on
    # the training part of the split
    best_cat_model = CatBoostClassifier(silent=True, **best_params_cat)
    best_cat_model.fit(X_train, y_train)

    # Persist the fitted model to disk
    model_filename = f"best_cat_model_{target}.pkl"
    with open(model_filename, 'wb') as handle:
        pickle.dump(best_cat_model, handle)

    # Keep the fitted model available under its target name
    best_models_cat[target] = best_cat_model