In [1]:
# Standard library
import warnings

# Math and DataFrame
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Hyperparameter tuning
import optuna

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import VotingClassifier, StackingClassifier

# Target encoding/decoding
from sklearn.base import BaseEstimator, TransformerMixin

# Metrics
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, auc, roc_curve, log_loss

# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, plot_importance
from catboost import CatBoostClassifier

# Warnings ignore
warnings.filterwarnings("ignore")

# Load Competition and Original Datasets

In [2]:
# Read the three CSV inputs: the competition train/test files and the
# original steel-plates-faults dataset.
csv_paths = {
    'train': '/kaggle/input/playground-series-s4e3/train.csv',
    'test': '/kaggle/input/playground-series-s4e3/test.csv',
    'original': '/kaggle/input/steel-plates-faults-dataset/steel_plates_faults_original_dataset.csv',
}
train, test, original = (
    pd.read_csv(csv_paths[name]) for name in ('train', 'test', 'original')
)
train.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,584,590,909972,909977,16,8,5,2274,113,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,1,808,816,728350,728372,433,20,54,44478,70,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [3]:
# (rows, columns) of the competition training set as loaded from train.csv.
train.shape

(19219, 35)

In [4]:
# Append the original dataset to the competition training data and drop exact
# duplicate rows (this also keeps the cell idempotent when it is re-run).
# ignore_index=True rebuilds a unique 0..n-1 index: both frames come from
# read_csv with a default index, so without it the rows taken from `original`
# would reuse index labels that already exist in `train`.
train = (
    pd.concat([train, original], axis=0, ignore_index=True)
    .drop_duplicates(ignore_index=True)
)
train.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,584,590,909972,909977,16,8,5,2274,113,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,1,808,816,728350,728372,433,20,54,44478,70,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [5]:
# (rows, columns) of the training set after appending the original dataset.
train.shape

(21160, 35)

In [6]:
# Remove the `id` column from both frames so it is not picked up as a feature.
train, test = (frame.drop(columns='id') for frame in (train, test))
train.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,584,590,909972,909977,16,8,5,2274,113,140,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [7]:
# Preview the test frame after `id` has been dropped.
test.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,1015,1033,3826564,3826588,659,23,46,62357,67,127,...,0.0095,0.5652,1.0,1.0,2.841,1.1139,1.6628,0.6727,-0.2261,0.9172
1,1257,1271,419960,419973,370,26,28,39293,92,132,...,0.0047,0.2414,1.0,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,1358,1372,117715,117724,289,36,32,29386,101,134,...,0.0155,0.6,0.75,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,158,168,232415,232440,80,10,11,8586,107,140,...,0.0037,0.8,1.0,1.0,1.9031,0.699,1.0414,0.1818,-0.0738,0.2051
4,559,592,544375,544389,140,19,15,15524,103,134,...,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.417


In [8]:
# Names of the seven binary fault-indicator columns used as targets.
# 'K_Scatch' is spelled exactly as the column appears in the data.
target_variables = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',  'Other_Faults']

In [9]:
# Separate the feature columns from the fault-indicator columns.
X = train.drop(columns=target_variables)
target = train.loc[:, target_variables]
X.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,584,590,909972,909977,16,8,5,2274,113,140,...,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417
1,808,816,728350,728372,433,20,54,44478,70,111,...,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998


In [10]:
# Preview the fault-indicator columns.
target.head()

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1


# Z-Score Standardization

In [11]:
# Standardize every feature. The scaling statistics are learned from the full
# training feature matrix `X` (before any later train/validation split) and
# then reused unchanged for the test set.
scaler = StandardScaler().fit(X)
X_sc, test_sc = scaler.transform(X), scaler.transform(test)
X_sc

array([[-0.2126337 , -0.30209065, -0.48683739, ..., -1.22916955,
         1.03325447, -1.29602889],
       [ 0.2083857 ,  0.14887371, -0.58278763, ...,  1.3109518 ,
        -1.31430165,  1.12935668],
       [-1.23698891, -1.09626683,  0.20105939, ..., -0.22797026,
         0.35162739,  1.2822575 ],
       ...,
       [-1.03775652, -1.13218435, -0.76323834, ..., -1.19398951,
         1.15984235,  0.40480503],
       [-1.05279293, -1.14016602, -0.74436865, ..., -0.33044234,
         0.97888659,  1.25792554],
       [ 1.05982224,  1.07674286, -0.92110808, ..., -0.61556426,
         0.19339253, -0.13079844]])

# Make sure it is a multiclass problem

In [12]:
# A row with more than one fault flag set would make this a multi-label problem
# rather than a multi-class one. Sum the indicator columns per row and show the
# rows whose total exceeds 1; an empty result means every row has at most one
# label.
# The sums must be taken over the target columns themselves: dropping them
# first leaves a frame with no columns whose row sums are always 0, so no row
# could ever be flagged.
labels_per_row = target[target_variables].sum(axis=1)
# Index with a plain boolean array so the selection is positional and does not
# depend on index alignment.
out = target.loc[labels_per_row.gt(1).to_numpy()]
out

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults


# One-Hot-Encoded to Label-Encoded

In [13]:
# Collapse the indicator columns into one integer label per row: the position
# (in `target_variables` order) of the largest value in that row.
# argmax returns the first maximum, so a row with no flag set is mapped to
# class 0 and a row with several flags set to the first of them.
label_encoded_targets = target.to_numpy().argmax(axis=1)
label_encoded_targets

array([3, 6, 2, ..., 6, 6, 6])

In [14]:
# Sorted distinct class labels present in the encoded target.
unique_classes = np.unique(label_encoded_targets)
unique_classes

array([0, 1, 2, 3, 4, 5, 6])

# Compute class weights

### The dataset is imbalanced, so we compute a balancing weight for each class

In [15]:
# One weight per class using scikit-learn's "balanced" heuristic, returned in
# the same order as `unique_classes`.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=unique_classes,
    y=label_encoded_targets,
)
class_weights

array([1.23786124, 2.25586354, 0.79090977, 4.72321429, 5.5978836 ,
       0.58548463, 0.41908459])

In [16]:
# Map each class label to its weight, in the form expected by the
# `class_weight` / `class_weights` model arguments used below.
class_weights_param = dict(zip(unique_classes, class_weights))
class_weights_param

{0: 1.237861237861238,
 1: 2.255863539445629,
 2: 0.7909097705016073,
 3: 4.723214285714286,
 4: 5.597883597883598,
 5: 0.5854846296450015,
 6: 0.4190845893327524}

# Optuna tuned params

In [17]:
# Switch for the Optuna hyperparameter searches: they only run when this is True.
TUNE = False

In [18]:
# Define the objective function for Optuna optimization
def objective(trial, X_train, y_train, X_test, y_test):
    """Fit one LGBMClassifier with trial-suggested hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Features the classifier is fitted on.
        y_train: Integer class labels for ``X_train``.
        X_test: Hold-out features used for scoring.
        y_test: Integer class labels for ``X_test``.

    Returns:
        The value of ``LGBMClassifier.score`` on the hold-out data, which the
        study maximizes.
    """
    # Fixed settings first, then the hyperparameters sampled for this trial
    param = {
        "class_weight": class_weights_param,
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "random_state": 42,
        "num_class": 7,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.03),
        "n_estimators": trial.suggest_int("n_estimators", 400, 600),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.005, 0.025),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.02, 0.06),
        "max_depth": trial.suggest_int("max_depth", 6, 14),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        # LightGBM only subsamples rows when the bagging frequency is non-zero;
        # with the default of 0 the sampled `subsample` value would be ignored
        # and the search would waste a dimension.
        "subsample_freq": 1,
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
    }

    # Create an instance of LGBMClassifier with the suggested parameters
    lgbm_classifier = LGBMClassifier(**param)

    # Fit the classifier on the training data
    lgbm_classifier.fit(X_train, y_train)

    # Evaluate the classifier on the hold-out data
    score = lgbm_classifier.score(X_test, y_test)

    print(f'SCORE: {score}')

    return score

# Split the data into training and hold-out sets. The classes are imbalanced,
# so stratify to keep the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_sc,
    label_encoded_targets,
    test_size=0.2,  # Adjust the test_size as needed
    random_state=42,
    stratify=label_encoded_targets,
)

# Set up the sampler for Optuna optimization
sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# Create a study object for Optuna optimization
study = optuna.create_study(direction="maximize", sampler=sampler)

# The search itself only runs when TUNE is enabled
if TUNE:
    # Run the optimization process
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=150)

    # Get the best parameters after optimization
    best_params = study.best_params

[I 2024-03-05 14:51:01,358] A new study created in memory with name: no-name-a4f06f30-ae77-454b-a522-fcd8c8c88a66


In [19]:
# Define the objective function for Optuna optimization
def objective(trial, X_train, y_train, X_test, y_test):
    """Fit one XGBClassifier with trial-suggested hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Features the classifier is fitted on.
        y_train: Integer class labels for ``X_train``.
        X_test: Hold-out features used for scoring.
        y_test: Integer class labels for ``X_test``.

    Returns:
        The value of ``XGBClassifier.score`` on the hold-out data, which the
        study maximizes. The hold-out log loss is printed for information only.
    """
    # Define parameters to be optimized for the XGBClassifier
    param = {
        # softprob makes the booster emit per-class probabilities, which is
        # what predict_proba / log_loss below need; softmax emits class labels.
        "objective": 'multi:softprob',
        "booster": 'gbtree',
        "random_state": 42,
        "num_class": 7,
        'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'gamma' : trial.suggest_float('gamma', 1e-9, 1.0),
        'subsample': trial.suggest_float('subsample', 0.25, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.25, 1.0),
        # NOTE(review): XGBoost documents max_depth=0 as "no depth limit";
        # confirm that including 0 in the range is intended.
        'max_depth': trial.suggest_int('max_depth', 0, 24),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 30),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
    }

    # Create an instance of XGBClassifier with the suggested parameters
    xgb_classifier = XGBClassifier(**param)

    # Fit the classifier on the training data
    xgb_classifier.fit(X_train, y_train)

    # Evaluate the classifier on the hold-out data
    score = xgb_classifier.score(X_test, y_test)
    y_pred_prob = xgb_classifier.predict_proba(X_test)
    xgb_log_loss = log_loss(y_test, y_pred_prob)
    print(f'SCORE: {score}')
    print(f'Log Loss: {xgb_log_loss}')

    return score

# Split the data into training and hold-out sets. The classes are imbalanced,
# so stratify to keep the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_sc,
    label_encoded_targets,
    test_size=0.2,  # Adjust the test_size as needed
    random_state=42,
    stratify=label_encoded_targets,
)

# Set up the sampler for Optuna optimization
sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# Create a study object for Optuna optimization
study = optuna.create_study(direction="maximize", sampler=sampler)

# The search itself only runs when TUNE is enabled
if TUNE:

    # Run the optimization process
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=150)

    # Get the best parameters after optimization
    best_params = study.best_params

[I 2024-03-05 14:51:01,424] A new study created in memory with name: no-name-7c9ce639-90f6-4865-85ed-8261a3d1e820


In [20]:
# Keyword arguments for LGBMClassifier: fixed settings followed by tuned values.
lgbm_params: dict = {
    # --- fixed settings ---
    "class_weight": class_weights_param,  # per-class weights computed above
    "objective": "multiclass",
    "metric": "multi_logloss",
    "verbosity": -1,                      # silent
    "boosting_type": "gbdt",
    "random_state": 42,
    "num_class": 7,
    # --- tuned values ---
    "n_estimators": 752,
    "learning_rate": 0.005613916463106189,
    # A tree of depth 6 has at most 2**6 = 64 leaves, so max_depth rather than
    # num_leaves is the binding limit on tree size here.
    "max_depth": 6,
    "num_leaves": 252,
    # NOTE(review): no subsample_freq is set. LightGBM documents that row
    # bagging is only enabled when the bagging frequency is non-zero, so this
    # value may currently have no effect -- confirm before relying on it.
    "subsample": 0.6045538705062335,
    "colsample_bytree": 0.866501494211133,
    "colsample_bynode": 0.5443098861233086,
    "reg_alpha": 0.0024787490924597882,
    "reg_lambda": 3.5334079815178954,
    "min_split_gain": 0.05539710161546875,
}

In [21]:
# Keyword arguments for XGBClassifier.
# No `class_weight` entry: XGBClassifier has no such parameter, so passing one
# does not weight the classes (the notice about the unused parameter is hidden
# by the global warnings filter). To balance classes for XGBoost, pass per-row
# `sample_weight` to `fit` instead.
xgb_params: dict = {
    # softprob makes the booster output per-class probabilities, which the
    # soft-voting / predict_proba usage of this model relies on; softmax
    # outputs class labels.
    'objective': 'multi:softprob',
    'n_estimators': 829,
    'learning_rate': 0.010260565670497695,
    'gamma': 0.16282691057583543,
    'reg_alpha': 0.010492176264956674,
    'reg_lambda': 0.437536781187624,
    'max_depth': 5,
    'min_child_weight': 2,
    'subsample': 0.6971737476610285,
    'colsample_bytree': 0.5115061295805807,
    'random_state': 345,
}

In [22]:
# Keyword arguments for CatBoostClassifier.
cat_params: dict = {
    # run control
    'iterations': 1000,
    'random_seed': 42,
    'verbose': False,
    # class balancing (per-class weights computed above)
    'class_weights': class_weights_param,
    # model hyperparameters
    'learning_rate': 0.13762007048684638,
    'depth': 5,
    'l2_leaf_reg': 5.285199432056192,
    'bagging_temperature': 0.6029582154263095,
}

In [23]:
# (name, model) pairs for the three gradient-boosting models.
# NOTE(review): the later cells visible here build their own
# `voting_estimators` / `stacking_estimators` lists and do not appear to
# reference this one -- confirm whether it is still needed.
estimators = [
    (label, model_cls(**model_kwargs))
    for label, model_cls, model_kwargs in (
        ('XGB', XGBClassifier, xgb_params),
        ('LGBM', LGBMClassifier, lgbm_params),
        ('CAT', CatBoostClassifier, cat_params),
    )
]

# Optuna Voting Weights Tuning

In [24]:
# Switch for the Optuna search over the voting weights: it only runs when this is True.
WEIGHT_TUNE = False

In [25]:
# Define the objective function for Optuna optimization
def objective(trial, X_train, y_train, X_test, y_test):
    # Define parameters to be optimized for the weighted ensemble
    
    obj_estimators = [
        ('XGB', XGBClassifier(**xgb_params)),
        ('LGBM', LGBMClassifier(**lgbm_params)),
        ('CAT', CatBoostClassifier(**cat_params))
    ]
    
    voting_classifier = VotingClassifier(
        estimators=obj_estimators,
        voting='soft',
        weights=[
            trial.suggest_float('XGB_Weight', 1.00, 9.00),
            trial.suggest_float('LGBM_Weight', 0.25, 5.00),
            trial.suggest_float('CAT_Weight', 0.25, 2.00),
        ],
    )
    
    voting_classifier.fit(X_train, y_train)
    
    predict_probs = voting_classifier.predict_proba(X_test)
    
    auc_score = roc_auc_score(y_test, predict_probs, multi_class='ovr')

    return auc_score

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sc, label_encoded_targets, test_size=0.2, random_state=42)  # Adjust the test_size as needed

# Set up the sampler for Optuna optimization
weight_sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# Create a study object for Optuna optimization
weight_study = optuna.create_study(direction="maximize", sampler=weight_sampler)

if WEIGHT_TUNE:

    # Run the optimization process
    weight_study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=300)

    # Get the best parameters after optimization
    weight_best_params = weight_study.best_params

[I 2024-03-05 14:51:01,769] A new study created in memory with name: no-name-45d49953-3c84-40b4-9bc3-27e59e998e8d


In [26]:
# Recorded value of `weight_best_params` (presumably from an earlier run with
# WEIGHT_TUNE = True); the same numbers are hard-coded as the VotingClassifier
# weights in the notebook.
# weight_best_params
# {'XGB_Weight': 7.994761950304625, 'LGBM_Weight': 0.46883567511191715,'CAT_Weight': 0.4412142916220983}

# Voting Classifier

In [27]:
# Fresh, unfitted (name, model) pairs for the voting ensemble, in the order
# XGB, LGBM, CAT.
xgb_model = XGBClassifier(**xgb_params)
lgbm_model = LGBMClassifier(**lgbm_params)
cat_model = CatBoostClassifier(**cat_params)
voting_estimators = [('XGB', xgb_model), ('LGBM', lgbm_model), ('CAT', cat_model)]

In [28]:
# Soft-voting ensemble: the models' class probabilities are combined with fixed
# weights, one per entry of `voting_estimators` and in the same order.
ensemble_weights = [
    7.994761950304625,
    0.46883567511191715,
    0.4412142916220983,
]
voting_classifier = VotingClassifier(
    estimators=voting_estimators,
    voting='soft',
    weights=ensemble_weights,
)

# Train on the full scaled training set, then predict class probabilities for
# the scaled test set.
voting_classifier.fit(X_sc, label_encoded_targets)

voting_predicted_probs = voting_classifier.predict_proba(test_sc)

# Stacking Classifier

In [29]:
# Fresh, unfitted (name, model) pairs for the stacking ensemble.
stacking_estimators = list(
    zip(
        ('XGB', 'LGBM', 'CAT'),
        (
            XGBClassifier(**xgb_params),
            LGBMClassifier(**lgbm_params),
            CatBoostClassifier(**cat_params),
        ),
    )
)

In [30]:
# The first entry of `stacking_estimators` (XGB) becomes the final estimator;
# the remaining entries (LGBM, CAT) are the base learners, combined with
# 10-fold cross-validation.
(meta_name, meta_learner), *base_learners = stacking_estimators
stacking_classifier = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=10,
)
stacking_classifier.fit(X_sc, label_encoded_targets)

In [31]:
# Class probabilities from the stacking ensemble for the scaled test set.
stacking_predicted_probs = stacking_classifier.predict_proba(test_sc)
stacking_predicted_probs

array([[6.02386653e-01, 2.59243394e-03, 3.46971513e-03, ...,
        3.86944599e-02, 8.81510749e-02, 2.62917638e-01],
       [2.94828236e-01, 5.78220654e-03, 5.77266607e-03, ...,
        1.78428173e-01, 1.59372509e-01, 3.53271842e-01],
       [1.09716624e-01, 4.19683307e-02, 3.75740454e-02, ...,
        6.72611455e-03, 3.05618227e-01, 4.96372551e-01],
       ...,
       [1.01431990e-02, 8.58855667e-04, 9.37421083e-01, ...,
        8.33032653e-04, 1.25902635e-03, 4.87043113e-02],
       [4.08887088e-01, 5.81973465e-03, 2.15831529e-02, ...,
        8.10695812e-02, 1.36555701e-01, 3.43908876e-01],
       [1.59967393e-02, 1.47087756e-03, 9.05393243e-01, ...,
        1.19159499e-03, 1.89477636e-03, 7.31161609e-02]], dtype=float32)

In [32]:
# Blend the two ensembles: stack their probability matrices along a new leading
# axis and take the unweighted mean across it.
all_predicted_probs = np.stack([voting_predicted_probs, stacking_predicted_probs])
predicted_probs = all_predicted_probs.mean(axis=0)

In [33]:
# Sanity check: expect (number of test rows, number of classes).
predicted_probs.shape

(12814, 7)

# Submission

In [34]:
# Load the sample submission to reuse its `id` column and column layout.
sample_submission_path = '/kaggle/input/playground-series-s4e3/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)
submission_df

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,19220,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,19221,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,19222,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,19223,0.5,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...,...
12809,32028,0.5,0.5,0.5,0.5,0.5,0.5,0.5
12810,32029,0.5,0.5,0.5,0.5,0.5,0.5,0.5
12811,32030,0.5,0.5,0.5,0.5,0.5,0.5,0.5
12812,32031,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [35]:
# Overwrite the placeholder values with the blended probabilities, one column
# per entry of `target_variables`.
# NOTE(review): this assumes the rows of `predicted_probs` are in the same order
# as the sample submission's ids; the test ids were dropped earlier, so the
# alignment is not checked here.
submission_df[target_variables] = predicted_probs
submission_df

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.569575,0.003017,0.004707,0.001692,0.031912,0.117304,0.271793
1,19220,0.284438,0.015541,0.009194,0.002238,0.187193,0.157372,0.344025
2,19221,0.105566,0.046012,0.039660,0.002285,0.009256,0.297947,0.499274
3,19222,0.174647,0.004060,0.002107,0.002196,0.011798,0.396861,0.408332
4,19223,0.054523,0.003913,0.001907,0.002396,0.007798,0.607320,0.322142
...,...,...,...,...,...,...,...,...
12809,32028,0.166052,0.106228,0.005092,0.002027,0.040326,0.232685,0.447590
12810,32029,0.185663,0.005770,0.020647,0.010428,0.182098,0.176465,0.418929
12811,32030,0.010990,0.000941,0.935503,0.000730,0.000932,0.001372,0.049533
12812,32031,0.425730,0.010685,0.022364,0.001950,0.083465,0.133067,0.322739


In [36]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)