In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('train_steel.csv')
test_df = pd.read_csv('test_steel.csv')
sample_df = pd.read_csv('sample_submission_steel.csv')


In [3]:
print('The dimensions of the train dataset is: ', train_df.shape)
print('The dimensions of the test dataset is: ', test_df.shape)

The dimensions of the train dataset is:  (19219, 35)
The dimensions of the test dataset is:  (12814, 28)


In [4]:
train_df.columns

Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [5]:
train_df.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,584,590,909972,909977,16,8,5,2274,113,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,1,808,816,728350,728372,433,20,54,44478,70,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [6]:
test_df.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,...,0.0095,0.5652,1.0,1.0,2.841,1.1139,1.6628,0.6727,-0.2261,0.9172
1,19220,1257,1271,419960,419973,370,26,28,39293,92,...,0.0047,0.2414,1.0,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,19221,1358,1372,117715,117724,289,36,32,29386,101,...,0.0155,0.6,0.75,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,19222,158,168,232415,232440,80,10,11,8586,107,...,0.0037,0.8,1.0,1.0,1.9031,0.699,1.0414,0.1818,-0.0738,0.2051
4,19223,559,592,544375,544389,140,19,15,15524,103,...,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.417


In [7]:
sample_df.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,19220,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,19221,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,19222,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,19223,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [8]:
train_df.isnull().sum().sum()

0

In [9]:
test_df.isnull().sum().sum()

0

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19219 entries, 0 to 19218
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     19219 non-null  int64  
 1   X_Minimum              19219 non-null  int64  
 2   X_Maximum              19219 non-null  int64  
 3   Y_Minimum              19219 non-null  int64  
 4   Y_Maximum              19219 non-null  int64  
 5   Pixels_Areas           19219 non-null  int64  
 6   X_Perimeter            19219 non-null  int64  
 7   Y_Perimeter            19219 non-null  int64  
 8   Sum_of_Luminosity      19219 non-null  int64  
 9   Minimum_of_Luminosity  19219 non-null  int64  
 10  Maximum_of_Luminosity  19219 non-null  int64  
 11  Length_of_Conveyer     19219 non-null  int64  
 12  TypeOfSteel_A300       19219 non-null  int64  
 13  TypeOfSteel_A400       19219 non-null  int64  
 14  Steel_Plate_Thickness  19219 non-null  int64  
 15  Ed

In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12814 entries, 0 to 12813
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     12814 non-null  int64  
 1   X_Minimum              12814 non-null  int64  
 2   X_Maximum              12814 non-null  int64  
 3   Y_Minimum              12814 non-null  int64  
 4   Y_Maximum              12814 non-null  int64  
 5   Pixels_Areas           12814 non-null  int64  
 6   X_Perimeter            12814 non-null  int64  
 7   Y_Perimeter            12814 non-null  int64  
 8   Sum_of_Luminosity      12814 non-null  int64  
 9   Minimum_of_Luminosity  12814 non-null  int64  
 10  Maximum_of_Luminosity  12814 non-null  int64  
 11  Length_of_Conveyer     12814 non-null  int64  
 12  TypeOfSteel_A300       12814 non-null  int64  
 13  TypeOfSteel_A400       12814 non-null  int64  
 14  Steel_Plate_Thickness  12814 non-null  int64  
 15  Ed

In [12]:
train_df.duplicated().sum()

0

In [13]:
test_df.duplicated().sum()

0

In [14]:
train_corr = train_df.drop(['id'], axis=1).corr()
train_corr

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
X_Minimum,1.0,0.989767,0.016071,0.014247,-0.464759,-0.451961,-0.426514,-0.430073,0.337748,-0.132955,...,0.276871,-0.068844,-0.376973,0.119306,-0.107581,-0.510597,0.046159,0.044439,0.205123,0.169608
X_Maximum,0.989767,1.0,0.018909,0.016513,-0.391937,-0.378434,-0.356528,-0.363092,0.275571,-0.129669,...,0.237157,-0.079235,-0.319598,0.108808,-0.126807,-0.441177,0.036824,0.037949,0.18693,0.153448
Y_Minimum,0.016071,0.018909,1.0,0.969552,-0.00737,-0.012712,-0.024876,-0.008189,-0.017321,-0.02916,...,-0.044202,-0.023,-0.031511,-0.015512,-0.012143,0.010612,-0.039151,-0.005449,0.056442,-0.019227
Y_Maximum,0.014247,0.016513,0.969552,1.0,-0.007019,-0.012377,-0.024798,-0.007809,-0.018739,-0.028312,...,-0.04249,-0.022869,-0.030345,-0.016287,-0.012899,0.011304,-0.040874,-0.007416,0.05661,-0.018533
Pixels_Areas,-0.464759,-0.391937,-0.00737,-0.007019,1.0,0.835079,0.834543,0.797843,-0.622867,0.130499,...,-0.269013,-0.015081,0.542672,-0.101295,-0.081567,0.701014,-0.07747,-0.053548,-0.224037,-0.195931
X_Perimeter,-0.451961,-0.378434,-0.012712,-0.012377,0.835079,1.0,0.912579,0.802072,-0.603891,0.158036,...,-0.234969,0.005212,0.561256,-0.104322,-0.070433,0.675056,-0.084615,-0.050186,-0.218289,-0.186234
Y_Perimeter,-0.426514,-0.356528,-0.024876,-0.024798,0.834543,0.912579,1.0,0.809171,-0.602826,0.160831,...,-0.113968,-0.00565,0.608167,-0.064965,-0.062092,0.63027,-0.100204,-0.019334,-0.227256,-0.174581
Sum_of_Luminosity,-0.430073,-0.363092,-0.008189,-0.007809,0.797843,0.802072,0.809171,1.0,-0.580857,0.128569,...,-0.234625,-0.013845,0.520282,-0.09316,-0.059598,0.640597,-0.074245,-0.046712,-0.207316,-0.184651
Minimum_of_Luminosity,0.337748,0.275571,-0.017321,-0.018739,-0.622867,-0.603891,-0.602826,-0.580857,1.0,0.397265,...,0.179385,0.619174,-0.504692,-0.004622,0.061146,-0.586957,0.166976,0.077386,0.134339,0.208857
Maximum_of_Luminosity,-0.132955,-0.129669,-0.02916,-0.028312,0.130499,0.158036,0.160831,0.128569,0.397265,1.0,...,-0.113531,0.853856,0.018501,-0.08609,-0.076961,0.121755,0.115789,0.020471,-0.069808,-0.008368


In [15]:
# plt.figure(figsize=(12, 12))
# cols = train_corr.index
# cm = np.corrcoef(train_df[cols])
# sns.set(font_scale=0.8)
# corr_heat_map = sns.heatmap(cm, cbar=False, annot=True, fmt='.2f', yticklabels=cols.values, xticklabels=cols.values, cmap='Blues')
# corr_heat_map.xaxis.tick_top()
# plt.xticks(rotation=45, ha='left')
# plt.show()

In [16]:
# Feature Engineering
# train_df['X_range'] = (train_df['X_Maximum'] - train_df['X_Minimum'])
# train_df['Y_range'] = (train_df['Y_Maximum'] - train_df['Y_Minimum'])

# train_df['Luminosity_range'] = (train_df['Maximum_of_Luminosity'] - train_df['Minimum_of_Luminosity'])

# train_df['Areas'] = np.exp(train_df['LogOfAreas'])
# train_df['Y_Index'] = np.exp(train_df['Log_Y_Index'])

# train_df.head().T

In [17]:
# Feature Engineering Test
# test_df['X_range'] = (test_df['X_Maximum'] - test_df['X_Minimum'])
# test_df['Y_range'] = (test_df['Y_Maximum'] - test_df['Y_Minimum'])

# test_df['Luminosity_range'] = (test_df['Maximum_of_Luminosity'] - test_df['Minimum_of_Luminosity'])

# test_df['Areas'] = np.exp(test_df['LogOfAreas'])
# test_df['X_Index'] = np.exp(test_df['Log_X_Index'])
# test_df['Y_Index'] = np.exp(test_df['Log_Y_Index'])

# test_df.head().T

In [22]:
# preprocess the data 
X = train_df.drop(['id', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults'], axis=1)
y = train_df[['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']]

# normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [24]:
# call the model and train it
model = MultiOutputClassifier(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))

# define the parameters
param_grid = {
    'estimator__max_depth': [3, 5, 6, 7, 8],
    'estimator__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'estimator__n_estimators': [100, 300, 500],  # Add a comma here
    'estimator__min_child_weight': [1, 3, 5],
    'estimator__gamma': [0, 0.01, 0.1, 1],
    'estimator__subsample': [0.5, 0.6, 0.7],
    'estimator__colsample_bytree': [0.5, 0.7, 0.9],
}

# randomize the search with cross validation
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, scoring='roc_auc', 
                                   n_jobs=-1, cv=3, verbose=3, random_state=42)
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [26]:
# print the best parameters
print('The best parameters are: ', random_search.best_params_)

# use the esttimator
best_model = random_search.best_estimator_

# predict the model
y_pred = best_model.predict_proba(X_test)

# prediction on test
test_scaled = scaler.transform(test_df.drop(['id'], axis=1))
predictions = best_model.predict_proba(test_scaled)

The best parameters are:  {'estimator__subsample': 0.6, 'estimator__n_estimators': 100, 'estimator__min_child_weight': 3, 'estimator__max_depth': 6, 'estimator__learning_rate': 0.2, 'estimator__gamma': 0.1, 'estimator__colsample_bytree': 0.9}


In [34]:
# create a submission file
probs = pd.DataFrame({class_: predictions[i][:, 1] for i, class_ in enumerate(y.columns)})
submission = test_df[['id']].join(probs)
submission.to_csv('submission.csv', index=False)

In [35]:
submission.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.4746,0.000301,0.004248,3e-05,0.020771,0.277481,0.311075
1,19220,0.344494,0.005195,0.000803,0.000201,0.036533,0.130817,0.225506
2,19221,0.000887,0.031799,0.042014,0.000396,0.001712,0.281968,0.493209
3,19222,0.035123,4.7e-05,8.3e-05,0.00032,0.003895,0.229664,0.415432
4,19223,0.000313,0.000735,0.00156,0.000392,0.005925,0.768606,0.183992
