In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [None]:
# Read the competition CSVs; the train and test frames use the 'id' column as their index,
# while the sample submission is read with the default index.
train_orig = pd.read_csv('./input/train.csv', index_col='id')
test_orig = pd.read_csv('./input/test.csv', index_col='id')
sample_submission = pd.read_csv('./input/sample_submission.csv')

# Preview the first rows of each frame under its own heading.
for heading, frame in (("Original train ds:", train_orig), ("Original test ds:", test_orig)):
    print(heading)
    display(frame.head())

In [16]:
# Sanity checks on the training frame: shape, presence of any NaN, presence of any duplicated row.
train_has_missing = train_orig.isna().any().any()
print(f'Original train ds shape: {train_orig.shape}, contains missing values: {train_has_missing}')
train_has_duplicates = train_orig.duplicated().any()
print(f'Original train ds contains duplicates: {train_has_duplicates}')

# Per-column NaN counts; report only the columns that actually contain missing values.
missing_values_per_feature = train_orig.isna().sum()
features_with_missing = missing_values_per_feature[missing_values_per_feature > 0]
if not features_with_missing.empty:
    print('Features with missing values:')
    print(features_with_missing)

# The same shape / NaN / duplicate checks for the test frame.
test_has_missing = test_orig.isna().any().any()
print(f'Original test ds shape: {test_orig.shape}, contains missing values: {test_has_missing}')
test_has_duplicates = test_orig.duplicated().any()
print(f'Original test ds contains duplicates: {test_has_duplicates}')

Original train ds shape: (101763, 22), contains missing values: False
Original train ds contains duplicates: False
Original test ds shape: (67842, 21), contains missing values: False
Original test ds contains duplicates: False


In [None]:
# Model selection: exhaustive grid search over XGBoost hyper-parameters,
# scored by ROC AUC with 5-fold cross-validation.
pipeline = Pipeline([
    ('classifier', XGBClassifier())
])

# Keys use the '<step name>__<parameter>' form so GridSearchCV routes them to the
# 'classifier' step. 7 * 4 * 4 = 112 candidates, each fitted once per fold.
param_grid = {
    'classifier__n_estimators': [100, 300, 500, 700, 900, 1000, 1200],
    'classifier__max_depth': [2, 3, 4, 5],
    'classifier__learning_rate': [0.0025, 0.005, 0.01, 0.02]
}

# n_jobs=1 runs the fits sequentially; error_score="raise" makes a failing fit abort
# the search instead of being recorded with a placeholder score.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=1,
    verbose=2,
    error_score="raise",
    scoring='roc_auc',
)

# Result recorded from an earlier Colab run, kept verbatim for reference.
# NOTE(review): it lists 'classifier__subsample', which is not part of the grid above,
# so it presumably came from a different grid; the "accuracy" label is how the score
# was printed at the time -- confirm which metric that run actually used.
# Best hyperparameters found:
# {'classifier__learning_rate': 0.02, 'classifier__max_depth': 4, 'classifier__n_estimators': 700, 'classifier__subsample': 0.5}
# Best cross-validation accuracy score: 0.79

# Separate the features from the 'defects' target column before fitting.
X_train = train_orig.drop(columns='defects')
y_train = train_orig['defects']
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid_search.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric, i.e. ROC AUC here,
# not accuracy.
print("Best cross-validation ROC AUC score: {:.2f}".format(grid_search.best_score_))

# Keep the best pipeline (refit on the full training data by GridSearchCV's default
# refit=True) so it can be used to predict on the test set.
best_model = grid_search.best_estimator_

In [None]:
# Build the submission file from the tuned model's predictions on the test frame.
# Column 1 of predict_proba is used as the score -- presumably the probability of the
# positive 'defects' class; confirm against best_model.classes_.
class_probabilities = best_model.predict_proba(test_orig)
y_pred = class_probabilities[:, 1]

# Index the scores by the test frame's index and write them out as a CSV.
submission = pd.Series(y_pred, index=test_orig.index, name='defects')
submission.to_csv('solution-v1_0.csv')