In [None]:
import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [None]:
# Collects the (auc, label, execution_time) tuples that cross_val appends.
result_list = []

# Train and test share the same layout: the 'id' column becomes the index.
train_orig, test_orig = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('./input/train.csv', './input/test.csv')
)
# The sample submission is read as-is (no index column is set).
sample_submission = pd.read_csv('./input/sample_submission.csv')

# Show the first rows of both frames.
print("Original train ds:")
display(train_orig.head())
print("Original test ds:")
display(test_orig.head())

In [None]:
# Number of missing values per training column; drives both the overall flag
# and the per-feature listing below.
missing_values_per_feature = train_orig.isna().sum()

# --- training set ---
print(f'Original train ds shape: {train_orig.shape}, contains missing values: {missing_values_per_feature.gt(0).any()}')
print(f'Original train ds contains duplicates: {train_orig.duplicated().any()}')
print('Features with missing values:')
print(missing_values_per_feature.loc[missing_values_per_feature.gt(0)])

# --- test set ---
print(f'Original test ds shape: {test_orig.shape}, contains missing values: {test_orig.isna().to_numpy().any()}')
print(f'Original test ds contains duplicates: {test_orig.duplicated().any()}')

In [None]:
def cross_val(model, label):
    """Cross-validate ``model`` on the global ``train_orig`` with a StratifiedKFold.

    The ``defects`` column is the target; every other column is a feature.
    For each fold the model is refit on the training part and scored with
    ROC AUC on the validation part. The mean AUC over all folds is printed
    and ``(auc, label, execution_time)`` is appended to the global
    ``result_list``.

    Parameters
    ----------
    model : estimator
        Object with ``fit`` and either ``predict_proba`` or
        ``decision_function``. It is refit in place on every fold.
    label : str
        Description shown in the printed line and stored in ``result_list``.

    Returns
    -------
    float
        Mean ROC AUC over the folds.
    """
    start_time = datetime.datetime.now()

    # Separate features and target once, so the per-fold slices never have to
    # be mutated (popping a column from an iloc slice can raise
    # SettingWithCopyWarning).
    X = train_orig.drop(columns='defects')
    y = train_orig['defects']

    # Fixed seed so every model is evaluated on the same folds
    # (the number of splits is scikit-learn's default).
    kf = StratifiedKFold(shuffle=True, random_state=1)
    auc_list = []
    for idx_tr, idx_va in kf.split(X, y):
        X_tr, y_tr = X.iloc[idx_tr], y.iloc[idx_tr]
        X_va, y_va = X.iloc[idx_va], y.iloc[idx_va]
        model.fit(X_tr, y_tr)
        try:
            y_va_pred = model.predict_proba(X_va)[:, 1]
        except AttributeError:  # 'LinearSVC' object has no attribute 'predict_proba'
            # ROC AUC only needs a ranking, so raw decision scores work too.
            y_va_pred = model.decision_function(X_va)
        auc_list.append(roc_auc_score(y_va, y_va_pred))

    auc = np.mean(auc_list)
    execution_time = datetime.datetime.now() - start_time
    # Drop the sub-second part by truncating to whole seconds instead of
    # slicing the string: str(timedelta) omits the fraction when microseconds
    # are zero and prepends "N day(s), " for long runs, which breaks slicing.
    elapsed = datetime.timedelta(seconds=int(execution_time.total_seconds()))
    print(f"# AUC {auc:.5f}   time={elapsed}   {label}")
    result_list.append((auc, label, execution_time))

    return auc

In [None]:
# Sweep min_samples_leaf for a random forest and record the cross-validated AUC
# of each setting. min_samples_leaf and max_features are random-forest
# parameters; XGBClassifier does not use them, so sweeping them on an XGBoost
# model would fit the same model five times.
score_list = []
for min_samples_leaf in [100, 150, 200, 250, 300]:
    auc = cross_val(RandomForestClassifier(n_estimators=100,
                                           min_samples_leaf=min_samples_leaf,
                                           max_features=1.0,
                                           random_state=42),
                    f"RF {min_samples_leaf=}")
    # (parameter value, mean AUC) pairs for later comparison.
    score_list.append((min_samples_leaf, auc))


In [None]:
# Exhaustive hyperparameter search for an XGBoost classifier, scored by ROC AUC.
pipeline = Pipeline([
    ('classifier', XGBClassifier())
])

# 7 * 4 * 4 = 112 parameter combinations, each cross-validated on 5 folds.
param_grid = {
    'classifier__n_estimators': [100,300,500,700,900,1000,1200],
    'classifier__max_depth': [2,3,4,5],
    'classifier__learning_rate': [0.0025,0.005,0.01,0.02]
}

# error_score="raise" makes a failing fit abort the search instead of being
# recorded as a NaN score.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=1, verbose=2, error_score="raise",
                           scoring='roc_auc')

grid_search.fit(train_orig.drop(['defects'], axis=1), train_orig.defects)

print("Best hyperparameters found:")
print(grid_search.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric, i.e.
# ROC AUC here (not accuracy). Five decimals match the precision cross_val prints.
print("Best cross-validation ROC AUC score: {:.5f}".format(grid_search.best_score_))

# Keep the best pipeline for predicting on the test set. With GridSearchCV's
# default refit behaviour it has been refit on the data passed to fit().
best_model = grid_search.best_estimator_

In [10]:
# Build the submission file from the tuned model.
# Column 1 of predict_proba holds the probability of the second class.
y_pred = best_model.predict_proba(test_orig)[:, 1]

# Index by the test ids so to_csv writes them as the first column,
# followed by a 'defects' column with the predictions.
submission = pd.Series(data=y_pred, index=test_orig.index, name='defects')
submission.to_csv('solution-v1_0.csv')

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
