# Define Dataset

In [10]:
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold

data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
df_data = pd.read_csv(data_url)
col_data = df_data.columns = ['id', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion ', 'Single Epithelial Cell Size',
                              'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

df_data['Bare Nuclei'] = df_data['Bare Nuclei'].replace('?',0).astype(int)
df_data['Class'] = df_data['Class'].replace({2:0, 4:1})

# Define Train Code

In [11]:
def cross_valid(
    X: np.array, y: np.array,
    model=XGBClassifier(),
    cv=StratifiedKFold(n_splits=5, shuffle = True, random_state=42),
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    **kwargs,
):
    # model.fit(X_features, y_label) # !! cross_validate 에서 train 동작이 있으므로 지워야 하는 코드 !!
    cv_result = cross_validate(model, X, y, cv=StratifiedKFold(n_splits=5, shuffle = True, random_state=42), scoring=scoring, **kwargs)
    for score_name in cv_result:
        if 'test' in score_name:
            test_score_mean, test_score_std = np.mean(cv_result[score_name]), np.std(cv_result[score_name])
            print(f'{score_name}: {test_score_mean:.4f} ± {test_score_std:.4f}') # 유효숫자 소수점 아래 4 자리까지 표시


# Define Missing Value Setting function

In [12]:
from typing import Tuple


train_col = ['Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion ',
             'Single Epithelial Cell Size','Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']

def set_missing_value(df: pd.DataFrame, ratio: float) -> Tuple[np.array, np.array]:

    missing_length = int(len(df) * ratio)

    print(f'{missing_length=}')
    
    df = df.copy()
    df.loc[:missing_length-1, train_col] = np.nan
    df = df.fillna(0)

    # df_data[train_col]을 array 형태로 변경
    X = df[train_col].to_numpy()
    # df_data['Class'] : 판다스 series type을 array형태로 변경
    y = df['Class'].to_numpy()

    # X: features matrix, y: label vector
    return X, y

# xgb classifier을 이용하여 결측치가 없을 때 성능 예측

In [13]:
X, y = set_missing_value(df_data, 0)
print(f'{X.shape=}', f'{y.shape=}')
cross_valid(X, y)

missing_length=0
X.shape=(698, 9) y.shape=(698,)
test_accuracy: 0.9613 ± 0.0155
test_f1: 0.9447 ± 0.0221
test_recall: 0.9583 ± 0.0228
test_precision: 0.9315 ± 0.0240


# xgb classifier을 이용하여 결측치 20% -> zero imputation

In [14]:
X, y = set_missing_value(df_data, 0.2)
cross_valid(X, y)

missing_length=139
test_accuracy: 0.8897 ± 0.0200
test_f1: 0.8166 ± 0.0399
test_recall: 0.7178 ± 0.0571
test_precision: 0.9501 ± 0.0122


# xgb classifier을 이용하여 결측치 40% -> zero imputation

In [18]:
X, y = set_missing_value(df_data, 0.4)
cross_valid(X, y)

missing_length=279
test_accuracy: 0.8066 ± 0.0273
test_f1: 0.6175 ± 0.0742
test_recall: 0.4610 ± 0.0854
test_precision: 0.9592 ± 0.0225


# xgb classifier을 이용하여 결측치 60% -> zero imputation

In [16]:
X, y = set_missing_value(df_data, 0.6)
cross_valid(X, y)

missing_length=418
test_accuracy: 0.7393 ± 0.0249
test_f1: 0.4043 ± 0.0895
test_recall: 0.2618 ± 0.0714
test_precision: 0.9462 ± 0.0675


# xgb classifier을 이용하여 결측치 80% -> zero imputation

In [17]:
X, y = set_missing_value(df_data, 0.8)
cross_valid(X, y)

missing_length=558
test_accuracy: 0.7006 ± 0.0187
test_f1: 0.2425 ± 0.0772
test_recall: 0.1413 ± 0.0504
test_precision: 0.9418 ± 0.0792
