In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.decomposition import PCA
from scipy.stats import boxcox

import xgboost as xgb

import joblib

# Random seed passed as `random_state` to the XGBoost gender classifier,
# the train/validation split and the saved stratified folds below.
SEED = 578

# playground & original concat

- playground test ratio: 0.4
- original test ratio: 0.3
- concat test ratio: 0.38 

In [2]:
# original_train_df = pd.read_csv("../data/original_train.csv")
# playground_train_df = pd.read_csv("../data/playground_train.csv").drop(columns=['id'])
# concat_train_df = pd.concat([original_train_df, playground_train_df])
# concat_train_df.to_csv('../data/concat_train.csv')

# original_test_df = pd.read_csv("../data/original_test.csv")
# playground_test_df = pd.read_csv("../data/playground_test.csv").drop(columns=['id'])
# concat_test_df = pd.concat([original_test_df, playground_test_df])
# concat_test_df.to_csv('../data/concat_test.csv')

# load data

In [3]:
# Read the pre-concatenated (original + playground) train and test tables;
# the first CSV column holds the saved row index.
concat_train_df, concat_test_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/concat_train.csv", "../data/concat_test.csv")
]

# add features

In [4]:
def add_gender_feature(df, lower_threshold, upper_threshold):
    """Impute a binary ``gender`` column and keep only confidently labelled rows.

    An XGBoost classifier is cross-validated on ``../data/gender.csv``
    (target ``gender`` mapped F -> 0, M -> 1) and the fitted estimator of every
    fold is kept. The probability of class 1 is averaged over all estimators
    for each row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. Every column except ``smoking`` and ``gender`` is fed
        to the classifiers, so those columns must match the features of
        ``gender.csv``. The caller's frame is not modified.
    lower_threshold : float
        Rows with an averaged probability strictly below this are labelled 0.
    upper_threshold : float
        Rows with an averaged probability strictly above this are labelled 1.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``RangeIndex`` containing only the rows that
        passed one of the thresholds, with ``gender`` set to 0.0 or 1.0.
        Rows in between are dropped.

    Notes
    -----
    Prints ``<rows before> → <rows after>``. The classifiers are re-trained on
    every call.
    """
    gender = pd.read_csv("../data/gender.csv", index_col=0)

    X_gender = gender.drop(['gender', 'oral', 'tartar', 'smoking'], axis=1)
    y_gender = gender['gender'].map({'F': 0, 'M': 1})

    # Seed the splitter as well as the model: without it the folds (and hence
    # the probabilities and the set of rows kept) differ on every run.
    folds = RepeatedStratifiedKFold(random_state=SEED)
    clfs = cross_validate(xgb.XGBClassifier(random_state=SEED), X_gender, y_gender,
                          scoring='roc_auc', cv=folds, return_estimator=True)
    estimators = clfs['estimator']

    # Accumulate the probabilities outside the frame so the caller's object is
    # not left with a half-finished 'gender' column.
    features = df[[col for col in df.columns if col not in ['smoking', 'gender']]]
    male_proba = np.zeros(len(df))
    for clf in estimators:
        male_proba += clf.predict_proba(features)[:, 1]
    male_proba /= len(estimators)

    is_male = male_proba > upper_threshold
    is_female = male_proba < lower_threshold
    confident = is_male | is_female

    print(len(df), end=' → ')
    df = df[confident].reset_index(drop=True)
    df['gender'] = is_male[confident].astype(float)
    print(len(df))

    return df

# Keep only rows whose predicted gender probability is below 0.1 or above 0.9.
concat_train_df = add_gender_feature(concat_train_df, lower_threshold=0.1, upper_threshold=0.9)
concat_test_df = add_gender_feature(concat_test_df, lower_threshold=0.1, upper_threshold=0.9)

198240 → 186681
122879 → 115621


In [5]:
def add_categorical_features(df):
    """Append BMI and banded (categorical) versions of the raw measurements.

    The frame is modified in place and also returned. Added columns, in order:
    ``BMI``, ``BMI_category``, ``waist_category``, ``age_category``,
    ``BP_category``, ``Cholesterol_category``, ``HDL_category``,
    ``LDL_category``, ``triglyceride_category``, ``hemoglobin_category``,
    ``serum creatinine_category``, ``Gtp_category``, ``AST_category`` and
    ``ALT_category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``weight(kg)``, ``height(cm)``, ``waist(cm)``, ``age``,
        ``gender`` (0 or 1; ``add_gender_feature`` encodes F as 0 and M as 1),
        ``systolic``, ``relaxation``, ``Cholesterol``, ``HDL``, ``LDL``,
        ``triglyceride``, ``hemoglobin``, ``serum creatinine``, ``Gtp``,
        ``AST`` and ``ALT``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Columns built with ``pd.cut`` are ordered categoricals; values outside the
    bins (negative or missing) become NaN. The remaining columns hold plain
    strings, and a missing measurement falls into the ``else`` label of the
    corresponding rule.
    """
    inf = float('inf')

    df['BMI'] = df['weight(kg)'] / ((df['height(cm)'] / 100) ** 2)
    df['BMI_category'] = pd.cut(df['BMI'],
                                 bins=[0, 18.5, 25, 30, 35, 40, inf],
                                 labels=['underweight', 'normal', 'overweight', 'obese-1', 'obese-2', 'obese-3'],
                                 right=False)

    # Waist bands use gender-specific cut-offs with right-closed intervals:
    # gender 0 -> (0, 80], (80, 88], (88, inf); gender 1 -> (0, 94], (94, 102], (102, inf).
    # Rows with any other gender value, or a non-positive/missing waist, get NaN.
    gender = df['gender'].to_numpy()
    is_male = gender == 1
    is_female = gender == 0
    waist = df['waist(cm)'].to_numpy()
    high_from = np.where(is_male, 94, 80)
    very_high_from = np.where(is_male, 102, 88)
    classifiable = (is_male | is_female) & (waist > 0)
    waist_codes = np.select(
        [~classifiable, waist <= high_from, waist <= very_high_from],
        [-1, 0, 1],
        default=2,
    )
    df['waist_category'] = pd.Categorical.from_codes(
        waist_codes, categories=['low', 'high', 'very high'], ordered=True)

    df['age_category'] = np.where(df['age'] >= 45, 'high', 'low')

    # Blood pressure: the most severe band matched by either reading wins.
    # The bands are contiguous (80-89, 90-99, 100-119, >=120 for the diastolic
    # reading); the previous upper bounds of "< 89" and "< 99" left 89 and 99
    # unmatched, so those rows silently dropped to a lower band.
    systolic = df['systolic'].to_numpy()
    diastolic = df['relaxation'].to_numpy()
    df['BP_category'] = np.select(
        [
            (systolic >= 180) | (diastolic >= 120),
            (systolic >= 140) | (diastolic >= 100),
            (systolic >= 130) | (diastolic >= 90),
            (systolic >= 120) | (diastolic >= 80),
        ],
        ['high BP stage 3', 'high BP stage 2', 'high BP stage 1', 'elevated'],
        default='normal',
    )

    # With right=False every bin is [lower, upper): the upper edge must be the
    # first value of the NEXT class. The former edges (239, 129/159/189,
    # 199/499) were the inclusive upper bounds, which pushed e.g. 239 into the
    # class above it.
    df['Cholesterol_category'] = pd.cut(df['Cholesterol'],
                                        bins=[0, 200, 240, inf],
                                        labels=['desirable', 'borderline high', 'high'],
                                        right=False)

    # NOTE(review): the lowest HDL band is labelled 'high' and the highest
    # 'low' — presumably a risk level rather than the HDL level itself.
    # Kept as-is because the labels become one-hot column names; confirm.
    df['HDL_category'] = pd.cut(df['HDL'],
                                bins=[0, 40, 60, inf],
                                labels=['high', 'normal', 'low'],
                                right=False)

    df['LDL_category'] = pd.cut(df['LDL'],
                                bins=[0, 100, 130, 160, 190, inf],
                                labels=['optimal', 'near optimal', 'borderline high', 'high', 'very high'],
                                right=False)

    df['triglyceride_category'] = pd.cut(df['triglyceride'],
                                         bins=[0, 150, 200, 500, inf],
                                         labels=['normal', 'moderate', 'high', 'very high'],
                                         right=False)

    # 'high' is assigned when hemoglobin is BELOW the cut-off (110 for age < 5
    # or gender 0, otherwise 120).
    # NOTE(review): 110/120 look like g/L thresholds; if the column is stored
    # in g/dL every row ends up 'high' — confirm the unit.
    hemoglobin_cutoff = np.where((df['age'] < 5) | (df['gender'] == 0), 110, 120)
    df['hemoglobin_category'] = np.where(df['hemoglobin'] < hemoglobin_cutoff, 'high', 'normal')

    # Reference range depends on gender: 0.74-1.35 for gender 1, 0.59-1.04 otherwise.
    creatinine = df['serum creatinine']
    creatinine_low = np.where(df['gender'] == 1, 0.74, 0.59)
    creatinine_high = np.where(df['gender'] == 1, 1.35, 1.04)
    df['serum creatinine_category'] = np.where(
        (creatinine >= creatinine_low) & (creatinine <= creatinine_high), 'normal', 'abnormal')

    # Liver enzymes: inclusive reference ranges.
    df['Gtp_category'] = np.where(df['Gtp'].between(5, 40), 'normal', 'abnormal')

    df['AST_category'] = np.where(df['AST'].between(8, 45), 'normal', 'abnormal')

    df['ALT_category'] = np.where(df['ALT'].between(8, 45), 'normal', 'abnormal')

    return df

# Derive the banded features for both tables (each frame is extended in place
# and handed back by the function).
concat_train_df = concat_train_df.pipe(add_categorical_features)
concat_test_df = concat_test_df.pipe(add_categorical_features)

# categorical features one-hot encoding

In [6]:
# Columns that are one-hot encoded: the raw categorical measurements plus the
# banded features derived by add_categorical_features.
categorical_features = [
    'hearing(left)',
    'hearing(right)',
    'Urine protein',
    'dental caries',
    'BMI_category',
    'waist_category',
    'age_category',
    'BP_category',
    'Cholesterol_category',
    'HDL_category',
    'LDL_category',
    'triglyceride_category',
    'hemoglobin_category',
    'serum creatinine_category',
    'Gtp_category',
    'AST_category',
    'ALT_category'
]

concat_train_df = pd.get_dummies(concat_train_df, columns=categorical_features)
concat_test_df = pd.get_dummies(concat_test_df, columns=categorical_features)

# The two frames are encoded independently, so a level that only occurs in the
# training data would leave the test frame without that indicator column.
# Add any such column to the test frame as all zeros so both share the
# indicator columns the model is trained on. Indicators that exist only in the
# test frame are left untouched.
_dummy_prefixes = tuple(f'{name}_' for name in categorical_features)
_missing_in_test = [
    column for column in concat_train_df.columns
    if str(column).startswith(_dummy_prefixes) and column not in concat_test_df.columns
]
for _column in _missing_in_test:
    concat_test_df[_column] = np.zeros(len(concat_test_df), dtype=concat_train_df[_column].dtype)

# continuous features standardization

In [8]:
# After pd.get_dummies the names in `categorical_features` no longer exist as
# columns; they have been replaced by '<name>_<level>' indicators. Comparing
# against the original names alone therefore let every one-hot column through,
# and those columns were then scaled, IQR-clipped and log-transformed like
# real measurements. Exclude the indicator columns by prefix as well.
# NOTE(review): 'gender' is binary but is not listed in `categorical_features`,
# so it is still treated as continuous here — confirm that is intended.
_dummy_prefixes = tuple(f'{name}_' for name in categorical_features)
continuous_features = [
    feature for feature in concat_train_df.columns
    if feature != 'smoking'
    and feature not in categorical_features
    and not str(feature).startswith(_dummy_prefixes)
]

def fit_scaler(train_df, continuous_features, method):
    """Fit a scikit-learn scaler on the selected columns of ``train_df``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose columns are used to fit the scaler.
    continuous_features : list
        Names of the columns to fit on.
    method : str
        ``'zscore'`` (StandardScaler), ``'minmax'`` (MinMaxScaler) or
        ``'robust'`` (RobustScaler).

    Returns
    -------
    The fitted scaler object.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'zscore':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    elif method == 'robust':
        scaler = RobustScaler()
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on `scaler`, which hid the actual mistake.
        raise ValueError(
            f"unknown scaling method {method!r}; expected 'zscore', 'minmax' or 'robust'"
        )

    scaler.fit(train_df[continuous_features])

    return scaler

# Fit a min-max scaler on the training table, then rescale the continuous
# columns of both tables with it.
scaler = fit_scaler(concat_train_df, continuous_features, 'minmax')

for frame in (concat_train_df, concat_test_df):
    frame[continuous_features] = scaler.transform(frame[continuous_features])

# concat train → train, valid

In [9]:
# Stratified 60/40 split of the training table on the `smoking` target.
# The parts are edited in place by later cells, so take explicit copies: that
# makes them independent frames rather than selections of `concat_train_df`
# (which pandas can flag with SettingWithCopyWarning when they are written to).
train_df, valid_df = (
    part.copy()
    for part in train_test_split(concat_train_df,
                                 test_size=0.4,
                                 stratify=concat_train_df['smoking'],
                                 random_state=SEED)
)

# handle outliers

In [10]:
def handle_outliers(df, col, method):
    """Treat values of ``df[col]`` outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed from
    ``df[col]`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified.
    col : str
        Column to inspect.
    method : str
        ``'replace'`` clips values to the fences; ``'remove'`` drops the rows
        that fall outside them. Any other value leaves the data unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame for ``'replace'`` / ``'remove'``; the input frame itself
        when nothing is done.

    Notes
    -----
    If the IQR is zero or undefined (for example a column where at least half
    of the values are identical, such as a rare 0/1 flag, or an all-NaN
    column) the fences collapse onto a single value and every other value
    would count as an outlier. Such columns are returned unchanged.
    """
    percentile_25 = df[col].quantile(0.25)
    percentile_75 = df[col].quantile(0.75)

    iqr = percentile_75 - percentile_25

    # `not iqr > 0` is also True for NaN, i.e. when the quantiles are undefined.
    if not iqr > 0:
        return df

    upper_limit = percentile_75 + 1.5 * iqr
    lower_limit = percentile_25 - 1.5 * iqr

    if method == 'replace':
        # Work on a copy so both methods leave the caller's frame untouched,
        # and replace the whole column so an integer column can become float
        # instead of having float fences written into integer storage.
        df = df.copy()
        df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    elif method == 'remove':
        df = df[(df[col] >= lower_limit) & (df[col] <= upper_limit)]

    return df

# Clip each continuous column of the training split to its own IQR fences.
for col in continuous_features:
    train_df = handle_outliers(df=train_df, col=col, method='replace')

# handle skewness

In [11]:
def handle_skewness(df, col, method):
    """Apply a skew-reducing transform to ``df[col]`` and return ``df``.

    ``method='log'`` stores ``log1p`` of the column; ``method='boxcox'``
    stores the first element returned by ``scipy.stats.boxcox`` (the fitted
    lambda is discarded). Any other method is a no-op. The column is
    overwritten on the frame that was passed in.
    """
    if method == 'log':
        df[col] = np.log1p(df[col])
        return df

    if method == 'boxcox':
        transformed, _fitted_lambda = boxcox(df[col])
        df[col] = transformed

    return df

# log1p is a fixed, stateless transform, so it has to be applied to every
# split: transforming only the training data would leave the validation and
# test features on a different scale from the one the model is trained on.
for col in continuous_features:
    train_df = handle_skewness(train_df, col, 'log')
    valid_df = handle_skewness(valid_df, col, 'log')
    concat_test_df = handle_skewness(concat_test_df, col, 'log')

# save data

In [12]:
# Persist the three preprocessed tables (row index included).
for frame, csv_path in (
    (train_df, '../data/preprocess_train.csv'),
    (valid_df, '../data/preprocess_valid.csv'),
    (concat_test_df, '../data/preprocess_test.csv'),
):
    frame.to_csv(csv_path)

# save folds

In [13]:
# Number of stratified folds generated for the train and validation splits;
# also embedded in the names of the saved fold files.
n_splits = 10

In [14]:
# One seeded splitter serves both tables: with an integer random_state every
# call to split() reproduces the same shuffling.
fold_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# Each entry is a (train positions, test positions) pair for one fold.
train_folds = list(fold_splitter.split(train_df, train_df['smoking']))
joblib.dump(train_folds, f'../fold/{n_splits}_train_stratifiedkfolds.jl')

valid_folds = list(fold_splitter.split(valid_df, valid_df['smoking']))
joblib.dump(valid_folds, f'../fold/{n_splits}_valid_stratifiedkfolds.jl')

['../fold/10_train_stratifiedkfolds.jl']