In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from imblearn.under_sampling import TomekLinks

import xgboost as xgb

import joblib

SEED = 578
np.random.seed(SEED)

pd.set_option('display.max_columns', None)

# playground & original concat

- playground test ratio: 0.4
- original test ratio: 0.3
- concat test ratio: 0.38 

In [2]:
original_train_df = pd.read_csv("../data/original_train.csv")
original_train_df['original'] = 1
playground_train_df = pd.read_csv("../data/playground_train.csv").drop(columns=['id'])
playground_train_df['original'] = 0
concat_train_df = pd.concat([original_train_df[[col for col in original_train_df.columns if col != 'original']], playground_train_df[[col for col in original_train_df.columns if col != 'original']]]).drop_duplicates().reset_index(drop=True)
concat_train_df.to_csv('../data/concat_train.csv')

original_test_df = pd.read_csv("../data/original_test.csv")
playground_test_df = pd.read_csv("../data/playground_test.csv").drop(columns=['id'])
concat_test_df = pd.concat([original_test_df, playground_test_df]).reset_index(drop=True)
concat_test_df.to_csv('../data/concat_test.csv')

# load data

In [3]:
train_df = pd.read_csv("../data/concat_train.csv", index_col=0)
test_df = pd.read_csv("../data/concat_test.csv", index_col=0)

# add features

## add continuous features

In [4]:
def add_continuous_features(df):
    gender = pd.read_csv("../data/gender.csv", index_col=0)
    X_gender = gender.drop(['gender', 'oral', 'tartar', 'smoking'], axis=1)
    y_gender = gender['gender'].map({'F': 0, 'M': 1})
    folds =  RepeatedStratifiedKFold()
    clfs = cross_validate(xgb.XGBClassifier(random_state=SEED), X_gender, y_gender, scoring='roc_auc', cv=folds, return_estimator=True)
    df['gender'] = 0
    for clf in clfs['estimator']:
        df['gender'] += clf.predict_proba(df[[col for col in df.columns if col not in ['smoking','gender']]])[:, 1]
    df['gender'] /= len(clfs['estimator'])

    df['BMI'] = df['weight(kg)'] / ((df['height(cm)'] / 100) ** 2)

    best_hearing = np.where(df['hearing(left)'] < df['hearing(right)'], 
                    df['hearing(left)'],  df['hearing(right)'])
    worst_hearing = np.where(df['hearing(left)'] < df['hearing(right)'], 
                     df['hearing(right)'],  df['hearing(left)'])
    df['hearing(left)'] = best_hearing - 1
    df['hearing(right)'] = worst_hearing - 1
    
    df['eyesight(left)'] = np.where(df['eyesight(left)'] > 9, 0, df['eyesight(left)'])
    df['eyesight(right)'] = np.where(df['eyesight(right)'] > 9, 0, df['eyesight(right)'])
    best_eyesight = np.where(df['eyesight(left)'] < df['eyesight(right)'], 
                    df['eyesight(left)'],  df['eyesight(right)'])
    worst_eyesight = np.where(df['eyesight(left)'] < df['eyesight(right)'], 
                     df['eyesight(right)'],  df['eyesight(left)'])
    df['eyesight(left)'] = best_eyesight
    df['eyesight(right)'] = worst_eyesight
    
    df['Gtp'] = np.clip(df['Gtp'], 0, 300)
    df['HDL'] = np.clip(df['HDL'], 0, 110)
    df['LDL'] = np.clip(df['LDL'], 0, 200)
    df['ALT'] = np.clip(df['ALT'], 0, 150)
    df['AST'] = np.clip(df['AST'], 0, 100)
    df['serum creatinine'] = np.clip(df['serum creatinine'], 0, 3)  

    return df

train_df = add_continuous_features(train_df)
test_df = add_continuous_features(test_df)

## add categorical features

In [5]:
def add_categorical_features(df):
    df['BMI_category'] = pd.cut(df['BMI'], 
                                 bins=[0, 18.5, 25, 30, 35, 40, float('inf')], 
                                 labels=['underweight', 'normal', 'overweight', 'obese-1', 'obese-2', 'obese-3'],
                                 right=False)

    df['approx_gender'] = np.where(df['gender'] < 0.5, 0, 1)
    df.loc[df['approx_gender'] == 0, 'waist_category'] = pd.cut(df[df['gender'] == 0]['waist(cm)'],
                                                      bins=[0, 80, 88, float('inf')], 
                                                      labels=['low', 'high', 'very high'])
    df.loc[df['approx_gender'] == 1, 'waist_category'] = pd.cut(df[df['gender'] == 1]['waist(cm)'],
                                                      bins=[0, 94, 102, float('inf')], 
                                                      labels=['low', 'high', 'very high'])
    df.drop('approx_gender', axis=1, inplace=True)

    df['age_category'] = df['age'].apply(lambda x: 'high' if x >= 45 else 'low')

    df['BP_category'] = 'normal'
    df.loc[(df['systolic'] >= 120) & (df['systolic'] < 130) | (df['relaxation'] >= 80) & (df['relaxation'] < 89), 'BP_category'] = 'elevated'
    df.loc[(df['systolic'] >= 130) & (df['systolic'] < 140) | (df['relaxation'] >= 90) & (df['relaxation'] < 99), 'BP_category'] = 'high BP stage 1'
    df.loc[(df['systolic'] >= 140) & (df['systolic'] < 180) | (df['relaxation'] >= 100) & (df['relaxation'] < 120), 'BP_category'] = 'high BP stage 2'
    df.loc[(df['systolic'] >= 180) | (df['relaxation'] >= 120), 'BP_category'] = 'high BP stage 3'

    df['Cholesterol_category'] = pd.cut(df['Cholesterol'],
                                     bins=[0, 200, 239, float('inf')],
                                     labels=['desirable', 'borderline high', 'high'],
                                     right=False)

    df['HDL_category'] = pd.cut(df['HDL'],
                             bins=[0, 40, 60, float('inf')],
                             labels=['high', 'normal', 'low'],
                             right=False)

    df['LDL_category'] = pd.cut(df['LDL'],
                             bins=[0, 100, 129, 159, 189, float('inf')],
                             labels=['optimal', 'near optimal', 'borderline high', 'high', 'very high'],
                             right=False)

    df['triglyceride_category'] = pd.cut(df['triglyceride'],
                                      bins=[0, 150, 199, 499, float('inf')],
                                      labels=['normal', 'moderate', 'high', 'very high'],
                                      right=False)

    df['hemoglobin_category'] = df.apply(lambda x: 'high' if (x['hemoglobin'] < 110 if x['age'] < 5 or x['gender'] == 0 else x['hemoglobin'] < 120) else 'normal', axis=1)

    df['serum creatinine_category'] = df.apply(lambda x: 'normal' if (0.74 <= x['serum creatinine'] <= 1.35 if x['gender'] == 1 else 0.59 <= x['serum creatinine'] <= 1.04) else 'abnormal', axis=1)
 
    df['Gtp_category'] = df['Gtp'].apply(lambda x: 'normal' if 5 <= x <= 40 else 'abnormal')

    df['AST_category'] = df['AST'].apply(lambda x: 'normal' if 8 <= x <= 45 else 'abnormal')

    df['ALT_category'] = df['ALT'].apply(lambda x: 'normal' if 8 <= x <= 45 else 'abnormal')

    return df

train_df = add_categorical_features(train_df)
test_df = add_categorical_features(test_df)

## add kmeans features

In [6]:
categorical_features = [
    'hearing(left)',
    'hearing(right)',
    'Urine protein',
    'dental caries',
    'original',
    'BMI_category',
    'waist_category',
    'age_category',
    'BP_category',
    'Cholesterol_category',
    'HDL_category',
    'LDL_category',
    'triglyceride_category',
    'hemoglobin_category',
    'serum creatinine_category',
    'Gtp_category',
    'AST_category',
    'ALT_category'
]
continuous_features = [feature for feature in train_df.columns if feature not in categorical_features and feature != 'smoking']

def add_kmeans_features(df, continuous_features, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=SEED).fit(df[continuous_features])
    
    df['kmeans_cluster_label'] = kmeans.labels_

    return df

train_df = add_kmeans_features(train_df, continuous_features, 3)
test_df = add_kmeans_features(test_df, continuous_features, 3)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


# categorical features one-hot encoding

In [7]:
train_df = pd.get_dummies(train_df, columns=[col for col in train_df.columns if col in categorical_features])
test_df = pd.get_dummies(test_df, columns=[col for col in test_df.columns if col in categorical_features])

# continuous features standardization

In [8]:
def fit_scaler(train_df, continuous_features, method):
    if method == 'zscore':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    elif method == 'robust':
        scaler = RobustScaler()

    scaler.fit(train_df[continuous_features])

    return scaler

scaler = fit_scaler(train_df, continuous_features, 'minmax')

train_df[continuous_features] = scaler.transform(train_df[continuous_features])
test_df[continuous_features] = scaler.transform(test_df[continuous_features])

# train → train, valid

In [9]:
train_df, valid_df = train_test_split(train_df, 
                                      test_size=0.4, 
                                      stratify=train_df['smoking'], 
                                      random_state=SEED)

# handle outliers

In [10]:
def handle_outliers(df, col, method):
    percentile_25 = df[col].quantile(0.25)
    percentile_75 = df[col].quantile(0.75)

    iqr = percentile_75 - percentile_25

    upper_limit = percentile_75 + 1.5 * iqr
    lower_limit = percentile_25 - 1.5 * iqr

    if method == 'replace':
        df.loc[df[col] > upper_limit, col] = upper_limit
        df.loc[df[col] < lower_limit, col] = lower_limit
    elif method == 'remove':
        df = df[(df[col] >= lower_limit) & (df[col] <= upper_limit)]

    return df

for col in continuous_features:
    train_df = handle_outliers(train_df, col, 'replace')

# tomek links

In [11]:
# X = train_df.drop('smoking',axis =1)
# y = train_df[['smoking']]
    
# def tomek_links(X,y): 
#     tl = TomekLinks(sampling_strategy='auto')
#     X_resampled, y_resampled = tl.fit_resample(X, y)
    
#     return X_resampled , y_resampled

# X_resampled, y_resampled = tomek_links (X,y)
# train_df = pd.concat([X_resampled,y_resampled], axis=1).reset_index(drop=True)

# summary

In [12]:
def summary(df):
    sum = pd.DataFrame(df.dtypes, columns=['dtypes'])
    sum['missing#'] = df.isna().sum()
    sum['missing%'] = (df.isna().sum())/len(df)
    sum['uniques'] = df.nunique().values
    sum['count'] = df.count().values
    return sum

summary(train_df).style.background_gradient(cmap='Blues')

Unnamed: 0,dtypes,missing#,missing%,uniques,count
age,float64,0,0.0,15,115633
height(cm),float64,0,0.0,12,115633
weight(kg),float64,0,0.0,18,115633
waist(cm),float64,0,0.0,463,115633
eyesight(left),float64,0,0.0,17,115633
eyesight(right),float64,0,0.0,14,115633
systolic,float64,0,0.0,70,115633
relaxation,float64,0,0.0,49,115633
fasting blood sugar,float64,0,0.0,54,115633
Cholesterol,float64,0,0.0,170,115633


# save data

In [13]:
train_df.to_csv('../data/preprocess_train.csv')
valid_df.to_csv('../data/preprocess_valid.csv')
test_df.to_csv('../data/preprocess_concat_test.csv')
test_df[len(original_test_df):].to_csv('../data/preprocess_test.csv')

# save folds

In [14]:
n_splits = 20

In [15]:
train_folds = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED).split(train_df, train_df['smoking']))
joblib.dump(train_folds, f'../fold/{n_splits}_train_stratifiedkfolds.jl')

['../fold/20_train_stratifiedkfolds.jl']