In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from scipy.stats import boxcox

import joblib

# Seed handed to StratifiedKFold as random_state when the folds are saved.
SEED = 578

# playground & original concat

In [3]:
# One-off step, kept commented out: stacks the original and playground CSVs
# (playground frames drop their 'id' column first) and writes
# ../data/concat_train.csv and ../data/concat_test.csv.
# NOTE(review): pd.concat is called without ignore_index, so the saved index
# presumably contains repeated labels — confirm before selecting rows by label.
# original_train_df = pd.read_csv("../data/original_train.csv")
# playground_train_df = pd.read_csv("../data/playground_train.csv").drop(columns=['id'])
# concat_train_df = pd.concat([original_train_df, playground_train_df])
# concat_train_df.to_csv('../data/concat_train.csv')

# original_test_df = pd.read_csv("../data/original_test.csv")
# playground_test_df = pd.read_csv("../data/playground_test.csv").drop(columns=['id'])
# concat_test_df = pd.concat([original_test_df, playground_test_df])
# concat_test_df.to_csv('../data/concat_test.csv')

# load data

In [6]:
# Read the concatenated train and test CSVs; the first CSV column becomes the index.
concat_train_df, concat_test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/concat_train.csv", "../data/concat_test.csv")
)

# add features

In [None]:
# Display the training frame's column labels (the cell's last expression is its output).
concat_train_df.columns

In [26]:
def add_features(df):
    """Attach a ``BMI`` column derived from weight and height.

    BMI is ``weight(kg)`` divided by the square of the height in metres,
    where the height is read from ``height(cm)`` and divided by 100.
    The column is written onto ``df`` itself and that same object is returned.
    """
    height_m = df['height(cm)'] / 100
    df['BMI'] = df['weight(kg)'] / (height_m ** 2)
    return df

# Add the engineered columns to the train frame first, then to the test frame.
concat_train_df, concat_test_df = map(add_features, (concat_train_df, concat_test_df))

# categorical features one-hot encoding

In [41]:
categorical_features = ['hearing(left)','hearing(right)','Urine protein','dental caries']

# Encoding each frame on its own yields different dummy columns whenever a level
# is missing from one of them. Pin every feature's levels to those observed in
# the training frame so both frames get exactly the same '<feature>_<level>'
# columns: a level absent from test becomes an all-zero column, and a test value
# never seen in train becomes an all-zero row for that feature.
for feature in categorical_features:
    train_levels = sorted(concat_train_df[feature].dropna().unique())
    concat_train_df[feature] = pd.Categorical(concat_train_df[feature], categories=train_levels)
    concat_test_df[feature] = pd.Categorical(concat_test_df[feature], categories=train_levels)

concat_train_df = pd.get_dummies(concat_train_df, columns=categorical_features)
concat_test_df = pd.get_dummies(concat_test_df, columns=categorical_features)

# continuous features standardization

In [40]:
# After pd.get_dummies the categorical columns are named '<feature>_<level>', so
# they no longer match the names in categorical_features. Exclude them by prefix
# as well; otherwise the 0/1 indicator columns would be treated as continuous
# and passed through scaling, outlier clipping and the skew transform.
dummy_prefixes = tuple(f'{name}_' for name in categorical_features)
continuous_features = [
    feature
    for feature in concat_train_df.columns
    if feature != 'smoking'
    and feature not in categorical_features
    and not feature.startswith(dummy_prefixes)
]

def standarize_features(df, features, method):
    """Rescale the given columns of ``df`` with a scikit-learn scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to rescale. The columns are overwritten on
        this object.
    features : list of str
        Column labels to rescale.
    method : {'zscore', 'minmax', 'robust'}
        Selects StandardScaler, MinMaxScaler or RobustScaler respectively.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``features`` replaced by scaled values.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.

    Notes
    -----
    The scaler is fitted on ``df`` itself, so each call uses the statistics of
    the frame it is given.
    """
    if method == 'zscore':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    elif method == 'robust':
        scaler = RobustScaler()
    else:
        # An unknown name used to fall through and fail below on an unbound
        # `scaler`; fail early with a message that names the bad value.
        raise ValueError(
            f"Unknown method {method!r}; expected 'zscore', 'minmax' or 'robust'"
        )

    df[features] = scaler.fit_transform(df[features])
    return df

# Min-max scale the continuous columns of the train frame, then of the test frame.
# NOTE(review): each frame is scaled by a scaler fitted on that same frame, so
# train and test use different min/max values — confirm this is intended.
concat_train_df, concat_test_df = (
    standarize_features(frame, continuous_features, 'minmax')
    for frame in (concat_train_df, concat_test_df)
)

['age',
 'height(cm)',
 'weight(kg)',
 'waist(cm)',
 'eyesight(left)',
 'eyesight(right)',
 'systolic',
 'relaxation',
 'fasting blood sugar',
 'Cholesterol',
 'triglyceride',
 'HDL',
 'LDL',
 'hemoglobin',
 'serum creatinine',
 'AST',
 'ALT',
 'Gtp',
 'BMI']

# handle outliers

In [None]:
def handle_outliers(df, col, method):
    """Treat values of ``df[col]`` that fall outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 0.25 and 0.75 quantiles of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Label of the column to process.
    method : str
        ``'replace'`` caps values at the fences and writes the result back
        onto ``df``. ``'remove'`` returns a new frame holding only the rows
        whose value lies within the fences (inclusive); ``df`` is left
        untouched. Any other value leaves ``df`` unchanged.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself for ``'replace'`` and unrecognised methods, an
        independent filtered copy for ``'remove'``.
    """
    percentile_25 = df[col].quantile(0.25)
    percentile_75 = df[col].quantile(0.75)

    iqr = percentile_75 - percentile_25

    upper_limit = percentile_75 + 1.5 * iqr
    lower_limit = percentile_25 - 1.5 * iqr

    if method == 'replace':
        # clip() builds a new column instead of writing float limits into the
        # existing one cell by cell, so integer columns are upcast cleanly
        # rather than hitting pandas' incompatible-dtype assignment path.
        df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    elif method == 'remove':
        # between() is inclusive on both ends; copy() detaches the result so
        # later column writes on it do not target a slice of the input frame.
        df = df[df[col].between(lower_limit, upper_limit)].copy()

    return df

# Cap out-of-fence values in every continuous column of the training frame.
# NOTE(review): only the training frame goes through handle_outliers here;
# confirm the test frame is meant to stay uncapped.
for col in continuous_features:
    concat_train_df = handle_outliers(df=concat_train_df, col=col, method='replace')

# handle skewness

In [None]:
def handle_skewness(df, col, method):
    """Apply a skew-reducing transform to ``df[col]`` and return ``df``.

    ``method='log'`` replaces the column with ``log1p`` of its values;
    ``method='boxcox'`` replaces it with the transformed values returned by
    ``scipy.stats.boxcox`` (the fitted lambda is discarded). Any other
    ``method`` leaves the frame untouched. The column is overwritten on the
    frame that was passed in.
    """
    if method == 'boxcox':
        transformed, _fitted_lambda = boxcox(df[col])
        df[col] = transformed
    elif method == 'log':
        df[col] = np.log1p(df[col])

    return df

# Apply log1p to the same columns of BOTH frames. Transforming only the training
# frame would save train and test features on different scales, so a model
# fitted on the train CSV would see differently scaled inputs in the test CSV.
for col in continuous_features:
    concat_train_df = handle_skewness(concat_train_df, col, 'log')
    concat_test_df = handle_skewness(concat_test_df, col, 'log')

# save data

In [None]:
# Write both frames to CSV. to_csv is called with its defaults, so the index is
# written as the first column of each file.
concat_train_df.to_csv('../data/preprocess_train.csv')
concat_test_df.to_csv('../data/preprocess_test.csv')

# save folds

In [5]:
def save_stratifiedkfolds(df, n_splits):
    """Compute stratified K-fold splits of ``df`` and dump them with joblib.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; must contain a ``'smoking'`` column, which is used as
        the stratification target.
    n_splits : int
        Number of folds; also used in the output file name.

    The splits are produced by ``StratifiedKFold(shuffle=True,
    random_state=SEED)`` and stored as a list of the pairs yielded by
    ``split`` in ``../data/fold/{n_splits}_stratifiedkfolds.jl``. Nothing is
    returned.
    """
    import os

    fold_path = f'../data/fold/{n_splits}_stratifiedkfolds.jl'
    # joblib.dump does not create missing directories; make sure the target
    # folder exists so a fresh checkout can run this cell.
    os.makedirs(os.path.dirname(fold_path), exist_ok=True)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    folds = list(splitter.split(df, df['smoking']))
    joblib.dump(folds, fold_path)

# Persist a 5-fold stratified split of the training frame.
save_stratifiedkfolds(df=concat_train_df, n_splits=5)