In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import initializers
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
import seaborn as sns

# Processing
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# NN 
from keras import regularizers
from keras.callbacks import LearningRateScheduler
from keras.models import Sequential
from tensorflow.keras import *
from keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.layers as tfl
import tensorflow as tf

# Classifier
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# CV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Feature selection
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory
# and print each one.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition train/test files and the additional `origin` dataset.
trainset = pd.read_csv('/kaggle/input/playground-series-s3e24/train.csv')
testset = pd.read_csv('/kaggle/input/playground-series-s3e24/test.csv')
origin = pd.read_csv('/kaggle/input/smoker-status-prediction-using-biosignals/train_dataset.csv')

# Keep the test ids for the submission file before the column is removed.
test_idx = testset['id']

# `id` is dropped from both frames so it is not used as a feature.
trainset = trainset.drop(columns='id')
testset = testset.drop(columns='id')

# Stack competition and origin rows, then discard exact duplicate rows.
total = pd.concat([trainset, origin], ignore_index=True).drop_duplicates()

In [None]:
# Remove outliers with IsolationForest.
# random_state is fixed so that the same rows are flagged on every run;
# without it the filtered data set (and every score below) changes per run.
model = IsolationForest(
    n_estimators=100,
    max_samples="auto",
    n_jobs=-1,
    contamination=0.01,
    random_state=42,
)

# Convert the frame once and reuse the array for fitting and scoring.
# NOTE(review): at this point `total` still contains the `smoking` label, so
# the label takes part in the outlier detection - confirm this is intended.
features = total.to_numpy()
model.fit(features)

score = model.decision_function(features)
anomaly = model.predict(features)
total["scores"] = score
total["anomaly"] = anomaly

# Keep only rows that were not predicted as -1. The explicit copy detaches the
# result from the unfiltered frame, so later modifications of `total` do not
# act on a slice of it.
total = total[total["anomaly"] != -1].copy()

In [None]:
# Separate the label from the features.
target = total['smoking']

# Remove the label and the two IsolationForest helper columns in one step.
# A new frame is assigned instead of dropping in place three times, because
# `total` was produced by boolean filtering and should not be mutated in place.
total = total.drop(columns=['smoking', 'scores', 'anomaly'])

In [None]:
# Preview the first rows of `total` after the label and helper columns were removed.
total.head()

In [None]:
# Class balance of the competition training set on its own.
# `trainset` still holds its `smoking` column, so it is plotted directly;
# passing `target` here would plot the labels of `total` instead.
sns.countplot(data=trainset, x='smoking');

In [None]:
# Class balance of the combined, outlier-filtered data (`target` was taken from `total`).
sns.countplot(data=total,x=target);

# **EDA**

In [None]:
# Descriptive statistics of the competition training set, one row per column,
# with the mean shown as a bar and std / median shaded by magnitude.
trainset_stats = trainset.describe().T
(
    trainset_stats.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[2])
    .background_gradient(subset=['std'], cmap='Blues')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# Descriptive statistics of the combined data, one row per column,
# with the mean shown as a bar and std / median shaded by magnitude.
total_stats = total.describe().T
(
    total_stats.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[2])
    .background_gradient(subset=['std'], cmap='Blues')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
def summary(df):
    """Build a per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``dtypes``,
        ``missing#`` (number of NaNs), ``missing%`` (NaNs divided by the
        number of rows, i.e. a fraction between 0 and 1), ``uniques``
        (distinct non-NaN values) and ``count`` (non-NaN values).
    """
    missing = df.isna().sum()

    report = df.dtypes.to_frame('dtypes')
    report['missing#'] = missing
    report['missing%'] = missing / len(df)
    report['uniques'] = df.nunique().to_numpy()
    report['count'] = df.count().to_numpy()
    return report

# Column overview of the combined data, shaded so larger values stand out.
summary(total).style.background_gradient(cmap='Blues')

**Some Info:**

hearing(left) / hearing(right) / dental caries -- 2 uniques

Urine protein -- 6 uniques

In [None]:
# Same column overview for the test set, for comparison with the combined data.
summary(testset).style.background_gradient(cmap='Blues')

# **Correlation**

In [None]:
# Group columns by their number of distinct values:
#   more than 10 -> treated as numeric (used for the correlation matrix)
#   exactly 6    -> one-hot encoded later
unique_counts = total.nunique()
num_var = unique_counts[unique_counts > 10].index.tolist()
ohe_var = unique_counts[unique_counts == 6].index.tolist()

corr_matrix = total[num_var].corr()
# Mask the upper triangle including the diagonal so each pair is shown once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(15, 12))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap='Blues',
    fmt='.2f',
    linewidths=1,
    square=True,
    annot_kws={"size": 9},
)
plt.title('Correlation Matrix', fontsize=15)
plt.show()

**Some Info:**

waist(cm) vs weight(kg)

relaxation vs systolic

LDL vs Cholesterol

In [None]:
# One scatter plot per feature pair listed above, coloured by the target.
scatter_pairs = [
    ('waist(cm)', 'weight(kg)'),
    ('relaxation', 'systolic'),
    ('LDL', 'Cholesterol'),
]

fig, axes = plt.subplots(3, figsize = (20,12))
for ax, (x_col, y_col) in zip(axes, scatter_pairs):
    sns.scatterplot(ax = ax, data = total, x = x_col, y = y_col, hue = target)

# **Feature Processing**

In [None]:
def preprocessing(df,ohe_cols):
    """One-hot encode ``ohe_cols`` and fill remaining NaNs with the column mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Input features.
    ohe_cols : list
        Names of the columns to replace by one-hot (dummy) columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame in which every column that contained NaNs has them
        replaced by that column's most frequent value. A column consisting
        only of NaNs has no mode and is left unchanged.
    """
    # One-hot encoding for the categorical columns.
    df = pd.get_dummies(df, columns = ohe_cols)

    cols_with_nan = df.columns[df.isna().any()].tolist()
    for feature in cols_with_nan:
        modes = df[feature].mode()
        if modes.empty:
            # All values are NaN, so there is nothing to impute with.
            continue
        # Assign the filled column back instead of calling
        # `df[feature].fillna(..., inplace=True)`: the in-place call on a
        # selected column is chained assignment and does not update `df`
        # when pandas copy-on-write is active.
        df[feature] = df[feature].fillna(modes.iloc[0])

    return df

total = preprocessing(total, ohe_var)

# Train and test are encoded independently, so a category level that occurs in
# only one of them would give the two frames different dummy columns. Align the
# test frame to the training columns (same set, same order); dummy columns that
# are missing in the test data are filled with 0.
testset = preprocessing(testset, ohe_var).reindex(columns=total.columns, fill_value=0)
total.head()

In [None]:
# Feature matrix and label vector used for model fitting.
X, Y = total, target

In [None]:
# Hold out 10% of the rows for validation; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

# **Model**

In [None]:
def caculate_f1(y_true, y_pred,model_name):
    """Print the micro-averaged F1 score of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    model_name : str
        Label used as prefix in the printed line.

    Returns
    -------
    None
        The score is only printed, not returned.
    """
    print( f'==>{model_name} Score is ==>', f1_score(y_true, y_pred, average = 'micro'))

def get_submission(prob_list,model_name, target_col='smoking', ids=None):
    """Build the submission frame and write it to ``{model_name}_submission.csv``.

    Parameters
    ----------
    prob_list : array-like
        Predicted values, one per test row.
    model_name : str
        Prefix of the output file name.
    target_col : str, default 'smoking'
        Name of the prediction column. The label of this data set is
        ``smoking``; the previously hard-coded ``'defects'`` did not match it.
    ids : array-like, optional
        Row identifiers for the ``id`` column. Defaults to the notebook-level
        ``test_idx``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written, with the columns ``id`` and ``target_col``.
    """
    if ids is None:
        ids = test_idx

    submission = pd.DataFrame({'id': ids, target_col: prob_list})
    submission.to_csv(f'{model_name}_submission.csv',index=False)
    print(f'Result:{model_name}_submission is saved!')

    return submission

In [None]:
# Keyword arguments for XGBClassifier.
params_xgb = dict(
    n_jobs=-1,
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='hist',
    verbosity=0,
    random_state=42,
)

# Keyword arguments for LGBMClassifier.
params_lgbm = dict(
    objective='binary',
    metric='logloss',
    boosting_type='gbdt',
    random_state=42,
    device="cpu",
)

# Keyword arguments for CatBoostClassifier.
params_cb = dict(
    grow_policy='Depthwise',
    bootstrap_type='Bayesian',
    od_type='Iter',
    eval_metric='AUC',
    loss_function='Logloss',
    random_state=42,
    task_type="cpu".upper(),
)

In [None]:
# Frame the final predictions are made on.
df_pred = testset
print(df_pred.shape)

# Per-model test-set probabilities; each list receives one Series below.
xgb_preds = list()
lgbm_preds = list()
cat_preds = list()

# The train and hold-out scores of every model are printed with an explicit
# "(validation)" / "(train)" suffix; with identical labels the two lines of a
# model could not be told apart in the output.

# --- XGBoost ---
xgb_md = XGBClassifier(**params_xgb)
xgb_md.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=10000)
xgb_pred = xgb_md.predict(X_train)    # predictions on the training split
xgb_pred2 = xgb_md.predict(X_test)    # predictions on the hold-out split

caculate_f1(Y_test, xgb_pred2, 'xgb (validation)')
caculate_f1(Y_train, xgb_pred, 'xgb (train)')

xgb_pred_test =  pd.Series(xgb_md.predict_proba(df_pred)[:, 1])
xgb_preds.append(xgb_pred_test)

# --- LightGBM ---
# Note: here the "2" suffix marks the training split, the opposite of the
# XGBoost variables above; the names are kept unchanged.
lgbm_md = LGBMClassifier(**params_lgbm).fit(X_train, Y_train)
lgbm_pred2 = lgbm_md.predict(X_train)  # predictions on the training split
lgbm_pred = lgbm_md.predict(X_test)    # predictions on the hold-out split

caculate_f1(Y_test, lgbm_pred, 'lgbm (validation)')
caculate_f1(Y_train, lgbm_pred2, 'lgbm (train)')

lgbm_pred_test = pd.Series(lgbm_md.predict_proba(df_pred)[:, 1])
lgbm_preds.append(lgbm_pred_test)

# --- CatBoost ---
cat_md = CatBoostClassifier(**params_cb).fit(X_train, Y_train)
cat_pred = cat_md.predict(X_test)      # predictions on the hold-out split
cat_pred2 = cat_md.predict(X_train)    # predictions on the training split

caculate_f1(Y_test, cat_pred, 'cat_m (validation)')
caculate_f1(Y_train, cat_pred2, 'cat_m (train)')

cat_pred_test = pd.Series(cat_md.predict_proba(df_pred)[:, 1])
cat_preds.append(cat_pred_test)

# --- Soft-voting ensemble ---
# The ensemble is fitted on the full data (X, Y), not only on the training
# split, and its positive-class probability is written as the submission.
eclf = VotingClassifier(
    estimators=[('lgbm', lgbm_md), ('cat', cat_md), ('xgb', xgb_md)],
    voting='soft'
)
eclf.fit(X, Y)
submission = get_submission(pd.Series(eclf.predict_proba(df_pred)[:, 1]), 'eclf')