In [2]:
import pandas as pd
from pathlib import Path
import numpy as np

# データ読込
DATAPATH = Path('../input/playground-series-s3e2')
N_ESTIMATORS = 100_000

train = pd.read_csv(DATAPATH / 'train.csv')
test = pd.read_csv(DATAPATH / 'test.csv')
sample_sub = pd.read_csv(DATAPATH / 'sample_submission.csv')

train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [3]:
test.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,15304,Female,57.0,0,0,Yes,Private,Rural,82.54,33.4,Unknown
1,15305,Male,70.0,1,0,Yes,Private,Urban,72.06,28.5,Unknown
2,15306,Female,5.0,0,0,No,children,Urban,103.72,19.5,Unknown
3,15307,Female,56.0,0,0,Yes,Govt_job,Urban,69.24,41.4,smokes
4,15308,Male,32.0,0,0,Yes,Private,Rural,111.15,30.1,smokes


In [4]:
sample_sub.head()

Unnamed: 0,id,stroke
0,15304,0.041296
1,15305,0.041296
2,15306,0.041296
3,15307,0.041296
4,15308,0.041296


In [5]:
print(train.info())

train.describe().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15304 entries, 0 to 15303
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 15304 non-null  int64  
 1   gender             15304 non-null  object 
 2   age                15304 non-null  float64
 3   hypertension       15304 non-null  int64  
 4   heart_disease      15304 non-null  int64  
 5   ever_married       15304 non-null  object 
 6   work_type          15304 non-null  object 
 7   Residence_type     15304 non-null  object 
 8   avg_glucose_level  15304 non-null  float64
 9   bmi                15304 non-null  float64
 10  smoking_status     15304 non-null  object 
 11  stroke             15304 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 1.4+ MB
None


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,15304.0,7651.5,4418.028595,0.0,3825.75,7651.5,11477.25,15303.0
age,15304.0,41.417708,21.444673,0.08,26.0,43.0,57.0,82.0
hypertension,15304.0,0.049726,0.217384,0.0,0.0,0.0,0.0,1.0
heart_disease,15304.0,0.023327,0.150946,0.0,0.0,0.0,0.0,1.0
avg_glucose_level,15304.0,89.039853,25.476102,55.22,74.9,85.12,96.98,267.6
bmi,15304.0,28.112721,6.722315,10.3,23.5,27.6,32.0,80.1
stroke,15304.0,0.041296,0.198981,0.0,0.0,0.0,0.0,1.0


In [6]:
# label encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
columns = ['gender','ever_married','work_type','Residence_type','smoking_status']

print("==before==")
print(train.dtypes)
train.head()

for k in columns:
    train[k] = le.fit_transform(train[k])
    test[k] = le.transform(test[k])

print("==after==")
print(train.dtypes)
train.head()

==before==
id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object
==after==
id                     int64
gender                 int64
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int64
work_type              int64
Residence_type         int64
avg_glucose_level    float64
bmi                  float64
smoking_status         int64
stroke                 int64
dtype: object


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,1,28.0,0,0,1,2,1,79.53,31.1,2,0
1,1,1,33.0,0,0,1,2,0,78.44,23.9,1,0
2,2,0,42.0,0,0,1,2,0,103.0,40.3,0,0
3,3,1,56.0,0,0,1,2,1,64.87,28.8,2,0
4,4,0,24.0,0,0,0,2,0,73.36,28.8,2,0


In [7]:
## モデルの学習
from lightgbm.sklearn import LGBMRegressor
import lightgbm as lgbm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# 変数 (idと目的変数以外)
features = [              
    'gender',            
    'age',                 
    'hypertension',        
    'heart_disease',       
    'ever_married',        
    'work_type',          
    'Residence_type',      
    'avg_glucose_level',   
    'bmi',                 
    'smoking_status',      
]

# 予測する値
target = 'stroke'

In [8]:
clfs = []
rocs = []

# lgbmのパラメータ
# 設定値はSimple LightGBM Baselineのノートブックを流用
# https://www.kaggle.com/code/soupmonster/simple-lightgbm-baseline
params = {
    'lambda_l1': 1.945,
    'num_leaves': 87,
    'feature_fraction': 0.79,
    'bagging_fraction': 0.93,
    'bagging_freq': 4,
    'min_data_in_leaf': 103,
    'max_depth': 17,
}

print(train)
print("="*32)

# KFoldの設定
kf = KFold(n_splits=16, random_state=0, shuffle=True)

# KFoldの分割回数で繰り返し
#   学習用,テストそれぞれのデータのインデックス
for train_index, val_index in kf.split(train):
    X_train, X_val = train[features].loc[train_index], train[features].loc[val_index]
    y_train, y_val = train[target][train_index], train[target][val_index]
    
    # 回帰に使う分類器
    clf = LGBMRegressor(
        learning_rate=0.02,        # default = 0.1
        n_estimators=N_ESTIMATORS, # 決定木の数
        metric='rmse', # 評価指標 RMSE(二乗平均平方根誤差)
        **params
    )
    
    clf.fit(
        X_train.values,
        y_train,
        eval_set=[(X_val, y_val)], # 学習時に用いる検証用データ
        callbacks=[lgbm.early_stopping(85, verbose=False)]
    )
    
    clfs.append(clf)
    
    preds = clf.predict(X_val.values)
    roc = roc_auc_score(y_val, preds)
    print(f'roc: {roc}')
    rocs.append(roc)
    
print(f'mean rocs: {np.mean(rocs)}')

          id  gender   age  hypertension  heart_disease  ever_married  \
0          0       1  28.0             0              0             1   
1          1       1  33.0             0              0             1   
2          2       0  42.0             0              0             1   
3          3       1  56.0             0              0             1   
4          4       0  24.0             0              0             0   
...      ...     ...   ...           ...            ...           ...   
15299  15299       0  22.0             0              0             0   
15300  15300       0  46.0             1              0             1   
15301  15301       0  75.0             0              0             1   
15302  15302       1  46.0             0              0             1   
15303  15303       0  14.0             0              0             0   

       work_type  Residence_type  avg_glucose_level   bmi  smoking_status  \
0              2               1              

In [9]:
# アンサンブル 
# batboostモデル
from catboost import CatBoostRegressor

rocs = []
kf = KFold(n_splits=10, random_state=1, shuffle=True)
for train_index, val_index in kf.split(train):
    X_train, X_val = train[features].loc[train_index], train[features].loc[val_index]
    y_train, y_val = train[target][train_index], train[target][val_index]
    
    # CatBoostは勾配ブースティングの一つ
    clf = CatBoostRegressor(iterations=N_ESTIMATORS, loss_function='RMSE')
    
    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=1000,
        verbose=False
    )
    
    clfs.append(clf)
    
    preds = clf.predict(X_val.values)
    roc = roc_auc_score(y_val, preds)
    print(f'roc: {roc}')
    rocs.append(roc)
    
print(f'mean rocs: {np.mean(rocs)}')

roc: 0.9057606784569242
roc: 0.893446020633751
roc: 0.8863276458124233
roc: 0.8934935460174205
roc: 0.8752664563437926
roc: 0.8974374896506043
roc: 0.8750627321088026
roc: 0.8726467207534958
roc: 0.8870420624151967
roc: 0.8809655172413793
mean rocs: 0.886744886943379


In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

models = []
rocs = []

kf = KFold(n_splits=10, random_state=1, shuffle=True)
for train_index, val_index in kf.split(train):
    X_train, X_val = train[features].loc[train_index], train[features].loc[val_index]
    y_train, y_val = train[target][train_index], train[target][val_index]

    model = XGBClassifier(n_estimators=2023,learning_rate=0.01)
   
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
#                 early_stopping_rounds=100,
                verbose=False)
    
    models.append(model)
    
    y_pred = model.predict_proba(X_val)[:,1]
    roc_auc = roc_auc_score(y_val, y_pred)
    rocs.append(roc)
    
    print(f" roc_auc score: {roc_auc}")
    print("-"*50)

print(f'mean rocs: {np.mean(rocs)}')

 roc_auc score: 0.9035474195883753
--------------------------------------------------
 roc_auc score: 0.8679301768607222
--------------------------------------------------
 roc_auc score: 0.8626296420207428
--------------------------------------------------
 roc_auc score: 0.8679924441179557
--------------------------------------------------
 roc_auc score: 0.8713229024556616
--------------------------------------------------
 roc_auc score: 0.8818719986752773
--------------------------------------------------
 roc_auc score: 0.8697305028605842
--------------------------------------------------
 roc_auc score: 0.8514546529924448
--------------------------------------------------
 roc_auc score: 0.8888956193060671
--------------------------------------------------
 roc_auc score: 0.8841379310344828
--------------------------------------------------
mean rocs: 0.8809655172413793


In [11]:
# submission作成

test_preds = []

# 学習済の各モデルで予測
for clf in clfs:
    preds = clf.predict(test[features].values)
    test_preds.append(preds)

for model in models:
    preds = model.predict_proba(test[features])[:,1]
    test_preds.append(preds)

    

# 予測の平均
test_preds = np.stack(test_preds).mean(0)
test_preds

array([0.03034765, 0.17051565, 0.00126504, ..., 0.00112404, 0.00349267,
       0.00160158])

In [12]:
submission = pd.DataFrame(data={"id": test.id, "stroke": test_preds})
submission.head()

Unnamed: 0,id,stroke
0,15304,0.030348
1,15305,0.170516
2,15306,0.001265
3,15307,0.067381
4,15308,0.005523


In [13]:
submission.to_csv("submission.csv", index=False)