In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

In [2]:
# Read both competition splits; the first CSV column (the id) becomes the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./_data/train.csv', './_data/test.csv')
)

In [3]:
from pycaret.classification import * 

* 결측(null)을 뜻하는 값(thal=0, ca=4)이 들어 있는 feature를 NaN으로 변환

In [4]:
# thal == 0 is treated as a missing-value marker; display the affected training rows.
train.loc[train['thal'].eq(0)]

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
130,53,0,2,128,216,0,0,115,0,0.0,2,0,0,1


In [5]:
# Same check on the test split: rows whose thal is the 0 placeholder.
test.loc[test['thal'].eq(0)]

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
47,52,1,0,128,204,1,1,156,1,1.0,1,0,0


In [6]:
# Replace the thal == 0 placeholder with NaN in both splits.
for split_df in (train, test):
    split_df.loc[split_df['thal'].eq(0), 'thal'] = np.nan

In [7]:
# ca == 4 is treated as a missing-value marker; display the affected training rows.
train.loc[train['ca'].eq(4)]

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


In [8]:
# Same check on the test split: rows whose ca is the 4 placeholder.
test.loc[test['ca'].eq(4)]

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
41,52,1,2,138,223,0,1,169,0,0.0,2,4,2.0
46,43,1,0,132,247,1,0,143,1,0.1,1,4,3.0
80,58,1,1,125,220,0,1,144,0,0.4,1,4,3.0
81,38,1,2,138,175,0,1,173,0,0.0,2,4,2.0
96,38,1,2,138,175,0,1,173,0,0.0,2,4,2.0


In [9]:
# Replace the ca == 4 placeholder with NaN in BOTH splits, mirroring the thal
# clean-up. The training split currently has no such rows, but handling it here
# keeps the two splits preprocessed identically if the data ever changes.
for split_df in (train, test):
    ca_is_placeholder = split_df['ca'].eq(4)
    # Skip splits without placeholder rows so their column is left untouched.
    if ca_is_placeholder.any():
        split_df.loc[ca_is_placeholder, 'ca'] = np.nan

In [10]:
# Number of NaN thal values: training split first, then test split.
for split_df in (train, test):
    print(split_df['thal'].isna().sum())

1
1


In [11]:
# Number of NaN ca values: training split first, then test split.
for split_df in (train, test):
    print(split_df['ca'].isna().sum())

0
5


# 모델 만들기

#### 사용할 데이터
* categorical_features : sex, cp, (exang), slope, ca, thal, restecg
* numeric_features : age, (trestbps), (thalach), (fbs), (oldpeak), (chol)

In [12]:
# Random seed; passed to PyCaret's setup() as session_id.
SEED = 32

In [13]:
# Alternative configuration (disabled): nothing ignored, every column assigned a type.
# ignore_features = []
# categorical_features = ['sex', 'cp', 'slope', 'ca','exang', 'restecg', 'thal']
# numeric_features = ['age', 'trestbps', 'thalach', 'fbs','oldpeak', 'chol']

In [15]:
# Columns excluded from the experiment.
ignore_features = [
    'fbs',
    'oldpeak',
    'chol',
    'restecg',
    'trestbps',
    'thalach',
]

# Columns declared as categorical.
categorical_features = [
    'sex',
    'cp',
    'slope',
    'ca',
    'thal',
    'exang',
]

# Columns declared as numeric.
numeric_features = [
    'age',
]

In [18]:
# PyCaret experiment configuration, grouped by concern and unpacked into setup().
setup_options = dict(
    # --- data and column roles ---
    data=train,
    target='target',
    ignore_features=ignore_features,
    categorical_features=categorical_features,
    numeric_features=numeric_features,
    # --- missing-value handling ---
    imputation_type='iterative',
    iterative_imputation_iters=10,
    categorical_imputation='mode',
    categorical_iterative_imputer='xgboost',
    # --- preprocessing / feature engineering ---
    remove_outliers=True,          # outliers_threshold left unset (0.1 was tried)
    normalize=True,
    normalize_method='zscore',     # other options: minmax, maxabs, robust
    feature_selection=True,        # feature_selection_threshold left unset (0.9 was tried)
    trigonometry_features=True,
    # --- run control ---
    session_id=SEED,
    silent=True,                   # profile=True left disabled
)
clf = setup(**setup_options)

Unnamed: 0,Description,Value
0,session_id,32
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(151, 14)"
5,Missing Values,True
6,Numeric Features,1
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,False


In [19]:
# Rank candidate models by F1 (fold=5) and keep the top three.
best_models = compare_models(
    fold=5,
    sort='f1',
    n_select=3,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8,1.0,1.0,0.7143,0.8333,0.6,0.6547
1,0.9,1.0,1.0,0.8333,0.9091,0.8,0.8165
2,0.6,0.76,0.4,0.6667,0.5,0.2,0.2182
3,0.9,1.0,0.8,1.0,0.8889,0.8,0.8165
4,0.9,1.0,0.8,1.0,0.8889,0.8,0.8165
5,0.9,1.0,0.8333,1.0,0.9091,0.8,0.8165
6,0.6,0.875,0.8333,0.625,0.7143,0.0909,0.1021
7,0.9,1.0,0.8333,1.0,0.9091,0.8,0.8165
8,0.8,0.9167,0.8333,0.8333,0.8333,0.5833,0.5833
9,0.8889,1.0,1.0,0.8333,0.9091,0.7692,0.7906


In [None]:
# Tune each shortlisted model for F1, the metric this notebook ranks by
# (compare_models sort='f1') and selects by (automl optimize='F1').
# Without an explicit `optimize`, tune_model tunes for its default metric
# (Accuracy in PyCaret 2.x) -- a different objective from the rest of the pipeline.
tuned_top3 = [tune_model(candidate, optimize='F1') for candidate in best_models]

# Train a blended and a stacked ensemble on top of the tuned models.
blender = blend_models(tuned_top3)
stacker = stack_models(tuned_top3)

# NOTE: despite its name, best_auc_model is chosen by F1, not AUC. The name is
# kept so any other cell that refers to it keeps working.
best_auc_model = automl(optimize='F1')

# Produce the final model from the selected estimator.
finalized_model = finalize_model(best_auc_model)

In [20]:
# Score the test split with the finalized model.
predictions = predict_model(estimator=finalized_model, data=test)

# Per-column non-null counts among the rows predicted as class 1.
predicted_positive = predictions['Label'].eq(1)
predictions.loc[predicted_positive].count()

age         78
sex         78
cp          78
trestbps    78
chol        78
fbs         78
restecg     78
thalach     78
exang       78
oldpeak     78
slope       78
ca          74
thal        78
Label       78
Score       78
dtype: int64

## SCORE 0.8275862069

In [None]:
# Build the submission frame here: `submission` is not created anywhere earlier
# in the notebook, so assigning into it fails with NameError on a fresh kernel.
# Labels are taken positionally and re-indexed with the test ids, so the result
# does not depend on which index predict_model returns.
# NOTE(review): assumes predict_model keeps the rows in the same order as `test`.
submission = pd.DataFrame(
    {'target': predictions['Label'].to_numpy()},
    index=test.index,
)
submission

In [None]:
# Persist the submission frame as CSV next to the notebook.
submission.to_csv(path_or_buf='./submission9.csv')