In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from collections import Counter
from imblearn.datasets import make_imbalance
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.metrics import roc_auc_score,classification_report, recall_score, f1_score,precision_recall_curve, auc,precision_score

In [2]:
# !pip install imbalanced-learn

In [3]:
df = pd.read_csv('Data_preprocessing(dibev1수정).csv')

In [4]:
df.head()

Unnamed: 0,id,AGE,ALCSTAT,ARTH1,BMI,CHLEV,CIGAREV2,CPLROU,EPHEV,FSBALANC,...,INTIL2W,MRACBPI2,MRACRPI2,REGION,SMKSTAT2,TIRED_1,PREG,HEIGHT(cm),WEIGHT(kg),DIBEV1
0,100011,79.0,0.0,1.0,23.594147,1.0,0.0,1.0,0.0,0,...,0.0,1.0,1.0,3.0,0.0,1.0,1,157.48,58.513368,1
1,100061,37.0,0.0,0.0,32.775484,1.0,1.0,0.0,0.0,0,...,0.0,1.0,1.0,3.0,0.0,0.0,1,180.34,106.59412,0
2,100091,75.0,0.0,1.0,22.273536,1.0,1.0,1.0,0.0,1,...,0.0,7.0,11.0,3.0,0.0,0.0,1,167.64,62.595696,0
3,1000101,39.0,0.0,0.0,23.709925,0.0,0.0,1.0,0.0,0,...,0.0,1.0,1.0,1.0,0.0,1.0,1,180.34,77.11064,0
4,1000131,54.0,1.0,0.0,38.273579,1.0,0.0,1.0,0.0,0,...,0.0,2.0,2.0,3.0,0.0,1.0,1,165.1,104.32616,0


In [5]:
for col in df.columns:
    print(f'{col}의 고유값 개수 {df[col].nunique()}')

id의 고유값 개수 23524
AGE의 고유값 개수 74
ALCSTAT의 고유값 개수 2
ARTH1의 고유값 개수 2
BMI의 고유값 개수 2325
CHLEV의 고유값 개수 2
CIGAREV2의 고유값 개수 2
CPLROU의 고유값 개수 2
EPHEV의 고유값 개수 2
FSBALANC의 고유값 개수 2
GENDER의 고유값 개수 2
HISPAN_I의 고유값 개수 10
HYPEV의 고유값 개수 2
HYPMDEV2의 고유값 개수 2
HYPMED2의 고유값 개수 2
INTIL2W의 고유값 개수 2
MRACBPI2의 고유값 개수 8
MRACRPI2의 고유값 개수 9
REGION의 고유값 개수 4
SMKSTAT2의 고유값 개수 2
TIRED_1의 고유값 개수 2
PREG의 고유값 개수 2
HEIGHT(cm)의 고유값 개수 25
WEIGHT(kg)의 고유값 개수 216
DIBEV1의 고유값 개수 2


# 1. 데이터 오버 샘플링

In [6]:
# 원본 데이터를 학습 / 테스트로 분리 / 9:1
X_tr, X_test, y_tr, y_test = train_test_split(df.iloc[:,:-1],df['DIBEV1'],random_state=123,test_size=0.1,shuffle=True,stratify=df['DIBEV1'])

In [7]:
y_tr = y_tr.astype('int')

In [8]:
# 분할된 데이터 셋 사이즈 확인
X_tr.shape, X_test.shape, y_tr.shape, y_test.shape

((21171, 24), (2353, 24), (21171,), (2353,))

In [9]:
# target 데이터 비율 확인
print(f'학습데이터 {Counter(y_tr)}')
print(f'테스트데이터 {Counter(y_test)}')

학습데이터 Counter({0: 18252, 1: 2919})
테스트데이터 Counter({0: 2029, 1: 324})


## 1-1. 수치형 데이터 스케일링

In [10]:
num_col = ['AGE','BMI','HEIGHT(cm)','WEIGHT(kg)']

In [11]:
scaler = StandardScaler()

for col in num_col:
    X_tr[col] = scaler.fit_transform(X_tr[[col]])
    X_test[col] = scaler.transform(X_test[[col]])

## 1-2. unique 개수가 3개 이상인 컬럼들 onehotencoding

In [12]:
# 데이터프레임 전체 컬럼에서 수치형 컬럼 제외
cols = np.setdiff1d(df.columns,num_col)

# 위 컬럼에서 고유값 개수가 3개 이상인 컬럼만 추출
nom_col = []
for col in cols:
    if df[col].nunique() >= 3:
        nom_col.append(col)

In [13]:
nom_col

['HISPAN_I', 'MRACBPI2', 'MRACRPI2', 'REGION', 'id']

In [14]:
# 위 명목형 컬럼들에서 id 컬럼 제외
nom_col.remove('id')
nom_col

['HISPAN_I', 'MRACBPI2', 'MRACRPI2', 'REGION']

In [15]:
# 명목형 컬럼들에 대한 dummy 데이터 생성(원핫인코딩)
train_dummies = []
test_dummies = []
for col in nom_col:
    train_dummies.append(pd.get_dummies(X_tr[col],prefix=col,dummy_na=True,dtype='float'))
    test_dummies.append(pd.get_dummies(X_test[col],prefix=col,dummy_na=True,dtype='float'))

In [16]:
train_dummies = pd.concat(train_dummies,axis=1)
test_dummies = pd.concat(test_dummies,axis=1)

In [17]:
train_dummies.shape, test_dummies.shape

((21171, 35), (2353, 35))

In [18]:
# 만약 고유값 개수 차이로 인해 학습셋과 테스트셋의 더미 데이터셋 컬럼 차이가 있다면 컬럼수가 적은 쪽으로 컬럼 재지정

if train_dummies.shape[1] > test_dummies.shape[1]:
    train_dummies = train_dummies[test_dummies.columns]
elif train_dummies.shape[1] == test_dummies.shape[1]:
    pass
else:
    test_dummies = test_dummies[train_dummies.columns]

In [19]:
train_dummies.shape, test_dummies.shape

((21171, 35), (2353, 35))

In [20]:
# 원본의 학습,테스트셋에 더미데이터셋 합친 후 기존 명목형 컬럼 제거
X_tr = pd.concat([X_tr,train_dummies],axis=1).drop(nom_col,axis=1)
X_test = pd.concat([X_test,test_dummies],axis=1).drop(nom_col,axis=1)

In [21]:
X_tr.shape, X_test.shape

((21171, 55), (2353, 55))

In [22]:
X_tr.isnull().sum()

id               0
AGE              0
ALCSTAT          0
ARTH1            0
BMI              0
CHLEV            0
CIGAREV2         0
CPLROU           0
EPHEV            0
FSBALANC         0
GENDER           0
HYPEV            0
HYPMDEV2         0
HYPMED2          0
INTIL2W          0
SMKSTAT2         0
TIRED_1          0
PREG             0
HEIGHT(cm)       0
WEIGHT(kg)       0
HISPAN_I_0.0     0
HISPAN_I_1.0     0
HISPAN_I_2.0     0
HISPAN_I_3.0     0
HISPAN_I_4.0     0
HISPAN_I_5.0     0
HISPAN_I_6.0     0
HISPAN_I_7.0     0
HISPAN_I_8.0     0
HISPAN_I_12.0    0
HISPAN_I_nan     0
MRACBPI2_1.0     0
MRACBPI2_2.0     0
MRACBPI2_3.0     0
MRACBPI2_6.0     0
MRACBPI2_7.0     0
MRACBPI2_12.0    0
MRACBPI2_16.0    0
MRACBPI2_17.0    0
MRACBPI2_nan     0
MRACRPI2_1.0     0
MRACRPI2_2.0     0
MRACRPI2_3.0     0
MRACRPI2_9.0     0
MRACRPI2_10.0    0
MRACRPI2_11.0    0
MRACRPI2_15.0    0
MRACRPI2_16.0    0
MRACRPI2_17.0    0
MRACRPI2_nan     0
REGION_1.0       0
REGION_2.0       0
REGION_3.0  

In [23]:
X_test.isnull().sum()

id               0
AGE              0
ALCSTAT          0
ARTH1            0
BMI              0
CHLEV            0
CIGAREV2         0
CPLROU           0
EPHEV            0
FSBALANC         0
GENDER           0
HYPEV            0
HYPMDEV2         0
HYPMED2          0
INTIL2W          0
SMKSTAT2         0
TIRED_1          0
PREG             0
HEIGHT(cm)       0
WEIGHT(kg)       0
HISPAN_I_0.0     0
HISPAN_I_1.0     0
HISPAN_I_2.0     0
HISPAN_I_3.0     0
HISPAN_I_4.0     0
HISPAN_I_5.0     0
HISPAN_I_6.0     0
HISPAN_I_7.0     0
HISPAN_I_8.0     0
HISPAN_I_12.0    0
HISPAN_I_nan     0
MRACBPI2_1.0     0
MRACBPI2_2.0     0
MRACBPI2_3.0     0
MRACBPI2_6.0     0
MRACBPI2_7.0     0
MRACBPI2_12.0    0
MRACBPI2_16.0    0
MRACBPI2_17.0    0
MRACBPI2_nan     0
MRACRPI2_1.0     0
MRACRPI2_2.0     0
MRACRPI2_3.0     0
MRACRPI2_9.0     0
MRACRPI2_10.0    0
MRACRPI2_11.0    0
MRACRPI2_15.0    0
MRACRPI2_16.0    0
MRACRPI2_17.0    0
MRACRPI2_nan     0
REGION_1.0       0
REGION_2.0       0
REGION_3.0  

## 1-3 오버샘플링 전 학습 및 평가

In [24]:
# 학습 전 학습데이터를 학습/검증으로 분리
X_tr_1, X_val, y_tr_1, y_val = train_test_split(X_tr,y_tr,random_state=123,test_size=0.1,shuffle=True,stratify=y_tr)

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=123, max_depth=5, n_estimators=500,n_jobs=5)
rf.fit(X_tr_1,y_tr_1)

In [26]:
pred_prob = rf.predict_proba(X_val)
pred = rf.predict(X_val)

In [27]:
from sklearn.metrics import roc_auc_score,classification_report
roc_auc_score(y_val, pred_prob[:,1])

0.8158149409593543

In [28]:
Counter(pred)

Counter({0: 2117, 1: 1})

In [29]:
Counter(y_val)

Counter({0: 1826, 1: 292})

In [30]:
print(classification_report(y_val,pred))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93      1826
           1       1.00      0.00      0.01       292

    accuracy                           0.86      2118
   macro avg       0.93      0.50      0.47      2118
weighted avg       0.88      0.86      0.80      2118



1-3-1. 샘플링 없이 학습할 경우 정밀도, f1_score, 정확도 등은 높으나 재현율이 아예 0으로 실제 당뇨병 환자를 당뇨병이 아니라고 오분류 하고 있음

## 1-4 오버샘플링 후 학습

In [31]:
smote = SMOTE()
X_smote,y_smote = smote.fit_resample(X_tr,y_tr)

In [32]:
X_smote.shape, y_smote.shape

((36504, 55), (36504,))

In [33]:
print(f'원본 타겟데이터 비율 {Counter(y_tr)}')
print(f'오버샘플링 타겟데이터 비율 {Counter(y_smote)}')

원본 타겟데이터 비율 Counter({0: 18252, 1: 2919})
오버샘플링 타겟데이터 비율 Counter({0: 18252, 1: 18252})


In [34]:
# 학습 전 학습데이터를 학습/검증으로 분리
X_tr_2, X_val, y_tr_2, y_val = train_test_split(X_smote,y_smote,random_state=123,test_size=0.1,shuffle=True,stratify=y_smote)

In [35]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=123, max_depth=5, n_estimators=500,n_jobs=5)
rf.fit(X_tr_2,y_tr_2)

In [36]:
pred_prob = rf.predict_proba(X_val)
pred = rf.predict(X_val)

In [37]:
roc_auc_score(y_val, pred_prob[:,1])

0.9502433644915902

In [38]:
Counter(pred)

Counter({0: 1782, 1: 1869})

In [39]:
Counter(y_val)

Counter({0: 1826, 1: 1825})

In [40]:
print(classification_report(y_val,pred))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      1826
           1       0.85      0.87      0.86      1825

    accuracy                           0.86      3651
   macro avg       0.86      0.86      0.86      3651
weighted avg       0.86      0.86      0.86      3651



In [41]:
pred_prob = rf.predict_proba(X_test)
pred = rf.predict(X_test)

In [42]:
roc_auc_score(y_test, pred_prob[:,1])

0.7935483331203719

In [43]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88      2029
           1       0.36      0.56      0.44       324

    accuracy                           0.80      2353
   macro avg       0.64      0.70      0.66      2353
weighted avg       0.85      0.80      0.82      2353



In [44]:
print(f'True data 비율 {Counter(y_test)}')
print(f'예측 data 비율 {Counter(pred)}')      

True data 비율 Counter({0: 2029, 1: 324})
예측 data 비율 Counter({0: 1850, 1: 503})


In [45]:
precision_score(y_test,pred)

0.36182902584493043

In [46]:
recall_score(y_test,pred)

0.5617283950617284

In [47]:
f1_score(y_test,pred)

0.44014510278113667

In [48]:
precision, recall, _ = precision_recall_curve(y_test, pred_prob[:,1])
pr_auc = auc(recall, precision)
pr_auc

0.3597567679603712

## 1-4. 모델 선정 및 하이퍼파라미터 최적화

모델 : RandomForestClassifier / XGBClassifier / XGBRFClassifier / LGBM / LogisticRegressor

하이퍼파라미터 최적화 툴 : Optuna, Pycaret, auto-sklearn

# 2. 데이터 언더샘플링

## 2-1. 데이터셋 언더샘플링 분할

In [49]:
df['DIBEV1'].value_counts()

DIBEV1
0    20281
1     3243
Name: count, dtype: int64

In [50]:
df_0 = df[df['DIBEV1']==0].copy()
df_1 = df[df['DIBEV1']==1].copy()

In [51]:
def data_sampling(X,n_set):
    '''비복원 데이터 샘플링'''
    np.random.seed(123)
    n_sets = n_set  # 만들고자하는 데이터세트 수
    set_size = round(X.shape[0]/n_set)  # 각 세트의 크기
    X_index = X.index  # 데이터프레임의 인덱스

    data_set = []  # X 데이터의 여러 세트를 저장할 리스트

    for _ in range(n_sets - 1):
        sampled_index = np.random.choice(X_index, size=set_size, replace=False)
        X_index = np.setdiff1d(X_index, sampled_index)  # 추출된 인덱스를 제외합니다.
        data_set.append(X.loc[sampled_index])
    
    data_set.append(X.loc[X_index])
    return data_set

In [52]:
data_set = data_sampling(df_0,9)

In [53]:
data_set[0].shape , data_set[-1].shape

((2253, 25), (2257, 25))

In [54]:
for i,data in enumerate(data_set,start=1):
    globals()[f'udf_{i}'] = pd.concat([data,df_1],axis=0)

In [55]:
# 언더샘플링 1~8을 학습용 / 마지막 udf_9를 검증용으로 사용
udf_set = [udf_1,udf_2,udf_3,udf_4,udf_5,udf_6,udf_7,udf_8]
# uff_9 <- 검증용으로 사용

In [56]:
# 각 데이터셋 타겟 비율 확인
for i,data in enumerate(udf_set+[udf_9]):
    print(f'{i+1}번째 데이터셋 타겟 비율 : {Counter(data.iloc[:,-1])}')

1번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
2번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
3번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
4번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
5번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
6번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
7번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
8번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
9번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2257})


## 2-1. 수치형 데이터 스케일링

In [57]:
total_set = pd.concat(udf_set,axis=0)

In [58]:
num_col = ['AGE','BMI','HEIGHT(cm)','WEIGHT(kg)']

In [59]:
for col in num_col:
    ss = StandardScaler()
    ss.fit(total_set[[col]])
    for udf in udf_set:
        udf[col] = ss.transform(udf[[col]])
udf_9[col] = ss.transform(udf_9[[col]])

## 2-2. unique 개수가 3개 이상인 컬럼들 onehotencoding

In [60]:
# 데이터프레임 전체 컬럼에서 수치형 컬럼 제외
cols = np.setdiff1d(df.columns,num_col)

# 위 컬럼에서 고유값 개수가 3개 이상인 컬럼만 추출
nom_col = []
for col in cols:
    if df[col].nunique() >= 3:
        nom_col.append(col)

In [61]:
nom_col

['HISPAN_I', 'MRACBPI2', 'MRACRPI2', 'REGION', 'id']

In [62]:
# 위 명목형 컬럼들에서 id 컬럼 제외
nom_col.remove('id')
nom_col

['HISPAN_I', 'MRACBPI2', 'MRACRPI2', 'REGION']

In [63]:
# 명목형 컬럼들에 대한 dummy 데이터 생성(원핫인코딩)
train_dummies = []
test_dummies = []
for col in nom_col:
    train_dummies.append(pd.get_dummies(total_set[col],prefix=col,dummy_na=True,dtype='float'))
    test_dummies.append(pd.get_dummies(udf_9[col],prefix=col,dummy_na=True,dtype='float'))

In [64]:
train_dummies = pd.concat(train_dummies,axis=1)
test_dummies = pd.concat(test_dummies,axis=1)

In [65]:
train_dummies.shape, test_dummies.shape

((43968, 35), (5500, 35))

In [66]:
# 만약 고유값 개수 차이로 인해 학습셋과 테스트셋의 더미 데이터셋 컬럼 차이가 있다면 컬럼수가 적은 쪽으로 컬럼 재지정

if train_dummies.shape[1] > test_dummies.shape[1]:
    train_dummies = train_dummies[test_dummies.columns]
elif train_dummies.shape[1] == test_dummies.shape[1]:
    pass
else:
    test_dummies = test_dummies[train_dummies.columns]

In [67]:
train_dummies.shape, test_dummies.shape

((43968, 35), (5500, 35))

In [74]:
udf_1.shape

(5496, 25)

In [69]:
udf_1['DIBEV1'].value_counts()

DIBEV1
1    3243
0    2253
Name: count, dtype: int64

In [70]:
total_set = pd.concat(udf_set,axis=0)

In [75]:
total_set['DIBEV1'][:5497].sum()

3243

In [76]:
# 원본의 학습,테스트셋에 더미데이터셋 합친 후 기존 명목형 컬럼 제거
total_set = pd.concat([total_set,train_dummies],axis=1).drop(nom_col,axis=1)
udf_9 = pd.concat([udf_9,test_dummies],axis=1).drop(nom_col,axis=1)

In [78]:
udf_1.shape, total_set.shape

((5496, 25), (43968, 56))

In [88]:
total_set.iloc[:5496]

Unnamed: 0,id,AGE,ALCSTAT,ARTH1,BMI,CHLEV,CIGAREV2,CPLROU,EPHEV,FSBALANC,...,MRACRPI2_11.0,MRACRPI2_15.0,MRACRPI2_16.0,MRACRPI2_17.0,MRACRPI2_nan,REGION_1.0,REGION_2.0,REGION_3.0,REGION_4.0,REGION_nan
5498,1000123561,-0.750579,0.0,0.0,-0.692189,0.0,0.0,1.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
18663,1000440471,0.328700,0.0,1.0,0.622534,0.0,0.0,1.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12513,1000288831,1.578391,0.0,0.0,-1.287723,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
20029,1000473161,-1.773053,1.0,0.0,-1.499974,0.0,0.0,1.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4253,100095441,0.215092,0.0,1.0,-1.608091,0.0,0.0,1.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23506,1000555271,0.442308,0.0,0.0,0.460561,1.0,0.0,1.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
23507,1000555281,1.010350,1.0,0.0,-0.204676,1.0,0.0,1.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23509,1000555341,-0.012125,0.0,0.0,-1.309706,1.0,0.0,1.0,0.0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
23513,1000555411,1.010350,0.0,1.0,3.090395,1.0,0.0,1.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [105]:
for i in range(8):
    globals()[f'udf_{i+1}'] = total_set.iloc[5496*i:5496*(i+1)]

In [108]:
udf_set = [udf_1,udf_2,udf_3,udf_4,udf_5,udf_6,udf_7,udf_8]

In [111]:
model_list = []
for i in range(8):
    model_list.append(RandomForestClassifier(random_state=123,max_depth=5,n_estimators=500,n_jobs=5))

In [115]:
for data,model in zip(udf_set,model_list):
    model.fit(data.drop(['id','DIBEV1'],axis=1),data['DIBEV1'])

In [121]:
def ensemble_predictions(models, X):
    predictions = [model.predict_proba(X) for model in models]
    return np.mean(predictions, axis=0)

In [122]:
X_test = udf_9.drop(['id','DIBEV1'],axis=1)
y_test = udf_9['DIBEV1']

In [124]:
prob = ensemble_predictions(model_list,X_test)

In [178]:
prob

array([[0.23379732, 0.76620268],
       [0.55105736, 0.44894264],
       [0.59273364, 0.40726636],
       ...,
       [0.2096523 , 0.7903477 ],
       [0.1577153 , 0.8422847 ],
       [0.23385602, 0.76614398]])

In [152]:
roc_auc_score(y_test,prob[:,1])

0.8019313197123664

In [188]:
for j in range(20):
    j = j*0.01 + 0.5
    print(j)
    result = [0 if i < j else 1 for i in prob[:,1]]
    print(f'''recall:{recall_score(y_test,result)}
precision:{precision_score(y_test,result)}
f1_score:{f1_score(y_test,result)}''')
    print('---'*10)

0.5
recall:0.900709219858156
    precision:0.7214126944924673
    f1_score:0.8011519473395502
------------------------------
0.51
recall:0.8840579710144928
    precision:0.7295165394402036
    f1_score:0.7993865885961244
------------------------------
0.52
recall:0.8711069996916435
    precision:0.737597911227154
    f1_score:0.7988123851265374
------------------------------
0.53
recall:0.8603145235892692
    precision:0.7434052757793765
    f1_score:0.797598627787307
------------------------------
0.54
recall:0.8532223250077089
    precision:0.7494582881906826
    f1_score:0.7979812545061283
------------------------------
0.55
recall:0.8424298489053346
    precision:0.7561583171879325
    f1_score:0.7969661610268378
------------------------------
0.56
recall:0.8307123034227567
    precision:0.7612319864368465
    f1_score:0.7944559127101151
------------------------------
0.5700000000000001
recall:0.8189947579401788
    precision:0.7656385125396368
    f1_score:0.7914183551847437
-----

In [180]:
recall:0.8424298489053346
precision:0.7561583171879325
f1_score:0.7969661610268378

0.8424298489053346

In [181]:
precision_score(y_test,result)

0.7561583171879325

In [182]:
f1_score(y_test,result)

0.7969661610268378

In [177]:
print(classification_report(y_test,result))

              precision    recall  f1-score   support

           0       0.68      0.69      0.68      2257
           1       0.78      0.77      0.78      3243

    accuracy                           0.74      5500
   macro avg       0.73      0.73      0.73      5500
weighted avg       0.74      0.74      0.74      5500

