In [49]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib as mpl
import seaborn as sns
mpl.rcParams['figure.dpi'] = 400

plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

In [78]:
df_over = pd.read_csv('preprocessed/data_preprocessed_over.csv') # over_sampling
df_under = pd.read_csv('preprocessed/data_preprocessed_under.csv') # under_sampling
df = pd.read_csv('preprocessed/data_preprocessed.csv')

In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1959728 entries, 0 to 1959727
Data columns (total 19 columns):
 #   Column    Dtype  
---  ------    -----  
 0   성별코드      int64  
 1   연령대코드     int64  
 2   허리둘레      float64
 3   식전혈당      float64
 4   총콜레스테롤    float64
 5   트리글리세라이드  float64
 6   HDL콜레스테롤  float64
 7   LDL콜레스테롤  float64
 8   혈색소       float64
 9   요단백       float64
 10  혈청크레아티닌   float64
 11  AST       float64
 12  ALT       float64
 13  감마지티피     float64
 14  흡연상태      float64
 15  음주여부      float64
 16  복부비만      float64
 17  비만여부      float64
 18  고혈압       float64
dtypes: float64(17), int64(2)
memory usage: 284.1 MB


In [108]:
print(df['식전혈당'].value_counts())

1.0    924629
0.0    800762
2.0    234337
Name: 식전혈당, dtype: int64


In [152]:
df = df[['성별코드', '연령대코드', '허리둘레', '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤',
       'LDL콜레스테롤', '혈색소', '요단백', '혈청크레아티닌', 'AST', 'ALT', '감마지티피', '흡연상태',
       '음주여부', '복부비만', '비만여부', '고혈압', '식전혈당']]

In [155]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [161]:
features_response = df.columns.tolist()

In [162]:
features_response

['성별코드',
 '연령대코드',
 '허리둘레',
 '총콜레스테롤',
 '트리글리세라이드',
 'HDL콜레스테롤',
 'LDL콜레스테롤',
 '혈색소',
 '요단백',
 '혈청크레아티닌',
 'AST',
 'ALT',
 '감마지티피',
 '흡연상태',
 '음주여부',
 '복부비만',
 '비만여부',
 '고혈압',
 '식전혈당']

In [163]:
X = df[features_response].iloc[:,:-1].values
y = df[features_response].iloc[:,-1].values
print(X.shape, y.shape)

(1959728, 18) (1959728,)


In [164]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [165]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectPercentile

[f_stat, f_p_value] = f_classif(X,y)
f_test_df = pd.DataFrame({'Feature':features_response[:-1],
                         'F statistic':f_stat,
                         'p value':f_p_value})
f_test_df.sort_values('p value')

Unnamed: 0,Feature,F statistic,p value
0,성별코드,18316.998555,0.0
15,복부비만,26337.532092,0.0
13,흡연상태,8302.089879,0.0
12,감마지티피,29180.207481,0.0
11,ALT,22609.674985,0.0
10,AST,12537.064122,0.0
9,혈청크레아티닌,2893.691363,0.0
16,비만여부,26999.376762,0.0
8,요단백,12037.704349,0.0
6,LDL콜레스테롤,7653.538764,0.0


In [166]:
from sklearn.feature_selection import SelectPercentile

selector = SelectPercentile(f_classif, percentile=25)
selector.fit(X1,y1)
best_feature_ix = selector.get_support()
best_feature_ix

array([False,  True,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False,  True, False,  True])

In [167]:
features = features_response1[:-1]

In [168]:
best_features = [features[counter] for counter in range(len(features))
                if best_feature_ix[counter]]

In [169]:
best_features

['연령대코드', '허리둘레', '트리글리세라이드', '복부비만', '고혈압']

In [170]:
diabetes_rate = df['식전혈당'].mean()
diabetes_rate

0.7109675424344603

In [171]:
from sklearn.model_selection import train_test_split

In [175]:
X = df[best_features].values
y = df['식전혈당'].values

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24, stratify=y)

In [142]:
from collections import Counter

In [143]:
Counter(y)

Counter({1.0: 924629, 0.0: 800762, 2.0: 234337})

In [176]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    C=1, class_weight='balanced', random_state=24, multi_class='multinomial',
    n_jobs=-1, solver='lbfgs').fit(X_train, y_train)

In [177]:
print('학습결과: ', model.score(X_train, y_train))
print('테스트결과: ', model.score(X_test, y_test))

학습결과:  0.41560869671709544
테스트결과:  0.415124956116359


In [178]:
from sklearn.metrics import classification_report
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
print('classification_repot - 학습데이터')
print(classification_report(y_train, pred_train))

classification_repot - 학습데이터
              precision    recall  f1-score   support

         0.0       0.53      0.60      0.56    600571
         1.0       0.53      0.22      0.31    693472
         2.0       0.20      0.56      0.29    175753

    accuracy                           0.42   1469796
   macro avg       0.42      0.46      0.39   1469796
weighted avg       0.49      0.42      0.41   1469796



In [179]:
print('classification_report - 테스트데이터')
print(classification_report(y_test, pred_test))

classification_report - 테스트데이터
              precision    recall  f1-score   support

         0.0       0.53      0.60      0.57    200191
         1.0       0.53      0.22      0.31    231157
         2.0       0.19      0.56      0.29     58584

    accuracy                           0.42    489932
   macro avg       0.42      0.46      0.39    489932
weighted avg       0.49      0.42      0.41    489932



데이터 정규분포 스케일링 적용

In [183]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(X)

In [209]:
X_train , X_test, y_train , y_test = train_test_split(data_scaled, y, test_size=0.25, random_state=0, stratify=y)

In [211]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    C=0.1, class_weight='balanced', random_state=0, multi_class='multinomial',
    n_jobs=-1, solver='lbfgs').fit(X_train, y_train)

In [212]:
print('학습결과: ', model.score(X_train, y_train))
print('테스트결과: ', model.score(X_test, y_test))

학습결과:  0.4295126670640007
테스트결과:  0.4294718450723774


In [219]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

In [222]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
print('classification_repot - 학습데이터')
print(classification_report(y_train, pred_train))

classification_repot - 학습데이터
              precision    recall  f1-score   support

         0.0       0.56      0.61      0.58    600571
         1.0       0.54      0.23      0.32    693472
         2.0       0.21      0.62      0.31    175753

    accuracy                           0.43   1469796
   macro avg       0.44      0.49      0.40   1469796
weighted avg       0.51      0.43      0.43   1469796



In [225]:
print('classification_report - 테스트데이터')
print(classification_report(y_test, pred_test))

classification_report - 테스트데이터
              precision    recall  f1-score   support

         0.0       0.56      0.61      0.58    200191
         1.0       0.54      0.23      0.32    231157
         2.0       0.21      0.62      0.31     58584

    accuracy                           0.43    489932
   macro avg       0.44      0.49      0.40    489932
weighted avg       0.51      0.43      0.43    489932



In [226]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print('오차행렬:\n', confusion)

In [227]:
get_clf_eval(y_test, pred_test)

오차행렬:
 [[121606  32952  45633]
 [ 85924  52430  92803]
 [ 10484  11724  36376]]
