In [5]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib as mpl
import seaborn as sns
mpl.rcParams['figure.dpi'] = 400

plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

In [6]:
df = pd.read_csv('preprocessed/data_preprocessed_over.csv') # over_sampling

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773887 entries, 0 to 2773886
Data columns (total 19 columns):
 #   Column    Dtype  
---  ------    -----  
 0   성별코드      float64
 1   연령대코드     float64
 2   허리둘레      float64
 3   총콜레스테롤    float64
 4   트리글리세라이드  float64
 5   HDL콜레스테롤  float64
 6   LDL콜레스테롤  float64
 7   혈색소       float64
 8   요단백       float64
 9   혈청크레아티닌   float64
 10  AST       float64
 11  ALT       float64
 12  감마지티피     float64
 13  흡연상태      float64
 14  음주여부      float64
 15  복부비만      float64
 16  비만여부      float64
 17  고혈압       float64
 18  식전혈당      int64  
dtypes: float64(18), int64(1)
memory usage: 402.1 MB


In [8]:
print(df['식전혈당'].value_counts())

0    924629
1    924629
2    924629
Name: 식전혈당, dtype: int64


In [9]:
df = df[['성별코드', '연령대코드', '허리둘레', '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤',
       'LDL콜레스테롤', '혈색소', '요단백', '혈청크레아티닌', 'AST', 'ALT', '감마지티피', '흡연상태',
       '음주여부', '복부비만', '비만여부', '고혈압', '식전혈당']]

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [11]:
features_response = df.columns.tolist()

In [12]:
features_response

['성별코드',
 '연령대코드',
 '허리둘레',
 '총콜레스테롤',
 '트리글리세라이드',
 'HDL콜레스테롤',
 'LDL콜레스테롤',
 '혈색소',
 '요단백',
 '혈청크레아티닌',
 'AST',
 'ALT',
 '감마지티피',
 '흡연상태',
 '음주여부',
 '복부비만',
 '비만여부',
 '고혈압',
 '식전혈당']

In [13]:
X = df[features_response].iloc[:,:-1].values
y = df[features_response].iloc[:,-1].values
print(X.shape, y.shape)

(2773887, 18) (2773887,)


In [14]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [15]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectPercentile

[f_stat, f_p_value] = f_classif(X,y)
f_test_df = pd.DataFrame({'Feature':features_response[:-1],
                         'F statistic':f_stat,
                         'p value':f_p_value})
f_test_df.sort_values('p value')

Unnamed: 0,Feature,F statistic,p value
0,성별코드,29807.672559,0.0
15,복부비만,52555.827252,0.0
13,흡연상태,14309.326046,0.0
12,감마지티피,50019.966799,0.0
11,ALT,43839.284557,0.0
10,AST,23878.26772,0.0
9,혈청크레아티닌,1903.034666,0.0
16,비만여부,48180.20354,0.0
8,요단백,25839.218522,0.0
6,LDL콜레스테롤,19258.585212,0.0


In [17]:
from sklearn.feature_selection import SelectPercentile

selector = SelectPercentile(f_classif, percentile=25)
selector.fit(X,y)
best_feature_ix = selector.get_support()
best_feature_ix

array([False,  True,  True, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False,  True])

In [19]:
features = features_response[:-1]

In [20]:
best_features = [features[counter] for counter in range(len(features))
                if best_feature_ix[counter]]

In [21]:
best_features

['연령대코드', '허리둘레', '트리글리세라이드', 'HDL콜레스테롤', '고혈압']

In [22]:
diabetes_rate = df['식전혈당'].mean()
diabetes_rate

1.0

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X = df[best_features].values
y = df['식전혈당'].values

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24, stratify=y)

In [26]:
from collections import Counter

In [27]:
Counter(y)

Counter({1: 924629, 0: 924629, 2: 924629})

In [28]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    C=1, class_weight='balanced', random_state=24, multi_class='multinomial',
    n_jobs=-1, solver='lbfgs').fit(X_train, y_train)

In [29]:
print('학습결과: ', model.score(X_train, y_train))
print('테스트결과: ', model.score(X_test, y_test))

학습결과:  0.47129875529641924
테스트결과:  0.47099233999353973


In [30]:
from sklearn.metrics import classification_report
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
print('classification_repot - 학습데이터')
print(classification_report(y_train, pred_train))

classification_repot - 학습데이터
              precision    recall  f1-score   support

           0       0.51      0.59      0.54    693472
           1       0.37      0.19      0.25    693472
           2       0.48      0.64      0.55    693471

    accuracy                           0.47   2080415
   macro avg       0.45      0.47      0.45   2080415
weighted avg       0.45      0.47      0.45   2080415



In [31]:
print('classification_report - 테스트데이터')
print(classification_report(y_test, pred_test))

classification_report - 테스트데이터
              precision    recall  f1-score   support

           0       0.51      0.59      0.54    231157
           1       0.37      0.19      0.25    231157
           2       0.48      0.64      0.54    231158

    accuracy                           0.47    693472
   macro avg       0.45      0.47      0.45    693472
weighted avg       0.45      0.47      0.45    693472



데이터 정규분포 스케일링 적용

In [32]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(X)

In [33]:
X_train , X_test, y_train , y_test = train_test_split(data_scaled, y, test_size=0.25, random_state=0, stratify=y)

In [34]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    C=0.1, class_weight='balanced', random_state=0, multi_class='multinomial',
    n_jobs=-1, solver='lbfgs').fit(X_train, y_train)

In [35]:
print('학습결과: ', model.score(X_train, y_train))
print('테스트결과: ', model.score(X_test, y_test))

학습결과:  0.4905117488578
테스트결과:  0.48919783351022106


In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

In [37]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
print('classification_repot - 학습데이터')
print(classification_report(y_train, pred_train))

classification_repot - 학습데이터
              precision    recall  f1-score   support

           0       0.53      0.61      0.57    693471
           1       0.38      0.22      0.28    693472
           2       0.50      0.64      0.56    693472

    accuracy                           0.49   2080415
   macro avg       0.47      0.49      0.47   2080415
weighted avg       0.47      0.49      0.47   2080415



In [38]:
print('classification_report - 테스트데이터')
print(classification_report(y_test, pred_test))

classification_report - 테스트데이터
              precision    recall  f1-score   support

           0       0.53      0.61      0.57    231158
           1       0.38      0.22      0.28    231157
           2       0.50      0.63      0.56    231157

    accuracy                           0.49    693472
   macro avg       0.47      0.49      0.47    693472
weighted avg       0.47      0.49      0.47    693472



In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print('오차행렬:\n', confusion)

In [40]:
get_clf_eval(y_test, pred_test)

오차행렬:
 [[141456  37146  52556]
 [ 87338  51316  92503]
 [ 38346  46338 146473]]
