In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib as mpl
import seaborn as sns
mpl.rcParams['figure.dpi'] = 400

plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

In [2]:
df = pd.read_csv('preprocessed/data_preprocessed_under.csv') # over_sampling

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 703011 entries, 0 to 703010
Data columns (total 19 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   성별코드      703011 non-null  int64  
 1   연령대코드     703011 non-null  int64  
 2   허리둘레      703011 non-null  float64
 3   식전혈당      703011 non-null  float64
 4   총콜레스테롤    703011 non-null  float64
 5   트리글리세라이드  703011 non-null  float64
 6   HDL콜레스테롤  703011 non-null  float64
 7   LDL콜레스테롤  703011 non-null  float64
 8   혈색소       703011 non-null  float64
 9   요단백       703011 non-null  float64
 10  혈청크레아티닌   703011 non-null  float64
 11  AST       703011 non-null  float64
 12  ALT       703011 non-null  float64
 13  감마지티피     703011 non-null  float64
 14  흡연상태      703011 non-null  float64
 15  음주여부      703011 non-null  float64
 16  복부비만      703011 non-null  float64
 17  비만여부      703011 non-null  float64
 18  고혈압       703011 non-null  float64
dtypes: float64(17), int64(2)
memory usage: 101.9

In [4]:
print(df['식전혈당'].value_counts())

0.0    234337
2.0    234337
1.0    234337
Name: 식전혈당, dtype: int64


In [5]:
df = df[['성별코드', '연령대코드', '허리둘레', '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤',
       'LDL콜레스테롤', '혈색소', '요단백', '혈청크레아티닌', 'AST', 'ALT', '감마지티피', '흡연상태',
       '음주여부', '복부비만', '비만여부', '고혈압', '식전혈당']]

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [7]:
features_response = df.columns.tolist()

In [8]:
features_response

['성별코드',
 '연령대코드',
 '허리둘레',
 '총콜레스테롤',
 '트리글리세라이드',
 'HDL콜레스테롤',
 'LDL콜레스테롤',
 '혈색소',
 '요단백',
 '혈청크레아티닌',
 'AST',
 'ALT',
 '감마지티피',
 '흡연상태',
 '음주여부',
 '복부비만',
 '비만여부',
 '고혈압',
 '식전혈당']

In [9]:
X = df[features_response].iloc[:,:-1].values
y = df[features_response].iloc[:,-1].values
print(X.shape, y.shape)

(703011, 18) (703011,)


In [10]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [11]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectPercentile

[f_stat, f_p_value] = f_classif(X,y)
f_test_df = pd.DataFrame({'Feature':features_response[:-1],
                         'F statistic':f_stat,
                         'p value':f_p_value})
f_test_df.sort_values('p value')

Unnamed: 0,Feature,F statistic,p value
0,성별코드,7156.208687,0.0
15,복부비만,13891.282255,0.0
13,흡연상태,3368.23301,0.0
12,감마지티피,12568.385173,0.0
11,ALT,11067.988048,0.0
10,AST,6264.807598,0.0
9,혈청크레아티닌,1068.696658,0.0
16,비만여부,12089.354734,0.0
8,요단백,6556.484561,0.0
6,LDL콜레스테롤,4882.216276,0.0


In [12]:
from sklearn.feature_selection import SelectPercentile

selector = SelectPercentile(f_classif, percentile=25)
selector.fit(X,y)
best_feature_ix = selector.get_support()
best_feature_ix

array([False,  True,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False,  True, False,  True])

In [13]:
features = features_response[:-1]

In [14]:
best_features = [features[counter] for counter in range(len(features))
                if best_feature_ix[counter]]

In [15]:
best_features

['연령대코드', '허리둘레', '트리글리세라이드', '복부비만', '고혈압']

In [16]:
diabetes_rate = df['식전혈당'].mean()
diabetes_rate

1.0

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = df[best_features].values
y = df['식전혈당'].values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24, stratify=y)

In [20]:
from collections import Counter

In [21]:
Counter(y)

Counter({1.0: 234337, 2.0: 234337, 0.0: 234337})

In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    C=1, class_weight='balanced', random_state=24, multi_class='multinomial',
    n_jobs=-1, solver='lbfgs').fit(X_train, y_train)

In [23]:
print('학습결과: ', model.score(X_train, y_train))
print('테스트결과: ', model.score(X_test, y_test))

학습결과:  0.46102856665996533
테스트결과:  0.46019129118706364


In [24]:
from sklearn.metrics import classification_report
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
print('classification_repot - 학습데이터')
print(classification_report(y_train, pred_train))

classification_repot - 학습데이터
              precision    recall  f1-score   support

         0.0       0.49      0.61      0.54    175753
         1.0       0.37      0.21      0.27    175753
         2.0       0.48      0.56      0.52    175752

    accuracy                           0.46    527258
   macro avg       0.45      0.46      0.44    527258
weighted avg       0.45      0.46      0.44    527258



In [25]:
print('classification_report - 테스트데이터')
print(classification_report(y_test, pred_test))

classification_report - 테스트데이터
              precision    recall  f1-score   support

         0.0       0.49      0.60      0.54     58584
         1.0       0.37      0.22      0.27     58584
         2.0       0.47      0.56      0.51     58585

    accuracy                           0.46    175753
   macro avg       0.44      0.46      0.44    175753
weighted avg       0.44      0.46      0.44    175753



데이터 정규분포 스케일링 적용

In [26]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(X)

In [27]:
X_train , X_test, y_train , y_test = train_test_split(data_scaled, y, test_size=0.25, random_state=0, stratify=y)

In [28]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    C=0.1, class_weight='balanced', random_state=0, multi_class='multinomial',
    n_jobs=-1, solver='lbfgs').fit(X_train, y_train)

In [29]:
print('학습결과: ', model.score(X_train, y_train))
print('테스트결과: ', model.score(X_test, y_test))

학습결과:  0.48677497543896914
테스트결과:  0.48542556883808524


In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

In [31]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
print('classification_repot - 학습데이터')
print(classification_report(y_train, pred_train))

classification_repot - 학습데이터
              precision    recall  f1-score   support

         0.0       0.53      0.61      0.56    175752
         1.0       0.38      0.23      0.29    175753
         2.0       0.50      0.62      0.55    175753

    accuracy                           0.49    527258
   macro avg       0.47      0.49      0.47    527258
weighted avg       0.47      0.49      0.47    527258



In [32]:
print('classification_report - 테스트데이터')
print(classification_report(y_test, pred_test))

classification_report - 테스트데이터
              precision    recall  f1-score   support

         0.0       0.53      0.61      0.56     58585
         1.0       0.38      0.23      0.29     58584
         2.0       0.50      0.62      0.55     58584

    accuracy                           0.49    175753
   macro avg       0.47      0.49      0.47    175753
weighted avg       0.47      0.49      0.47    175753



In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print('오차행렬:\n', confusion)

In [37]:
get_clf_eval(y_test, pred_test)

오차행렬:
 [[35453  9718 13414]
 [21470 13397 23717]
 [10302 11817 36465]]
