# Data Preprocessing

In [None]:
# Load the pandas and NumPy packages.
import pandas as pd
import numpy as np

# Load the 2016, 2017 and 2018 national survey files.
df16 = pd.read_sas('hn16_all.sas7bdat')
df17 = pd.read_sas('hn17_all.sas7bdat')
df18 = pd.read_sas('hn18_all.sas7bdat')

df_all = pd.concat([df16, df17, df18])

# Keep the variables that prior studies (papers) found meaningful.
# .copy() makes df_att an independent frame so the column assignments below
# do not write into a slice of df_all.
feature_cols = ["sex", "age", "D_1_1", "Total_slp_wk", "BP1", "BO1", "BE5_1",
                "BE8_1", "BE8_2", "BP5", "HE_ht", "HE_wt"]
df_att = df_all[feature_cols].copy()

# Histograms for exploratory analysis.
import matplotlib.pyplot as plt
import seaborn as sns
# Iterate over the very list used to build df_att: the original loop used a
# separate list containing "Total_slp_wd", which is not a column of df_att.
for col in feature_cols:
    plt.hist(df_att[col].dropna())  # NaN would make the bin range non-finite
    plt.ylabel('frequency')
    plt.title('Histogram of {}'.format(col))
    plt.show()

# Create the nominal BMI variable: 0 when BMI < 25, 1 when BMI >= 25,
# NaN when height or weight is missing.
bmi_value = df_att['HE_wt'] / (df_att['HE_ht'] / 100) ** 2
df_att['BMI'] = (bmi_value >= 25).astype(float).where(bmi_value.notna())

# Build the adolescent subset (ages 12 to 18, both inclusive).
df_csn = df_att.loc[df_att['age'].between(12, 18), :].copy()

# Average daily sitting time in minutes (hours * 60 + minutes).
# The integration cell further down treats 88 and 99 in BE8_1 / BE8_2 as
# missing; mask them here too so those codes are not converted into minutes.
missing_codes = [88, 99]
sit_hours = df_csn['BE8_1'].mask(df_csn['BE8_1'].isin(missing_codes))
sit_minutes = df_csn['BE8_2'].mask(df_csn['BE8_2'].isin(missing_codes))
df_csn["SitTime"] = sit_hours * 60 + sit_minutes

In [None]:
# Load the 2018 youth health behaviour survey data from cgh18.csv.
df_cgh = pd.read_csv('cgh18.csv')

In [None]:
# Combine the adolescent subset of the national survey with the youth survey.
# NOTE(review): the original concatenated an undefined name `a`. df_cgh is the
# only youth-survey frame loaded in this file, so it is used here -- confirm
# that no intermediate preprocessing of df_cgh is missing.
df_att = pd.concat([df_csn, df_cgh])
df_att['BMI'] = df_att['BMI'].astype('category')
# 88 and 99 in the sitting-time columns are replaced with NaN.
for col in ('BE8_1', 'BE8_2'):
    df_att.loc[df_att[col].isin([88, 99]), col] = np.nan

In [None]:
# Keep only the attributes needed downstream.
df_att = df_att[[
    'sex', 'age', 'D_1_1', 'BP1', 'BO1', 'BE5_1', 'BP5',
    'BMI', 'Total_slp_wk', 'SitTime',
]]

In [None]:
# Days of strength training per week (interval scale).
# Author's recorded counts: code 8 (not applicable) 929 respondents,
# code 9 (don't know / no answer) 267 respondents -> both become NA.
print(df_att['BE5_1'].value_counts())
df_att.loc[df_att['BE5_1'].isin([8, 9]), 'BE5_1'] = np.nan  # mark as missing

In [None]:
# Self-rated health status [D_1_1] (ordinal): code 9 becomes NaN, then the
# column is stored as an ordered categorical.
from pandas.api.types import CategoricalDtype
df_att['D_1_1'] = (
    df_att['D_1_1']
    .mask(df_att['D_1_1'] == 9)
    .astype(CategoricalDtype(ordered=True))
)

In [None]:
# Depressed mood lasting two consecutive weeks or more [BP5] (nominal):
# codes 8 and 9 become NaN, then the column is stored as a categorical.
df_att['BP5'] = (
    df_att['BP5']
    .mask(df_att['BP5'].isin([8, 9]))
    .astype('category')
)

In [None]:
# Usual perceived stress level [BP1] (ordinal): codes 8 and 9 become NaN,
# then the column is stored as an ordered categorical.
from pandas.api.types import CategoricalDtype
df_att['BP1'] = (
    df_att['BP1']
    .mask(df_att['BP1'].isin([8, 9]))
    .astype(CategoricalDtype(ordered=True))
)

In [None]:
# Self-perceived body shape [BO1] (ordinal): codes 8 and 9 become NaN, then
# the column is stored as an ordered categorical.
# (The original import line ended in a stray ")" and did not parse.)
from pandas.api.types import CategoricalDtype
df_att.loc[df_att['BO1'].isin([8, 9]), 'BO1'] = np.nan
df_att['BO1'] = df_att['BO1'].astype(CategoricalDtype(ordered=True))

In [None]:
# Days of strength training per week [BE5_1] becomes an ordered categorical;
# sex becomes a plain categorical.
from pandas.api.types import CategoricalDtype
df_att = df_att.astype({
    'BE5_1': CategoricalDtype(ordered=True),
    'sex': 'category',
})

## 결측치 처리
- 전체(남녀 통합) 데이터셋: df_all
- 남자 데이터셋: df_all_b
- 여자 데이터셋: df_all_g

In [None]:
# Remove every row that contains a missing value, then move the old index
# into a column and keep an independent copy as df_all.
df_new = df_att.dropna(axis=0).reset_index()
df_all = df_new.copy()

In [None]:
# Build the male subset (sex == 1) without the sex column.
is_male = df_all["sex"] == 1
df_all_b = df_all.loc[is_male, :].drop(columns='sex')

In [None]:
# Build the female subset (sex == 2) without the sex column.
df_all_g = df_all.loc[df_all["sex"] == 2, :].drop('sex', axis=1)
# Remove the old-index column that reset_index() added. The original used the
# positional slice iloc[:, 1:11], which removes a real feature if this cell is
# run a second time; dropping by name is safe to repeat.
# NOTE(review): assumes the column is named 'index', i.e. the frame index was
# unnamed when reset_index() ran -- confirm.
df_all = df_all.drop(columns='index', errors='ignore')
df_all_g = df_all_g.drop(columns='index', errors='ignore')
df_all_b = df_all_b.drop(columns='index', errors='ignore')

## 이상치 제거

In [None]:
# Outlier-removal helper.
def remove_outlier(d_cp, column, factor=1.5):
    """Drop rows whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``.

    Args:
        d_cp: DataFrame to clean. It is modified in place.
        column: Name of the numeric column to inspect.
        factor: IQR multiplier for the fences (default 1.5).

    Returns:
        The same DataFrame object ``d_cp`` with the outlier rows removed.
    """
    values = d_cp[column]
    # Series.quantile ignores NaN. np.percentile returns NaN fences as soon as
    # one value is missing, and then no row is ever removed.
    quan_25 = values.quantile(0.25)
    quan_75 = values.quantile(0.75)
    spread = (quan_75 - quan_25) * factor
    lowest = quan_25 - spread
    highest = quan_75 + spread
    # Rows with NaN compare False on both sides and are therefore kept.
    outlier_index = values[(values < lowest) | (values > highest)].index
    d_cp.drop(outlier_index, axis=0, inplace=True)
    return d_cp

In [None]:
# Run the outlier removal on the combined, female and male frames, first for
# SitTime and then for Total_slp_wk.
for column in ('SitTime', 'Total_slp_wk'):
    df_all = remove_outlier(df_all, column)
    df_all_g = remove_outlier(df_all_g, column)
    df_all_b = remove_outlier(df_all_b, column)

# Train/Test 분할, 업샘플링, 스케일링

In [None]:
# Train/test split, upsampling of the training data and optional scaling.
def _with_scaled_columns(frame, scaler, columns):
    """Return a copy of ``frame`` whose ``columns`` were transformed by ``scaler``."""
    scaled = frame.copy()
    scaled[columns] = scaler.transform(frame[columns])
    return scaled


def train_test_split_and_upsample(df_input, scaling):
    """Split the data 70/30 and build an upsampled version of the training set.

    Every column is converted to a categorical except ``Total_slp_wk`` and
    ``SitTime``, which are converted to float64. ``BMI`` is the target. In the
    returned feature frames the two continuous columns come last.

    Changes compared with the original implementation:
      * Class 1 is upsampled from the training portion only. The original
        upsampled the full data and split it independently, so copies of test
        rows ended up in ``train_x_res``.
      * When ``scaling`` is true, one MinMaxScaler is fitted on ``train_x`` and
        applied to all three feature frames. The original fitted a separate
        scaler on the test set.

    Args:
        df_input: DataFrame containing ``BMI``, ``Total_slp_wk``, ``SitTime``
            and the remaining feature columns. It is not modified.
        scaling: If true, min-max scale ``Total_slp_wk`` and ``SitTime``.

    Returns:
        Tuple ``(train_x, test_x, train_y, test_y, train_x_res, train_y_res)``.
        The ``*_res`` items are the training data after class 1 was sampled
        with replacement up to the size of class 0.
    """
    from sklearn.model_selection import train_test_split
    from sklearn import preprocessing

    continuous = ['Total_slp_wk', 'SitTime']

    df = df_input.copy().astype('category')
    for name in continuous:
        df[name] = df[name].astype('float64')

    # Sorted categorical features first, the two continuous columns last.
    categorical = [c for c in df.columns.difference(['BMI']) if c not in continuous]
    feature_columns = categorical + continuous
    X = df[feature_columns]
    y = df[['BMI']]

    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)
    train_x = train_x.reset_index(drop=True)
    test_x = test_x.reset_index(drop=True)
    train_y = train_y['BMI'].reset_index(drop=True)
    test_y = test_y['BMI'].reset_index(drop=True)

    # Upsample class 1 inside the training portion so that no test row can
    # leak into the resampled training set.
    train = train_x.assign(BMI=train_y)
    train_class_0 = train[train['BMI'] == 0]
    train_class_1 = train[train['BMI'] == 1]
    train_class_1_over = train_class_1.sample(
        train_class_0.shape[0], replace=True, random_state=10)
    train_res = pd.concat([train_class_0, train_class_1_over], axis=0)
    # Shuffle so the two classes are not stored as two contiguous runs.
    train_res = train_res.sample(frac=1, random_state=0).reset_index(drop=True)
    train_x_res = train_res[feature_columns]
    train_y_res = train_res['BMI']

    # Optional scaling, with statistics taken from the training data only.
    if scaling:
        scaler = preprocessing.MinMaxScaler().fit(train_x[continuous])
        train_x = _with_scaled_columns(train_x, scaler, continuous)
        test_x = _with_scaled_columns(test_x, scaler, continuous)
        train_x_res = _with_scaled_columns(train_x_res, scaler, continuous)
    return train_x, test_x, train_y, test_y, train_x_res, train_y_res

In [None]:
# Produce the splits, including the oversampled training data.
# Pass df_all_b or df_all_g instead of df_all to get the train / test /
# oversampled sets for those frames.
tmp = train_test_split_and_upsample(df_all, scaling=True)
train_x, test_x, train_y, test_y, train_x_res, train_y_res = tmp

# 모델링 및 변수 중요도 파악

## PCA

In [None]:
# Choose which training set to use: the original one or the oversampled one.

#train_x = train_x_res.copy()
#train_y = train_y_res.copy()

# NOTE(review): make_pca_col is not defined anywhere in this file -- confirm
# where it comes from. Both calls receive a feature frame; the training
# features used from here on are derived from the oversampled set.
train_x = make_pca_col(train_x_res)
train_y = train_y_res.copy()
test_x = make_pca_col(test_x)

## Random Forest

In [None]:
# Random forest grid search.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Hyper-parameter candidates.
params = {
    'n_estimators': [10, 100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}
rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=10, n_jobs=-1, scoring='recall')
grid_cv.fit(train_x, train_y)

predicted = grid_cv.predict(test_x)
# Predict probabilities once; column 1 holds the score of the second class.
predicted_proba = grid_cv.predict_proba(test_x)
pred_y_proba = predicted_proba[:, 1]

# ROC curve and AUC from the class-1 scores.
fpr, tpr, thresholds = roc_curve(test_y, pred_y_proba)
auc = roc_auc_score(test_y, pred_y_proba)

print(confusion_matrix(test_y, predicted))
# model_evaluation prints its metrics and returns None, so it is called
# directly instead of being wrapped in print().
# NOTE(review): model_evaluation is defined in a later cell of this file; that
# cell has to be run first.
model_evaluation(test_y, predicted)
print('AUC:', round(auc, 3))
plt.plot(fpr, tpr)
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)

In [None]:
# Feature importance extraction, part 1 (eli5 permutation importance).

import eli5
from eli5.sklearn import PermutationImportance 

# Fit the permutation-importance wrapper around the fitted grid search on the
# training data, then build the weights view for up to 80 features, labelled
# with the training column names.
perm = PermutationImportance(grid_cv, random_state = 0).fit(train_x, train_y) 
df_impt = eli5.show_weights(perm, top = 80, feature_names = train_x.columns.tolist())

In [None]:
# Feature importance extraction, part 2: show the object built in the previous cell.
df_impt

In [None]:
# Visualise permutation importance as a horizontal bar chart, smallest mean
# importance first.
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
result = permutation_importance(
    grid_cv, train_x, train_y,
    n_repeats=10, random_state=42, n_jobs=2,
)
importance_means = result.importances_mean
sorted_idx = np.argsort(importance_means)

plt.barh(train_x.columns[sorted_idx], importance_means[sorted_idx])
plt.title('Permutation Importance', fontsize=18)
plt.ylabel('Feature name', fontsize=15)
plt.show()

## 로지스틱 회귀모형

In [None]:
# Multicollinearity check: variance inflation factor (VIF) per column.
# NOTE(review): the identifier 데이터프레임 ("dataframe") is never assigned in
# this file; it looks like a template placeholder to be replaced with the
# feature frame under inspection -- confirm.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
데이터프레임 = add_constant(데이터프레임)
vif = pd.DataFrame()
# One VIF per column of the frame returned by add_constant.
vif["VIF Factor"] = [variance_inflation_factor(
    데이터프레임.values, i) for i in range(데이터프레임.shape[1])]
vif["features"] = 데이터프레임.columns
vif.T

In [None]:
# Performance Measure 출력 함수 만들기
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

def model_evaluation(label, predict):
    """Print classification metrics derived from the confusion matrix.

    Row/column 1 of ``confusion_matrix(label, predict)`` is treated as the
    positive class. Prints Accuracy, Precision, Recall, Specificity, F1 and
    F2, and returns None.
    """
    cf_matrix = confusion_matrix(label, predict)
    true_neg, false_pos = cf_matrix[0][0], cf_matrix[0][1]
    false_neg, true_pos = cf_matrix[1][0], cf_matrix[1][1]

    Accuracy = (true_neg + true_pos) / cf_matrix.sum()
    Precision = true_pos / (true_pos + false_pos)
    Recall = true_pos / (true_pos + false_neg)
    Specificity = true_neg / (true_neg + false_pos)
    F1_Score = (2 * Recall * Precision) / (Recall + Precision)
    # F2 is used because recall is considered twice as important as precision.
    F2_Score = (5 * Recall * Precision) / (Recall + 4 * Precision)

    report = (
        ("Accuracy: ", Accuracy),
        ("Precision: ", Precision),
        ("Recall: ", Recall),
        ("Specificity: ", Specificity),
        ("F1_Score: ", F1_Score),
        ("F2_Score: ", F2_Score),
    )
    for title, score in report:
        print(title, score)

In [None]:
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# K-means clustering as a preprocessing step, followed by logistic regression.
# This cell searches for the best number of clusters K.

param_grid = {"kmeans__n_clusters": range(95, 106)}

pipeline_steps = [
    ("kmeans", KMeans()),
    ("log_reg", LogisticRegression(max_iter=15000)),
]
pipeline = Pipeline(pipeline_steps)

grid_clf = GridSearchCV(pipeline, param_grid)
# y is flattened with .values.ravel() because passing a column vector raised
# "A column-vector y was passed when a 1d array was expected".
grid_clf.fit(train_x_res, train_y_res.values.ravel())
print(grid_clf.best_params_)
print(grid_clf.score(test_x, test_y.values.ravel()))

In [None]:
# Grid search on plain logistic regression (no K-means preprocessing),
# comparing the l1 and l2 penalties.

# Candidate parameters.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
}

# Run the grid search.
grid_search = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=42),
    param_grid,
    cv=2,
)

grid_search.fit(train_x, train_y.values.ravel())
grid_search.score(test_x, test_y.values.ravel())  # result is neither stored nor printed
print(f"best parameters : {grid_search.best_params_}")
        

In [None]:
# Evaluate the fitted grid search on the test split.
predicted = grid_search.predict(test_x)
print(confusion_matrix(test_y.values.ravel(), predicted))
# model_evaluation prints its metrics and returns None; call it directly so no
# stray "None" line is printed.
model_evaluation(test_y.values.ravel(), predicted)
# AUC needs a ranking score. The original passed the rounded hard labels,
# which collapses the ROC curve to a single operating point.
positive_proba = grid_search.predict_proba(test_x)[:, 1]
print("AUC: ", roc_auc_score(test_y.values.ravel(), positive_proba))
print('='*40)

In [None]:
# Grid search on plain logistic regression (no K-means preprocessing) with the
# elastic-net penalty.

# Candidate parameters.
param_grid = {'C': [0.01, 0.1, 1, 10], 'l1_ratio': [0.3, 0.6, 0.9]}

# Run the grid search.
grid_search = GridSearchCV(
    LogisticRegression(solver='saga', random_state=42, max_iter=5000, penalty='elasticnet'),
    param_grid,
    cv=2,
)

grid_search.fit(train_x, train_y.values.ravel())
grid_search.score(test_x, test_y.values.ravel())
print("best parameters : {}".format(grid_search.best_params_))

predicted = grid_search.predict(test_x)
print(confusion_matrix(test_y.values.ravel(), predicted))
# model_evaluation prints its metrics and returns None; call it directly.
model_evaluation(test_y.values.ravel(), predicted)
# AUC from class-1 probabilities instead of rounded hard labels.
positive_proba = grid_search.predict_proba(test_x)[:, 1]
print("AUC: ", roc_auc_score(test_y.values.ravel(), positive_proba))

print('='*40)

In [None]:
# Grid search after K-means preprocessing, comparing the l1 and l2 penalties.

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

param_grid = {
    'log_reg__C': [0.01, 0.1, 1, 10],
    'log_reg__penalty': ['l1', 'l2'],
}

pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=122)),
    # Author's idea to try later: set the parameters on LogisticRegression
    # directly, or nest a GridSearchCV in place of LogisticRegression.
    ("log_reg", LogisticRegression(solver='liblinear', max_iter=20000)),
])

grid_clf = GridSearchCV(pipeline, param_grid, cv=5)
# y is flattened with .values.ravel() because passing a column vector raised
# "A column-vector y was passed when a 1d array was expected".
grid_clf.fit(train_x_res, train_y_res.values.ravel())
print("best parameters : ", grid_clf.best_params_)
print("Grid Search Score: ", grid_clf.score(test_x, test_y.values.ravel()))

predicted = grid_clf.predict(test_x)
print(confusion_matrix(test_y.values.ravel(), predicted))
# model_evaluation prints its metrics and returns None; call it directly.
model_evaluation(test_y.values.ravel(), predicted)
# AUC from class-1 probabilities instead of rounded hard labels.
positive_proba = grid_clf.predict_proba(test_x)[:, 1]
print("AUC: ", roc_auc_score(test_y.values.ravel(), positive_proba))

In [None]:
# Grid search after K-means preprocessing, with the elastic-net penalty.

param_grid = {
    'log_reg__C': [0.01, 0.1, 1, 10],
    'log_reg__l1_ratio': [0.3, 0.6, 0.9],
}

pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=2)),
    # Author's idea to try later: set the parameters on LogisticRegression
    # directly, or nest a GridSearchCV in place of LogisticRegression.
    ("log_reg", LogisticRegression(solver='saga', penalty='elasticnet', max_iter=20000)),
])

grid_clf = GridSearchCV(pipeline, param_grid, cv=5)
# y is flattened with .values.ravel() because passing a column vector raised
# "A column-vector y was passed when a 1d array was expected".
grid_clf.fit(train_x_res, train_y_res.values.ravel())
print("best parameters : ", grid_clf.best_params_)
print("Grid Search Score: ", grid_clf.score(test_x, test_y.values.ravel()))

predicted = grid_clf.predict(test_x)
print(confusion_matrix(test_y, predicted))
# model_evaluation prints its metrics and returns None; call it directly.
model_evaluation(test_y, predicted)
# AUC from class-1 probabilities instead of rounded hard labels.
positive_proba = grid_clf.predict_proba(test_x)[:, 1]
print("AUC: ", roc_auc_score(test_y, positive_proba))

from sklearn.linear_model import LogisticRegression

# Fit a single l1-penalised model; the next cell reads its coefficients.
logreg = LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=42)
logreg.fit(train_x, train_y.values.ravel())

In [None]:
# Inspect the regression coefficients (feature name -> coefficient).
coef_dict = dict(zip(train_x.columns, logreg.coef_[0, :]))
# A bare expression in the middle of a cell shows nothing, so print it.
print(coef_dict)

# scikit-learn offers no significance test for the coefficients, so
# statsmodels is used here only to look at the p-values of the explanatory
# variables. The fit below is an ordinary least squares (linear regression)
# model, so its coefficients are ignored and only the p-values are read.
# Reference: https://qastack.kr/programming/27928275/find-p-value-significance-in-scikit-learn-linearregression
import statsmodels.api as sm
X2 = sm.add_constant(train_x)
model = sm.OLS(train_y, X2)
result = model.fit()
print(result.summary())
# These helper columns are never created in this file, so a plain drop raises
# KeyError; errors='ignore' removes them only when they exist.
df_all = df_all.drop(
    ['Total_slp_wd_standard', 'SitTime_standard', 'Total_slp_wd_scaled', 'SitTime_scaled'],
    axis=1,
    errors='ignore',
)

## Support Vector Machine

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
def model_evaluation(label, predict):
    """Print classification metrics derived from the confusion matrix.

    Row/column 1 of ``confusion_matrix(label, predict)`` is treated as the
    positive class. Prints Accuracy, Precision, Recall, Specificity, F1, F2
    and the ROC AUC of the rounded predictions, and returns None.
    """
    cf_matrix = confusion_matrix(label, predict)
    true_neg, false_pos = cf_matrix[0][0], cf_matrix[0][1]
    false_neg, true_pos = cf_matrix[1][0], cf_matrix[1][1]

    Accuracy = (true_neg + true_pos) / cf_matrix.sum()
    Precision = true_pos / (true_pos + false_pos)
    Recall = true_pos / (true_pos + false_neg)
    Specificity = true_neg / (true_neg + false_pos)
    F1_Score = (2 * Recall * Precision) / (Recall + Precision)
    F2_Score = (5 * Recall * Precision) / (Recall + 4 * Precision)

    report = (
        ("Accuracy: ", Accuracy),
        ("Precision: ", Precision),
        ("Recall: ", Recall),
        ("Specificity: ", Specificity),
        ("F1-Score: ", F1_Score),
        ("F2-Score: ", F2_Score),
    )
    for title, score in report:
        print(title, score)
    print("auc score: ", roc_auc_score(label, np.round(predict, 0)))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Hyper-parameter candidates: one grid per kernel (rbf, poly), both sharing
# the same C and gamma values.
param_grid = [
    {'C': [0.1, 1, 10], 'kernel': [kernel], 'gamma': [1, 0.1]}
    for kernel in ('rbf', 'poly')
]

In [None]:
def svm_model(train_x, train_y,test_x, test_y,cv):
    """Grid-search an SVC over the module-level ``param_grid`` and report results.

    Fits ``GridSearchCV(SVC(), param_grid, cv=cv)`` on the training data, then
    prints the best parameters, the test confusion matrix and the metrics from
    ``model_evaluation``.

    Args:
        train_x, train_y: Training features and labels.
        test_x, test_y: Test features and labels.
        cv: Number of cross-validation folds passed to GridSearchCV.

    Returns:
        None. All results are printed.
    """
    # NOTE(review): this prints the whole training frame, not a dataset name.
    print("데이터셋 : ", train_x ," &  cv: " , cv)
    grid_search = GridSearchCV(SVC(), param_grid, cv=cv, return_train_score=True)
    grid_search.fit(train_x, train_y)
    print("best parameters : {}".format(grid_search.best_params_))
    predicted = grid_search.predict(test_x)
    print(confusion_matrix(test_y, predicted))
    # model_evaluation prints its metrics and returns None; wrapping it in
    # print() only added a stray "None" line.
    model_evaluation(test_y, predicted)
 

In [None]:
# NOTE(review): the section labels below are "all", "male" and "female", yet
# every call passes the same train_x / train_y / test_x / test_y variables.
# Presumably the splits were regenerated from df_all_b / df_all_g between
# runs -- confirm. Each pair runs the original and the oversampled (_res)
# training set, first with cv=5 and then with cv=10.
# All
svm_model(train_x, train_y, test_x, test_y, 5)
svm_model(train_x_res, train_y_res, test_x, test_y, 5)
# Male
svm_model(train_x, train_y, test_x, test_y, 5)
svm_model(train_x_res, train_y_res, test_x, test_y, 5)
# Female
svm_model(train_x, train_y, test_x, test_y, 5)
svm_model(train_x_res, train_y_res, test_x, test_y, 5)
# All
svm_model(train_x, train_y, test_x, test_y, 10)
svm_model(train_x_res, train_y_res, test_x, test_y, 10)
# Male
svm_model(train_x, train_y, test_x, test_y, 10)
svm_model(train_x_res, train_y_res, test_x, test_y, 10)
# Female
svm_model(train_x, train_y, test_x, test_y, 10)
svm_model(train_x_res, train_y_res, test_x, test_y, 10)

In [None]:
# Linear SVM

# Candidate C values for the linear kernel. The original list literal was
# missing its closing bracket and did not parse.
param_grid = [{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']}]

def svm_model(train_x, train_y,test_x, test_y,cv):
    """Grid-search an SVC over the module-level ``param_grid`` and report results.

    Fits ``GridSearchCV(SVC(), param_grid, cv=cv)`` on the training data, then
    prints the best parameters, the test confusion matrix and the metrics from
    ``model_evaluation``.

    Args:
        train_x, train_y: Training features and labels.
        test_x, test_y: Test features and labels.
        cv: Number of cross-validation folds passed to GridSearchCV.

    Returns:
        None. All results are printed.
    """
    # NOTE(review): this prints the whole training frame, not a dataset name.
    print("데이터셋 : ", train_x ," &  cv: " , cv)
    grid_search = GridSearchCV(SVC(), param_grid, cv=cv, return_train_score=True)
    grid_search.fit(train_x, train_y)
    print("best parameters : {}".format(grid_search.best_params_))
    predicted = grid_search.predict(test_x)
    print(confusion_matrix(test_y, predicted))
    # model_evaluation prints its metrics and returns None; wrapping it in
    # print() only added a stray "None" line.
    model_evaluation(test_y, predicted)

# NOTE(review): as in the kernel-SVM cell, the "all" / "male" / "female"
# sections all pass the same split variables -- presumably the splits were
# regenerated between runs; confirm.
# All
svm_model(train_x, train_y, test_x, test_y, 5)
svm_model(train_x_res, train_y_res, test_x, test_y, 5)
# Male
svm_model(train_x, train_y, test_x, test_y, 5)
svm_model(train_x_res, train_y_res, test_x, test_y, 5)
# Female
svm_model(train_x, train_y, test_x, test_y, 5)
svm_model(train_x_res, train_y_res, test_x, test_y, 5)
# All
svm_model(train_x, train_y, test_x, test_y, 10)
svm_model(train_x_res, train_y_res, test_x, test_y, 10)
# Male
svm_model(train_x, train_y, test_x, test_y, 10)
svm_model(train_x_res, train_y_res, test_x, test_y, 10)
# Female
svm_model(train_x, train_y, test_x, test_y, 10)
svm_model(train_x_res, train_y_res, test_x, test_y, 10)

# Author's note: the best-scoring parameter was C=1000.

In [None]:
# 변수 중요도 확인
from sklearn import svm
import matplotlib.pyplot as plt
def feature_plot(classifier, feature_names, top_features):
    """Bar-plot the most negative and most positive coefficients of a linear model.

    Args:
        classifier: Fitted estimator exposing ``coef_``.
        feature_names: Feature names, in the same order as the coefficients.
        top_features: Number of coefficients to take from each end.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    coef = classifier.coef_.ravel()
    order = np.argsort(coef)  # sort once and slice both ends
    top_positive_coefficients = order[-top_features:]
    top_negative_coefficients = order[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # One bar position per selected coefficient. The original passed
    # 1 + 2 * top_features tick positions for 2 * top_features labels, which
    # current matplotlib rejects because the counts differ.
    positions = np.arange(len(top_coefficients))
    plt.figure(figsize=(18, 7))
    colors = ['green' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(positions, coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(positions, feature_names[top_coefficients], rotation=45, ha='right')
    plt.show()

# Fit a linear SVM with C=1000 on the oversampled training data and plot its
# four most negative and four most positive coefficients.
lienearsvm = svm.LinearSVC(C=1000).fit(train_x_res, train_y_res)
# `value` was only assigned in the last cell of the file; define it here so
# this cell does not depend on that cell having been run first.
value = train_x_res.columns.values
feature_plot(lienearsvm, value, 4)


## MLP

In [None]:
# tensorflow, keras 라이브러리 호출
import tensorflow as tf
from tensorflow import keras

In [None]:
# Input layer
input_ = keras.layers.Input(shape=train_x.shape[1:])
# Hidden layers
hidden1 = keras.layers.Dense(30, activation='relu')(input_)
hidden2 = keras.layers.Dense(30, activation='relu')(hidden1)
# The raw inputs are concatenated with the second hidden layer's output.
concat = keras.layers.Concatenate()([input_, hidden2])
# Output layer
output = keras.layers.Dense(1, activation='sigmoid')(concat)
model = keras.Model(inputs=[input_], outputs=[output])
# Compile the model. `learning_rate` is the supported argument name; the old
# `lr` alias is deprecated and rejected by newer Keras releases.
# NOTE(review): the loss is "mse" on a sigmoid output; binary cross-entropy is
# the usual choice for a binary target -- left unchanged here.
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))
# Early stopping and checkpoint callbacks
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_keras_model.h5", save_best_only=True)

In [None]:
# Train the model for up to 100 epochs with the checkpoint and early-stopping
# callbacks.
# NOTE(review): the test split is passed as validation_data, so the callbacks
# presumably select the stopping point / best weights using test data -- a
# separate validation split would avoid that; confirm intent.
model.fit(np.array(train_x.astype(float)), train_y, epochs=100, validation_data = (np.array(test_x.astype(float)), test_y), callbacks=[checkpoint_cb, early_stopping_cb])

In [None]:
# Predict on the test set and display the raw model outputs.
preds = model.predict(test_x.astype(float))
preds

In [None]:
#성능 측정
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [None]:
def model_evaluation(label, predict):
    """Print classification metrics derived from the confusion matrix.

    Row/column 1 of ``confusion_matrix(label, predict)`` is treated as the
    positive class. Prints Accuracy, Precision, Recall, Specificity, F1, F2
    and the ROC AUC of the rounded predictions, and returns None.
    """
    cf_matrix = confusion_matrix(label, predict)
    true_neg, false_pos = cf_matrix[0][0], cf_matrix[0][1]
    false_neg, true_pos = cf_matrix[1][0], cf_matrix[1][1]

    Accuracy = (true_neg + true_pos) / cf_matrix.sum()
    Precision = true_pos / (true_pos + false_pos)
    Recall = true_pos / (true_pos + false_neg)
    Specificity = true_neg / (true_neg + false_pos)
    F1_Score = (2 * Recall * Precision) / (Recall + Precision)
    F2_Score = (5 * Recall * Precision) / (Recall + 4 * Precision)

    report = (
        ("Accuracy: ", Accuracy),
        ("Precision: ", Precision),
        ("Recall: ", Recall),
        ("Specificity: ", Specificity),
        ("F1-Score: ", F1_Score),
        ("F2-Score: ", F2_Score),
    )
    for title, score in report:
        print(title, score)
    print("auc score: ", roc_auc_score(label, np.round(predict, 0)))

In [None]:
# Evaluate the network's rounded test predictions.
model_evaluation(test_y, np.round(preds,0))
# Column names of the oversampled training features (the SVM
# feature-importance cell passes `value` to feature_plot).
value = train_x_res.columns.values