In [None]:
library(tidyverse)
library(gridExtra)
library(ROCR)
library(ggplot2)
library(MASS)
library(glmnet)
library(randomForest)
library(rpart)
library(e1071)
library(nnet)
library(dplyr)

### 1. 데이터 읽기

In [12]:
# Load the analysis dataset from a CSV file whose first row holds column names.
# NOTE(review): the original note on this line said "no missing values" --
# confirm, since a missing-value imputation step follows later in the notebook.
anal <- read.csv(file = "파일명", header = TRUE)

#### 1.1 데이터 확인

In [None]:
# Show the dimensions of the loaded data frame (number of rows, number of columns).
dim(anal)
# Show a per-column summary of the loaded data.
summary(anal)

#### 1.2 결측치 처리

##### 1) 평균화

In [4]:
# Names of the columns that contain at least one missing value
# (printed for inspection).
cols <- names(anal)[colSums(is.na(anal)) > 0]
cols

# Replace every NA with the mean of the observed values of its column.
for (col in cols) {
  values <- anal[[col]]
  # mean() is only meaningful for numeric/logical data; on other types it
  # returns NA with a warning, so such columns are skipped explicitly.
  if (!(is.numeric(values) || is.logical(values))) {
    next
  }
  # NOTE(review): columns 12 onward are later recoded as 0 -> "NO", else "YES";
  # a mean-imputed value in such a column is non-zero whenever any observed
  # value is non-zero, so it would be recoded as "YES" -- confirm this is
  # intended.
  anal[[col]][is.na(values)] <- mean(values, na.rm = TRUE)
}

In [5]:
# Work on a copy so the imputed numeric data in `anal` stays available.
anal1 <- anal

# Columns 12 through the last one are recoded as two-level factors:
# 0 -> "NO", anything else -> "YES".
stopifnot(length(anal) >= 12)
flag_cols <- 12:length(anal)

# List-style indexing (anal[flag_cols]) always yields a data frame, so the
# recoding is applied per column even when only one column is selected
# (anal[, 12:12] would collapse to a plain vector). Explicit levels keep both
# classes present even if a column happens to be constant.
anal1[flag_cols] <- lapply(anal[flag_cols], function(x) {
  factor(ifelse(x == 0, "NO", "YES"), levels = c("NO", "YES"))
})

### 2. 셋 분할

In [6]:
#-------------------------------------------------------
# Split the row indices 60% / 20% / remainder into train / validation / test.
set.seed(1)

n <- nrow(anal1)
id <- seq_len(n)

# Training rows: 60% of all rows (size floored explicitly; sample() would
# truncate a fractional size anyway).
train_id <- sample(id, floor(n * 0.6))

# Validation rows: 20% of all rows, drawn from the rows not used for training.
# setdiff() replaces id[-train_id] because negative indexing with an empty
# vector selects nothing, and x[sample.int(length(x), ...)] replaces
# sample(x, ...) because sample() treats a length-one numeric x as 1:x.
rest_id <- setdiff(id, train_id)
valid_id <- rest_id[sample.int(length(rest_id), floor(n * 0.2))]

# Test rows: everything left over, in shuffled order.
test_id <- setdiff(id, c(train_id, valid_id))
test_id <- test_id[sample.int(length(test_id))]

#-------------------------------------------------------

# Sets that keep x and y together are stored as data frames.
train <- anal1[train_id, ]
validation <- anal1[valid_id, ]

# Sets that keep x and y apart: x as a design matrix (intercept column
# removed), y as a numeric 0/1 vector.
XX <- model.matrix(kidney_yn ~ ., anal1)[, -1, drop = FALSE]
# model.matrix() drops rows containing NA by default; the row indices above are
# only valid if no row was dropped.
stopifnot(nrow(XX) == n)

x_train <- XX[train_id, , drop = FALSE]
y_train <- ifelse(train$kidney_yn == "YES", 1, 0)

x_validation <- XX[valid_id, , drop = FALSE]
y_validation <- ifelse(validation$kidney_yn == "YES", 1, 0)

### 3. 적합
> 로지스틱 정규화 모형은 X, y 매트릭스 형식  
> 랜덤포레스트, SVM, NN은 X, y가 같이 있는 데이터프레임 형식

In [7]:
# Fit six classifiers on the training set and store, for each one, the
# predicted probability of kidney_yn == "YES" on the validation set.
set.seed(1)

# Lasso (alpha = 1): 10-fold cross-validated penalized logistic regression,
# predicting at the "lambda.1se" penalty.
fit_l1 <- cv.glmnet(x_train, y_train, nfolds = 10, alpha = 1, family = "binomial")
yhat_l1 <- predict(fit_l1, s = "lambda.1se", newx = x_validation, type = "response")

# Elastic net (alpha = 0.5)
fit_l.5 <- cv.glmnet(x_train, y_train, nfolds = 10, alpha = 0.5, family = "binomial")
yhat_l.5 <- predict(fit_l.5, s = "lambda.1se", newx = x_validation, type = "response")

# Ridge (alpha = 0)
fit_l2 <- cv.glmnet(x_train, y_train, nfolds = 10, alpha = 0, family = "binomial")
yhat_l2 <- predict(fit_l2, s = "lambda.1se", newx = x_validation, type = "response")

# Random forest; the class-probability column is picked by name rather than by
# position so it is always the "YES" class.
fit_RF <- randomForest(kidney_yn ~ ., train)
yhat_RF <- predict(fit_RF, newdata = validation, type = "prob")[, "YES"]

# SVM with probability estimates. The columns of the "probabilities" attribute
# are not guaranteed to be in factor-level order, so the "YES" column is
# selected by name instead of taking column 1.
fit_SVM <- svm(kidney_yn ~ ., data = train, gamma = 0.001, cost = 10, probability = TRUE)
yhat_SVM <- attr(
  predict(fit_SVM, newdata = validation, probability = TRUE),
  "probabilities"
)[, "YES"]

# Neural network with one hidden layer of 10 units.
# NOTE(review): the recorded run stopped at the iteration limit ("stopped after
# 100 iterations"); consider raising maxit if convergence matters.
fit_NN <- nnet(kidney_yn ~ ., train, size = 10)
yhat_NN <- predict(fit_NN, newdata = validation, type = "raw")


# weights:  221
initial  value 1928.754702 
iter  10 value 1552.449084
iter  20 value 1529.381120
iter  30 value 1494.513490
iter  40 value 1472.187084
iter  50 value 1457.636536
iter  60 value 1437.105959
iter  70 value 1426.958487
iter  80 value 1422.342414
iter  90 value 1416.325869
iter 100 value 1408.325402
final  value 1408.325402 
stopped after 100 iterations


### 3. 평가 지표

#### 3.1 이항 편차

In [None]:
# Binomial deviance between observed outcomes and predicted probabilities.
#
# y_obs: observed outcomes (0/1).
# yhat:  predicted probabilities; values are clamped to [0.0001, 0.9999]
#        before taking logarithms so that log(0) never occurs.
# Returns twice the summed per-observation deviance contributions.
binomial_deviance <- function(y_obs, yhat) {
  eps <- 0.0001
  p_hat <- pmin(pmax(yhat, eps), 1 - eps)
  # Contribution of the "success" part; defined as 0 where y_obs is 0.
  pos_term <- ifelse(y_obs == 0, 0, y_obs * log(y_obs / p_hat))
  # Contribution of the "failure" part; defined as 0 where y_obs is 1.
  neg_term <- ifelse(y_obs == 1, 0, (1 - y_obs) * log((1 - y_obs) / (1 - p_hat)))
  2 * sum(pos_term + neg_term)
}
# Validation-set binomial deviance for every fitted model, in the same order
# as the ROC/AUC comparison: lasso, elastic net, ridge, random forest, SVM, NN.
binomial_deviance(y_validation, yhat_l1)
binomial_deviance(y_validation, yhat_l.5)
binomial_deviance(y_validation, yhat_l2)
binomial_deviance(y_validation, yhat_RF)
binomial_deviance(y_validation, yhat_SVM)
binomial_deviance(y_validation, yhat_NN)

#### 3.2 AUC, ROC curve

In [None]:
# ROCR prediction objects: validation-set scores paired with the 0/1 labels,
# one per model (lasso, elastic net, ridge, random forest, SVM, NN).
pred_l1 <- prediction(yhat_l1, y_validation)
pred_l.5 <- prediction(yhat_l.5, y_validation)
pred_l2 <- prediction(yhat_l2, y_validation)
pred_RF <- prediction(yhat_RF, y_validation)
pred_SVM <- prediction(yhat_SVM, y_validation)
pred_NN <- prediction(yhat_NN, y_validation)

# ROC curve coordinates for each model: true positive rate against
# false positive rate.
perf_l1 <- performance(pred_l1, measure = "tpr", x.measure = "fpr")
perf_l.5 <- performance(pred_l.5, measure = "tpr", x.measure = "fpr")
perf_l2 <- performance(pred_l2, measure = "tpr", x.measure = "fpr")
perf_RF <- performance(pred_RF, measure = "tpr", x.measure = "fpr")
perf_SVM <- performance(pred_SVM, measure = "tpr", x.measure = "fpr")
perf_NN <- performance(pred_NN, measure = "tpr", x.measure = "fpr")

In [None]:
# ROC curves of all six models on one plot; the dashed diagonal is the
# reference line y = x.
plot(perf_l1, col = 1, lwd = 2, lty = "solid", main = "ROC Curve for different predictive models")
plot(perf_l.5, col = 2, lwd = 2, add = TRUE)
plot(perf_l2, col = 3, lwd = 2, add = TRUE)
plot(perf_RF, col = 4, lwd = 2, add = TRUE)
plot(perf_SVM, col = 5, lwd = 2, add = TRUE)
plot(perf_NN, col = 6, lwd = 2, add = TRUE)
abline(0, 1, lty = "dashed")

# AUC of each model, extracted from the ROCR performance object.
A1 <- performance(pred_l1, "auc")@y.values[[1]]
A2 <- performance(pred_l.5, "auc")@y.values[[1]]
A3 <- performance(pred_l2, "auc")@y.values[[1]]
A4 <- performance(pred_RF, "auc")@y.values[[1]]
A5 <- performance(pred_SVM, "auc")@y.values[[1]]
A6 <- performance(pred_NN, "auc")@y.values[[1]]

# Model labels, right-padded to a common width.
L1 <- c("Lasso", "EN", "Ridge", "RF", "SVM", "NN")
L1 <- str_pad(L1, width = max(nchar(L1)), side = "right")

# AUC values rounded to four decimals; nsmall keeps four decimals in the label
# even when the rounded values have fewer significant digits.
L2 <- format(round(c(A1, A2, A3, A4, A5, A6), 4), nsmall = 4)

# Combined "label value" strings; not used by the legends below.
L <- paste(L1, L2, " ")

# Two side-by-side legends: coloured model names on the left, AUC values in
# parentheses on the right.
legend(0.55, 0.3, legend = L1, bty = "n", col = 1:6, lwd = 2)
legend(0.8, 0.3, legend = paste0("(", L2, ")"), bty = "n")

#### 3.3 변수 중요도

##### `-` 로지스틱 정규화: 계수

In [None]:
# Lasso coefficients at the "lambda.1se" penalty, the same penalty used for the
# validation predictions.
coef(fit_l1, s = "lambda.1se")

##### `-` 랜덤포레스트: MeanDecreaseGini

In [None]:
# Variable importance matrix of the fitted random forest, plus its standard plot.
feature_importance <- importance(fit_RF)
varImpPlot(fit_RF)

# Ten largest values of the first importance column, in decreasing order,
# rounded to four decimals (kept as a one-column matrix so row names show).
importance_rank <- order(-feature_importance[, 1])
top_importance <- feature_importance[importance_rank, 1, drop = FALSE]
head(round(top_importance, 4), n = 10)