In [1]:
require(xgboost)
require(Matrix)
require(data.table)
require(CatEncoders)
require(caret)

Loading required package: xgboost
Loading required package: Matrix
Loading required package: data.table
Loading required package: CatEncoders

Attaching package: ‘CatEncoders’

The following object is masked from ‘package:base’:

    transform

Loading required package: caret
Loading required package: lattice
Loading required package: ggplot2


In [2]:
# Fix the RNG seed so the random train/validation split drawn later is
# reproducible.
set.seed(777)

# Columns kept from the raw training file. The position of a column in `cols`
# is reused further down as a bit index / answer index when model file names
# are built, so the order must not change.
cols <- c("주야", "요일", "사망자수", "사상자수", "중상자수", "경상자수",
          "부상신고자수", "발생지시도", "발생지시군구", "사고유형_대분류",
          "사고유형_중분류", "법규위반", "도로형태_대분류", "도로형태",
          "당사자종별_1당_대분류", "당사자종별_2당_대분류")
# Columns predicted by regression.
numeric_cols <- c("사망자수", "사상자수", "중상자수", "경상자수", "부상신고자수")
# Columns predicted by classification (label-encoded).
categoric_cols <- c("주야", "요일", "발생지시도", "발생지시군구", "사고유형_대분류",
                    "사고유형_중분류", "법규위반", "도로형태_대분류", "도로형태",
                    "당사자종별_1당_대분류", "당사자종별_2당_대분류")

# stringsAsFactors = TRUE is spelled out because the rest of the script calls
# levels()/nlevels() on the categorical columns, and since R 4.0.0 read.csv()
# no longer converts strings to factors by default.
trainData <- read.csv(file = "./train_kor.csv", header = TRUE,
                      fileEncoding = "cp949", stringsAsFactors = TRUE)
trainData <- trainData[cols]

In [3]:
# Registry of fitted label encoders, keyed by categorical column name.
labelEncoder <- list()

In [4]:
# Fit one label encoder per categorical column of the training data.
# `[[<-` is used because the encoder is an S4 object: storing it with
# single-bracket assignment relies on implicit list embedding, which R reports
# as deprecated.
for (catName in categoric_cols) {
    labelEncoder[[catName]] <- LabelEncoder.fit(trainData[, catName])
}

“implicit list embedding of S4 objects is deprecated”

In [5]:
# Build the default path under ./models/ for the model that predicts
# `guessCol` when the columns in `dropCols` are unavailable.
#
# The name has the form "t<mask>a<answer>.model":
#   <mask>   sums 2^(i - 1) over every position i of `trainData` whose column
#            is listed in `dropCols` or equals `guessCol`;
#   <answer> is the position of `guessCol` among the columns of `trainData`.
#
# dropCols: character vector of column names.
# guessCol: name of the column to predict.
# Returns a single character string.
defaultFileName <- function(dropCols, guessCol) {
    allNames <- colnames(trainData)
    excluded <- union(dropCols, c(guessCol))
    hitPositions <- which(is.element(allNames, excluded))
    probType <- sum(2^(hitPositions - 1))
    ansType <- match(guessCol, allNames)
    paste0("./models/", "t", probType, "a", ansType, ".model")
}

In [6]:
# Fraction of rows held out for validation when train() draws its own split.
trainDataValidationSep <- 0.15

# Train an xgboost model that predicts `guessCol` from the columns that remain
# after `dropCols` are removed, save it to `fileName`, and evaluate it on the
# validation rows.
#
# dropCols:   names of columns that must not be used as features
#             (`guessCol` itself is ignored if listed).
# guessCol:   column to predict; regression if it is in `numeric_cols`,
#             multi-class classification otherwise.
# fileName:   where the booster is saved.
# data.train, data.val: optional data frames. Supply both or neither; with
#             neither, the global `trainData` is split at random using
#             `trainDataValidationSep`.
# nrounds, max_depth, eta, subsample, colsample_bytree, verbose:
#             passed to xgb.train().
# alpha, lambda: these used to be accepted but never passed to xgb.train().
#             They are now forwarded when the caller sets them explicitly;
#             calls that leave them out keep xgboost's own regularisation
#             defaults, exactly as before (the defaults in this signature are
#             therefore not applied on their own).
#
# Returns list(result = <pred/real data frame>, rmse = ...) for regression,
# list(result = ..., acc = ...) for classification, or NULL (after a message)
# when exactly one of data.train / data.val is supplied.
train <- function(dropCols, guessCol, 
                   fileName = defaultFileName(dropCols, guessCol),
                   data.train=NULL, data.val=NULL,
                   nrounds=1000, max_depth=5, eta=0.07, alpha=0.0, lambda=0.0,
                   subsample=1.0, colsample_bytree=1.0, verbose=0) {

    # The target has to stay in the data even if the caller lists it as dropped.
    dropCols <- dropCols[dropCols != guessCol]

    if (is.null(data.train) && is.null(data.val)) {
        sample.ind <- sample(2, nrow(trainData), replace = TRUE,
                             prob = c(1.0 - trainDataValidationSep,
                                      trainDataValidationSep))
        data.train <- trainData[sample.ind == 1, ]
        data.val <- trainData[sample.ind == 2, ]
    } else if (is.null(data.train) || is.null(data.val)) {
        message("Error: Exactly one of data.train and data.val is NULL")
        return(NULL)
    }

    for (dropCol in dropCols) {
        data.train[dropCol] <- NULL
        data.val[dropCol] <- NULL
    }

    # One formula for both design matrices; [, -1] removes the intercept column.
    modelFormula <- as.formula(paste(guessCol, ".", sep = " ~ "))
    trainX <- sparse.model.matrix(modelFormula, data = data.train)[, -1]
    valX <- sparse.model.matrix(modelFormula, data = data.val)[, -1]

    isRegression <- is.element(guessCol, numeric_cols)

    # Arguments that differ between the two objectives.
    extraArgs <- list()
    if (!missing(alpha)) {
        extraArgs$alpha <- alpha
    }
    if (!missing(lambda)) {
        extraArgs$lambda <- lambda
    }

    if (isRegression) {
        trainY <- data.train[, guessCol]
        valY <- data.val[, guessCol]
        # NOTE(review): newer xgboost releases call this objective
        # "reg:squarederror" -- confirm against the installed version.
        extraArgs$objective <- "reg:linear"
    } else {
        # The encoder yields 1-based codes; xgboost expects 0-based labels.
        trainY <- transform(labelEncoder[[guessCol]], data.train[, guessCol]) - 1
        valY <- transform(labelEncoder[[guessCol]], data.val[, guessCol]) - 1
        allTargets <- rbind(data.train[guessCol], data.val[guessCol])
        extraArgs$num_class <- nlevels(allTargets[, guessCol])
        extraArgs$objective <- "multi:softmax"
    }

    dtrain <- xgb.DMatrix(data = trainX, label = trainY)
    dval <- xgb.DMatrix(data = valX, label = valY)
    watchlist <- list(val = dval)

    # Shared xgb.train() call; only the objective-specific arguments vary.
    fitBooster <- function(...) {
        xgb.train(data = dtrain, watchlist = watchlist,
                  max_depth = max_depth,
                  eta = eta, nthread = 4, nrounds = nrounds,
                  subsample = subsample,
                  colsample_bytree = colsample_bytree,
                  early_stopping_rounds = 5,
                  verbose = verbose,
                  ...)
    }
    bst <- do.call(fitBooster, extraArgs)

    xgb.save(bst, fileName)

    predValY <- predict(bst, valX)
    if (isRegression) {
        resDf <- data.frame(pred = predValY, real = valY)
        return(list(result = resDf,
                    rmse = mean((resDf$pred - resDf$real)^2)^0.5))
    }

    # Map the 0-based predictions back to the original category labels.
    resDf <- data.frame(
        pred = inverse.transform(labelEncoder[[guessCol]], predValY + 1),
        real = inverse.transform(labelEncoder[[guessCol]], valY + 1)
    )
    list(result = resDf, acc = sum(predValY == valY) / nrow(resDf))
}

In [7]:
# Predict `guessCol` for the rows of `data` with a previously trained booster.
#
# dropCols: names of columns to remove before building the feature matrix.
# guessCol: column being predicted; it is removed from the features as well.
# data:     data frame of rows to predict for.
# bst:      model object passed to predict().
#
# Returns the raw predictions when `guessCol` is in `numeric_cols`; otherwise
# the predictions shifted by +1 and mapped back through the label encoder
# stored for `guessCol`.
use <- function(dropCols, guessCol, data, bst) {
    for (unwanted in c(dropCols, guessCol)) {
        data[unwanted] <- NULL
    }

    # [, -1] removes the intercept column of the design matrix.
    testX <- sparse.model.matrix(~ ., data = data)[, -1]
    predTestY <- predict(bst, testX)

    if (is.element(guessCol, numeric_cols)) {
        return(predTestY)
    }
    inverse.transform(labelEncoder[[guessCol]], predTestY + 1)
}

Use the trained models

In [8]:
# Load the test set; empty cells and the literal string "NA" are both read as
# missing values.
testData <- read.csv(
    file = "./final_test.csv",
    header = TRUE,
    fileEncoding = "cp949",
    na.strings = c("", "NA")
)

In [9]:
# Return `x` unchanged, except that any NA (of whatever type) is returned as
# the plain logical NA. Expects a single value.
isna <- function(x) {
    if (is.na(x)) NA else x
}

In [10]:
# Re-code every categorical test column as a factor that carries exactly the
# levels of the matching training column, so both data sets share one coding.
# Values that are missing or not among the training levels become NA.
for (catName in categoric_cols) {
    testData[[catName]] <- factor(as.character(testData[[catName]]),
                                  levels = levels(trainData[[catName]]))
}

In [11]:
# Working copy that receives the imputed values; it is taken here, before the
# helper column `type` is added to testData.
ansData <- testData

In [12]:
# Encode each row's pattern of missing values as a bitmask: column l
# contributes 2^(l - 1) to `type` when its value is NA.
naFlags <- is.na(testData)
bitWeights <- 2^(seq_len(ncol(testData)) - 1)
testData$type <- as.vector(naFlags %*% bitWeights)

In [13]:
# Split the test rows into one data frame per missing-value pattern; the
# elements are named after the `type` value.
testDataByType <- by(testData, testData[, "type"], function(x) x)

In [14]:
# Impute the missing columns of every missing-value pattern in turn.
for (typeStr in names(testDataByType)) {
    probData <- testDataByType[[typeStr]]
    type <- as.integer(typeStr)

    # Decode the bitmask: bit (i - 1) set means column cols[i] is missing.
    bitIsSet <- (type %/% 2^(seq_along(cols) - 1)) %% 2 == 1
    dropCols <- cols[bitIsSet]

    for (guessCol in dropCols) {
        if (guessCol == "발생지시군구") {
            # No model is used for this column in this loop; it stays NA here.
            next
        }

        # Train on demand: a model is only fitted when its file is absent.
        fileName <- paste0("./models/", "t", typeStr, "a",
                           match(guessCol, cols), ".model")
        if (!file.exists(fileName)) {
            train(dropCols, guessCol)
        }
        bst <- xgb.load(fileName)

        curData <- probData
        curData$type <- NULL

        if (nrow(probData) == 1) {
            # A lone row is duplicated before predicting and only the first
            # prediction is kept.
            curData <- rbind(curData, curData)
            probData[, guessCol] <- head(use(dropCols, guessCol, curData, bst), 1)
        } else {
            probData[, guessCol] <- use(dropCols, guessCol, curData, bst)
        }
    }

    # Copy the filled rows back in one assignment, matched by row name.
    filledRows <- probData
    filledRows$type <- NULL
    ansData[rownames(probData), ] <- filledRows

    message(paste("type", type, "done"))
}

type 28 done
type 44 done
type 88 done
type 2432 done
type 3085 done
type 12288 done
type 14336 done
type 16396 done
type 16412 done
type 16712 done
type 18032 done
type 18816 done
type 19456 done
type 19468 done
type 32781 done
type 32880 done
type 35952 done
type 36352 done
type 49180 done
type 50688 done


In [15]:
# For every value of "발생지시도" in the training data, look up the most
# frequent value of "발생지시군구" among the rows that share it.
td <- trainData[c("발생지시도", "발생지시군구")]
jisidoGuess <- by(td, td[, "발생지시도"], function(x) {
    counts <- table(x$발생지시군구)
    names(sort(counts, decreasing = TRUE)[1])
})

In [16]:
# Fill every still-missing "발생지시군구" with the most frequent training value
# for the row's "발생지시도".
# The lookup goes through the label as a string: indexing `jisidoGuess` with
# the factor itself would use its integer code, which is only right while both
# objects happen to share the same level order.
# Rows whose "발생지시도" is missing or has no entry in `jisidoGuess` are left NA.
knownSido <- names(jisidoGuess)
for (i in which(is.na(ansData[, "발생지시군구"]))) {
    sido <- as.character(ansData[i, "발생지시도"])
    if (!is.na(sido) && sido %in% knownSido) {
        guess <- jisidoGuess[[sido]]
        if (!is.null(guess)) {
            ansData[i, "발생지시군구"] <- guess
        }
    }
}

Write the answer to file

In [20]:
# Load the answer template; empty cells and the literal string "NA" are both
# read as missing values.
result <- read.csv(
    file = "./final_result.csv",
    header = TRUE,
    fileEncoding = "cp949",
    na.strings = c("", "NA")
)

In [21]:
# Display the answer template that was just loaded.
result

row,column,예측값
2,E,
2,F,
2,G,
2,J,
2,K,
2,O,
3,K,
3,L,
3,O,
4,E,


In [22]:
# Fill the answer template from the imputed data, one requested cell per row.
# seq_len() keeps the loop from running at all when the template has no rows.
for (i in seq_len(nrow(result))) {
    # Template rows are offset by one from the rows of ansData
    # (presumably because row 1 of the sheet is the header -- confirm).
    dataRow <- result[i, "row"] - 1
    # Column letter to 1-based column index: "A" -> 1, "B" -> 2, ...
    dataCol <- utf8ToInt(as.character(result[i, "column"])) - utf8ToInt("A") + 1
    if (3 <= dataCol && dataCol <= 7) {
        # Columns 3 to 7 are written as rounded numbers.
        result[i, "예측값"] <- round(ansData[dataRow, dataCol])
    } else {
        result[i, "예측값"] <- as.character(ansData[dataRow, dataCol])
    }
}

In [25]:
# Write the completed answer table to disk, without a row-name column.
write.csv(
    result,
    file = "./result.csv",
    row.names = FALSE
)