In [1]:
import numpy as np
import pandas as pd

Load dataframe and set up column lists

In [2]:
train_df = pd.read_csv("./train_kor.csv", encoding="euc-kr")
test_df = pd.read_csv("./test_kor.csv", encoding="euc-kr")

In [3]:
train_df = train_df.sample(frac=1)

In [4]:
cols = np.array(["주야", "요일", "사망자수", "사상자수", "중상자수", "경상자수", "부상신고자수", "발생지시도", "발생지시군구", "사고유형_대분류", "사고유형_중분류", "법규위반","도로형태_대분류", "도로형태","당사자종별_1당_대분류", "당사자종별_2당_대분류"])
numeric_cols = np.array(['사망자수', '사상자수', '중상자수', '경상자수', '부상신고자수'])
categoric_cols = np.array(["주야", "요일", "발생지시도", "발생지시군구", "사고유형_대분류", "사고유형_중분류", "법규위반","도로형태_대분류", "도로형태","당사자종별_1당_대분류", "당사자종별_2당_대분류"])

In [5]:
train_df = train_df[cols]

Fit a LabelEncoder for categoric input

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
category_le = LabelEncoder()

In [8]:
train_df_values = np.ndarray.flatten(train_df[categoric_cols].values)
test_df_values = np.ndarray.flatten(test_df[categoric_cols].values)
all_values = np.concatenate((train_df_values, test_df_values))

In [9]:
def isnan(v):
    return isinstance(v, float) and np.isnan(v)

In [10]:
not_nan_flag = np.array(list(map(lambda x: not isnan(x), all_values)))

In [11]:
all_values = np.extract(not_nan_flag, all_values)

In [12]:
category_le.fit(all_values)

LabelEncoder()

Parse the test

In [13]:
def bool_arr_to_int(arr):
    return np.sum([(2 ** i if b else 0) for i, b in enumerate(arr)])

In [14]:
def in_out_cols(bit):
    in_cols, out_cols = [], []
    for i in range(len(cols)):
        if (bit & (2 ** i)) > 0:
            out_cols.append(cols[i])
        else:
            in_cols.append(cols[i])
    return in_cols, out_cols

In [15]:
test_df['Problem Type'] = test_df.apply(lambda x: bool_arr_to_int(map(isnan, x)), axis=1)

In [16]:
def prob(bit, cur_test_df):
    in_cols, out_cols = in_out_cols(bit)
    num_df = cur_test_df[np.intersect1d(in_cols, numeric_cols)]
    cat_df = cur_test_df[np.intersect1d(in_cols, categoric_cols)]
    return (num_df, cat_df, out_cols)

In [17]:
problems = [prob(bit, cur_test_df)
    for bit, cur_test_df in list(test_df.groupby('Problem Type'))]

In [18]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

Using TensorFlow backend.


In [19]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

In [20]:
# Returns the Sequence of prediction 
def predict(num_df, cat_df, predict_col):
    print(list(num_df.index))
    print(predict_col)
    
    cat_cols = cat_df.columns.values
    num_cols = num_df.columns.values
    train_cat_mat = train_df[cat_cols].values
    train_cat_mat = category_le.transform(train_cat_mat.ravel()).reshape(train_cat_mat.shape)
    cat_mat = cat_df.values
    cat_mat = category_le.transform(cat_mat.ravel()).reshape(cat_mat.shape)
    train_num_mat = train_df[num_cols].values
    num_mat = num_df.values
    
    cat_ohe = OneHotEncoder()
    cat_ohe.fit(np.concatenate((train_cat_mat, cat_mat), axis=0))
    train_cat_mat = cat_ohe.transform(train_cat_mat)
    cat_mat = cat_ohe.transform(cat_mat)
    
    trainX = np.concatenate((train_cat_mat.toarray(), train_num_mat), axis=1)
    testX = np.concatenate((cat_mat.toarray(), num_mat), axis=1)
    trainY = train_df[predict_col]
    
    if predict_col in numeric_cols:
        model = Sequential()
        model.add(Dense(500, input_shape=(trainX.shape[1],), init="normal", activation="relu"))
        model.add(Dense(500, init="normal", activation="relu"))
        model.add(Dense(500, init="normal", activation="relu"))
        model.add(Dense(1, init="normal", activation="relu"))

        model.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

        model.fit(trainX[:], trainY[:], epochs=3, batch_size=20, verbose=0)
        
        prediction = model.predict(testX)
        for i, idx in enumerate(cat_df.index):
            # Set the predicted value to test data frame
            test_df.set_value(idx, predict_col, np.round(prediction[i, 0]))
    else:
        trainY = category_le.transform(trainY)
        lb = LabelBinarizer()
        trainY = lb.fit_transform(trainY)
        
        model = Sequential()
        model.add(Dense(500, input_shape=(trainX.shape[1],), init="normal", activation="relu"))
        model.add(Dense(500, init="normal", activation="relu"))
        model.add(Dense(500, init="normal", activation="relu"))
        model.add(Dense(trainY.shape[1], init="normal", activation="softmax"))

        if (trainY.shape[1] == 1):
            model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])            
        else:
            model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
        
        model.fit(trainX[:], trainY[:], epochs=3, batch_size=20, verbose=0)
        
        prediction = model.predict(testX)
        prediction = category_le.inverse_transform(lb.inverse_transform(prediction))
        for i, idx in enumerate(cat_df.index):
            # Set the predicted value to test data frame
            test_df.set_value(idx, predict_col, prediction[i])

In [21]:
total_probs = 0
for num_df, cat_df, out_cols in problems:
    total_probs += len(num_df.index) * len(out_cols)

In [22]:
cnt = 0
for num_df, cat_df, out_cols in problems:
    for out_col in out_cols:
        cnt += len(out_cols)
        print(str(cnt) + "/" + str(total_probs))
        predict(num_df, cat_df, out_col)

3/157
[7, 8, 9]
사망자수




6/157
[7, 8, 9]
사상자수
9/157
[7, 8, 9]
중상자수
12/157
[0, 1]
사망자수
15/157
[0, 1]
사상자수
18/157
[0, 1]
경상자수
21/157
[4, 5, 6]
사상자수
24/157
[4, 5, 6]
중상자수
27/157
[4, 5, 6]
경상자수
30/157
[2, 3]
사상자수
33/157
[2, 3]
중상자수
36/157
[2, 3]
부상신고자수
39/157
[30, 31]
사망자수




42/157
[30, 31]
사상자수
45/157
[30, 31]
발생지시군구




48/157
[32, 33, 34]
중상자수
51/157
[32, 33, 34]
경상자수
54/157
[32, 33, 34]
발생지시군구
56/157
[35, 36, 37, 38, 39]
발생지시도




58/157
[35, 36, 37, 38, 39]
발생지시군구
61/157
[40, 41]
요일




64/157
[40, 41]
사고유형_대분류




67/157
[40, 41]
사고유형_중분류




70/157
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
사고유형_대분류




73/157
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
사고유형_중분류
76/157
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
법규위반




80/157
[42, 43, 44]
요일




84/157
[42, 43, 44]
사고유형_중분류
88/157
[42, 43, 44]
법규위반
92/157
[42, 43, 44]
도로형태_대분류




96/157
[47, 48, 49]
주야




100/157
[47, 48, 49]
사상자수




104/157
[47, 48, 49]
중상자수
108/157
[47, 48, 49]
도로형태




112/157
[45, 46]
주야




116/157
[45, 46]
사망자수




120/157
[45, 46]
사상자수
124/157
[45, 46]
당사자종별_1당_대분류




127/157
[20, 21, 22]
도로형태_대분류




130/157
[20, 21, 22]
도로형태
133/157
[20, 21, 22]
당사자종별_1당_대분류
136/157
[23, 24, 25]
도로형태_대분류




139/157
[23, 24, 25]
도로형태
142/157
[23, 24, 25]
당사자종별_2당_대분류




146/157
[26, 27, 28, 29]
도로형태_대분류




150/157
[26, 27, 28, 29]
도로형태
154/157
[26, 27, 28, 29]
당사자종별_1당_대분류
158/157
[26, 27, 28, 29]
당사자종별_2당_대분류


Parse the result

In [39]:
result_df = pd.read_csv("./result_kor.csv", encoding="euc-kr")

In [40]:
def ans(row, col):
    return test_df.at[row - 2, cols[ord(col) - ord('A')]]

In [41]:
ans_list = np.array(list(map(lambda a: ans(a[0], a[1]), result_df.values)))

In [42]:
result_df['값'] = ans_list

In [43]:
result_df.to_csv('./result_kor.csv', encoding="euc-kr", index=False)

In [44]:
result_df = pd.read_csv("./result_kor.csv", encoding="euc-kr")

In [45]:
result_df

Unnamed: 0,행,열,값
0,2,C,1.0
1,2,D,2.0
2,2,F,0.0
3,3,C,1.0
4,3,D,1.0
5,3,F,0.0
6,4,D,1.0
7,4,E,0.0
8,4,G,0.0
9,5,D,4.0
