In [1]:
# data load
import warnings
warnings.filterwarnings('ignore')
import json
import pickle
import datetime

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

# Seed passed as random_state to the forests and to train_test_split.
RANDOM_SEED = 2018

# Every CSV is read with the same encoding and parser engine.
_CSV_OPTIONS = {'encoding': 'euc-kr', 'engine': 'python'}

ori_data = pd.read_csv('dataset_kor/교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv', **_CSV_OPTIONS)
val_data = pd.read_csv('val_refine.csv', **_CSV_OPTIONS)
test_data = pd.read_csv('test_kor.csv', **_CSV_OPTIONS)

# Training rows with no missing value in any column.
train_data = ori_data.dropna()

# Comma-separated column names; the backslash continuations leave padding
# inside the string, which is stripped from each name below.
_target_col_text = '사상자수, 사망자수, 중상자수, 경상자수, 부상신고자수, 주야, 요일, 발생지시도, 발생지시군구, \
                   사고유형_대분류, 사고유형_중분류, 법규위반, 도로형태_대분류, 도로형태, \
                   당사자종별_1당_대분류, 당사자종별_2당_대분류'
target_col_list = list(map(str.strip, _target_col_text.split(',')))

In [91]:
# local function

from math import exp

def official_numerical_score(pred, real, B=1, s=1):
    """Return ``B`` times the summed Gaussian-style closeness of two sequences.

    Each pair ``(n, m)`` taken from ``zip(pred, real)`` contributes
    ``exp(-((n - m) / s) ** 2)``: 1.0 for an exact match, decaying towards 0
    as the difference grows relative to ``s``.

    Args:
        pred: iterable of predicted numbers.
        real: iterable of reference numbers, paired positionally with ``pred``.
        B: multiplier applied to the total.
        s: scale dividing each difference before squaring.

    Returns:
        The weighted total (``0`` when there are no pairs and ``B`` is 1).
    """
    total = 0
    for predicted, actual in zip(pred, real):
        scaled_diff = (predicted - actual) / s
        total += exp(-(scaled_diff ** 2))
    return B * total

def official_categorical_score(pred, real, C=1):
    """Return ``C`` times the number of positions where ``pred`` equals ``real``.

    Args:
        pred: iterable of predicted labels.
        real: iterable of reference labels, paired positionally with ``pred``.
        C: multiplier applied to the match count.

    Returns:
        ``C * matches``; ``0`` when there are no pairs.
    """
    matches = 0
    for predicted, actual in zip(pred, real):
        if predicted == actual:
            matches += 1
    return C * matches

def refine_val(df, target_col, dependent_col_list):
    """Keep the rows whose target columns are missing and dependent columns are present.

    Args:
        df: DataFrame to filter (not modified; a filtered frame is returned).
        target_col: a column name, or a list of column names, that must all
            be missing (NaN) in a row for it to be kept.
        dependent_col_list: column names that must all be non-missing in a
            row for it to be kept.

    Returns:
        The filtered DataFrame, keeping the original index labels.
    """
    # The filter is driven by the argument, not by a module-level
    # ``target_col_list``, so the function no longer depends on whatever the
    # caller happens to have bound to that global name.
    target_cols = [target_col] if isinstance(target_col, str) else list(target_col)

    for t_col in target_cols:
        # to_numpy() gives a positional mask, so duplicate index labels are harmless.
        df = df[df[t_col].isna().to_numpy()]
    for d_col in dependent_col_list:
        df = df[df[d_col].notna().to_numpy()]
    return df

In [92]:
# template function
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def preprocessing(data):
    """One-hot encode every column of ``data`` that holds string values.

    A column counts as a string column when its first non-missing value is a
    ``str``. Looking past leading NaNs (instead of only at row 0) keeps the
    detected columns in step with what is actually encoded, and an empty
    frame no longer raises IndexError.

    Args:
        data: DataFrame of features.

    Returns:
        A new DataFrame in which each string column is replaced by dummy
        columns named ``<column>_<value>``; other columns are kept as-is.
    """
    onehot_col = []

    for col in data.columns:
        present = data[col].dropna()
        # iloc is positional, so a duplicated index cannot return several rows.
        if len(present) > 0 and isinstance(present.iloc[0], str):
            onehot_col.append(col)

    if not onehot_col:
        # Nothing to encode; still hand back a fresh frame like get_dummies does.
        return data.copy()

    # Passing `columns` explicitly guarantees `prefix` has one entry per
    # encoded column; relying on dtype auto-selection raised ValueError when
    # the two disagreed.
    return pd.get_dummies(data, columns=onehot_col, prefix=onehot_col)

class CustomRegModel:
    """Random-forest regressor wrapper with an optional grid search.

    Attributes set by the methods:
        model: the estimator, or the GridSearchCV wrapping it after a tuned
            ``train`` call.
        model_tmp: the bare estimator handed to the grid search.
        predReal: raw predictions from the last ``predict`` call.
        pred: those predictions rounded with ``round``.
        ev: the dict returned by the last ``evaluation`` call.
    """
    def __init__(self):
        self.model = RandomForestRegressor(n_estimators=20, max_depth=5, random_state=RANDOM_SEED)

    def train(self, X, y, params=None):
        """Fit on ``(X, y)``; if ``params`` is given, grid-search it with 5-fold CV.

        The search is scored with ``official_numerical_score`` (higher is better).
        """
        if params is not None:
            # Unwrap a previous search so repeated tuned calls do not nest
            # one GridSearchCV inside another.
            if isinstance(self.model, GridSearchCV):
                self.model_tmp = self.model.estimator
            else:
                self.model_tmp = self.model
            # `needs_proba` is omitted: False was its default, and the
            # argument is deprecated in recent scikit-learn releases.
            self.model = GridSearchCV(self.model_tmp, params, cv=5,
                                      scoring=make_scorer(official_numerical_score,
                                                          greater_is_better=True))
        self.model.fit(X, y)

    def predict(self, X):
        """Predict for ``X`` and return the predictions rounded to integers."""
        self.predReal = self.model.predict(X)
        self.pred = [round(x) for x in self.predReal]
        return self.pred

    def evaluation(self, test_y, pred):
        """Return a dict with the MSE and the official numerical score."""
        self.ev = {}
        self.ev['MSE'] = mean_squared_error(test_y, pred)
        # Arguments follow the helper's (pred, real) order; the score itself
        # is symmetric, so the value is unchanged.
        self.ev['Official Numerical Score'] = official_numerical_score(pred, test_y)
        return self.ev

class CustomClfModel:
    """Random-forest classifier wrapper with an optional grid search.

    Attributes set by the methods:
        model: the estimator, or the GridSearchCV wrapping it after a tuned
            ``train`` call.
        model_tmp: the bare estimator handed to the grid search.
        pred: predictions from the last ``predict`` call.
        ev: the dict returned by the last ``evaluation`` call.
    """
    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=20, max_depth=5, random_state=RANDOM_SEED)

    def train(self, X, y, params=None):
        """Fit on ``(X, y)``; if ``params`` is given, grid-search it with 5-fold CV.

        The search is scored with ``official_categorical_score`` (higher is better).
        """
        if params is not None:
            # Unwrap a previous search so repeated tuned calls do not nest
            # one GridSearchCV inside another.
            if isinstance(self.model, GridSearchCV):
                self.model_tmp = self.model.estimator
            else:
                self.model_tmp = self.model
            # `needs_proba` is omitted: False was its default, and the
            # argument is deprecated in recent scikit-learn releases.
            self.model = GridSearchCV(self.model_tmp, params, cv=5,
                                      scoring=make_scorer(official_categorical_score,
                                                          greater_is_better=True))
        self.model.fit(X, y)

    def predict(self, X):
        """Predict labels for ``X`` and return them."""
        self.pred = self.model.predict(X)
        return self.pred

    def evaluation(self, test_y, pred):
        """Return a dict with the accuracy and the official categorical score."""
        self.ev = {}
        self.ev['ACC'] = accuracy_score(test_y, pred)
        # Arguments follow the helper's (pred, real) order; the match count
        # is symmetric, so the value is unchanged.
        self.ev['Official Categorical Score'] = official_categorical_score(pred, test_y)
        return self.ev

In [None]:
%%time

# main
# Build one model specification per distinct (target column, available
# columns) pattern found in the test rows.
# model_meta_dict maps "<target>:<dep1>/<dep2>/..." to a dict describing that model.
model_meta_dict = {}
# Keys already handled, so each pattern is registered only once.
col_record = {}

for _, row in test_data.iterrows():
    # Split this row's columns: NaN cells are what must be predicted,
    # everything else can be used as a feature.
    target_col_list = []
    dependent_col_list = []
    for key, value in row.items():
        if isinstance(value, float) and np.isnan(value):
            target_col_list.append(key)
        else:
            dependent_col_list.append(key)
    
    for t_col in target_col_list:
        col_key = t_col + ':' + '/'.join(dependent_col_list)
        # Skip patterns that an earlier row already registered.
        if col_record.get(col_key):
            continue
        model_dict = {}
        model_dict['target_col'] = t_col
        model_dict['dependent_col_list'] = dependent_col_list

        if val_data is not None:
            # Record the index of the test rows that refine_val keeps for this
            # row's target/dependent column split.
            tmp_test_data = test_data.copy()
            tmp_test_data = refine_val(tmp_test_data, target_col_list, dependent_col_list)
            model_dict['val_data_loc'] = tmp_test_data.index
    
        model_meta_dict[col_key] = model_dict
        col_record[col_key] = 1

# Train one model per entry of model_meta_dict, evaluate it on a held-out
# split of the training data ("val1") and, when available, on the matching
# validation rows ("val2"). Per-model results are collected in save_dict and
# the fitted wrapper is stored back into model_meta_dict[key]['model'].
save_dict = {}
# Running totals of the "Official ..." score over all models. Both are always
# initialised so later cells can read them even when val_data is None.
loss_all_val1 = 0
loss_all_val2 = 0

# Grid handed to model.train(); set to None to skip the search and fit the
# default forest directly.
tuned_parameters = {'n_estimators' : [3, 10, 30, 100, 300, 1000]}

for i, model_dict_key in enumerate(model_meta_dict):
    data = ori_data.copy()
    model_dict = model_meta_dict[model_dict_key]
    target_col = model_dict['target_col']
    dependent_col_list = model_dict['dependent_col_list']
    val_data_loc = model_dict.get('val_data_loc')

    # Reset every iteration so the name is always bound and a frame from a
    # previous model can never leak into this one.
    tmp_val_data = None
    if val_data is not None and val_data_loc is not None:
        # .copy() so the 'val' marker added below is not written into a slice
        # of val_data.
        tmp_val_data = val_data.iloc[val_data_loc].copy()

    result = {
        'target_col' : target_col,
        'dependent_col' : dependent_col_list
    }
    print(str(i+1) + '/' + str(len(model_meta_dict)))
    print('key : ', model_dict_key)
    print('loc : ', None if val_data_loc is None else val_data_loc.values)
    print('tar_col : ', target_col)
    print('dep_col : ', dependent_col_list)

    # Tag rows so training and validation data can be encoded together (same
    # dummy columns) and separated again afterwards.
    data['val'] = 0

    if tmp_val_data is not None:
        tmp_val_data['val'] = 1
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        data = pd.concat([data, tmp_val_data])

    X = data[dependent_col_list + ['val']]
    y = data[[target_col, 'val']]

    X = preprocessing(X)

    if tmp_val_data is not None:
        val_X = X[X['val'] == 1]
        val_y = y[y['val'] == 1]
        X = X[X['val'] == 0]
        y = y[y['val'] == 0]

        val_X = val_X.drop(['val'], axis=1)
        val_y = val_y[target_col].values

    X = X.drop(['val'], axis=1)
    y = y[target_col].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM_SEED)

    # String targets get a classifier, everything else a regressor.
    if isinstance(y_test[0], str): # clf
        model = CustomClfModel()
    else:
        model = CustomRegModel()
    model.train(X_train, y_train, tuned_parameters)
    if tuned_parameters is not None:
        # best_params_ only exists when train() wrapped the model in a grid search.
        print('best parameter : ', model.model.best_params_)

    pred = model.predict(X_test)
    # evaluation() takes (test_y, pred).
    val_eval = model.evaluation(y_test, pred)
    print('val1 : ', val_eval)
    result['val_1'] = val_eval
    # Accumulate the official score, mirroring loss_all_val2 below, instead of
    # overwriting the total with the last model's dict.
    official_score_key = [v for v in val_eval if 'Official' in v][0]
    loss_all_val1 += val_eval[official_score_key]

    if tmp_val_data is not None:
        val_pred = model.predict(val_X)
        val_eval = model.evaluation(val_y, val_pred)
        print('val2 : ', val_eval)
        result['val_2'] = val_eval
        official_score_key = [v for v in val_eval if 'Official' in v][0]
        loss_all_val2 += val_eval[official_score_key]
        print('val2 shape : ', tmp_val_data.shape)

    print()
    save_dict[i] = result

    model_meta_dict[model_dict_key]['model'] = model

# Persist the run: a JSON summary of every model's scores and a pickle of the
# trained models, both named "<val2 total> <timestamp>" under result/.
save_dict['loss_all_val1'] = loss_all_val1
save_dict['loss_all_val2'] = loss_all_val2

now_string = str(datetime.datetime.now())
# ':' is not allowed in file names on some platforms.
now_string = now_string.replace(':', '-')
# The zero-padded score prefix makes the files sort by validation score.
now_string = str(round(loss_all_val2, 4)).zfill(8) + ' ' + now_string
with open('result/' + now_string + '.json', 'w') as f:
    json.dump(save_dict, f)
with open('result/' + now_string + '.p', 'wb') as f:
    # Dump the whole model_meta_dict: `model_dict` is only the entry left
    # over from the last loop iteration, so pickling it kept a single model.
    pickle.dump(model_meta_dict, f)

1/48
key :  사망자수:주야/요일/중상자수/부상신고자수/발생지시도/발생지시군구/사고유형_대분류/사고유형_중분류/법규위반/도로형태_대분류/도로형태/당사자종별_1당_대분류/당사자종별_2당_대분류
loc :  [0 1]
tar_col :  사망자수
dep_col :  ['주야', '요일', '중상자수', '부상신고자수', '발생지시도', '발생지시군구', '사고유형_대분류', '사고유형_중분류', '법규위반', '도로형태_대분류', '도로형태', '당사자종별_1당_대분류', '당사자종별_2당_대분류']


In [None]:
# check save file
# Read both files written by the save step back in, to confirm they load.
for _suffix, _mode in (('.json', 'r'), ('.p', 'rb')):
    with open('result/' + now_string + _suffix, _mode) as f:
        if _suffix == '.json':
            json_ = json.load(f)
        else:
            # pickle.load can execute arbitrary code: only load files this
            # notebook produced.
            model_ = pickle.load(f)

In [None]:
# Debug helper: fetch the stored 'val_data_loc' index for one specific
# target/dependent-column key so the matching rows can be inspected.
debug_log = model_meta_dict['사고유형_대분류:주야/요일/사망자수/사상자수/중상자수/경상자수/부상신고자수/발생지시도/발생지시군구/도로형태_대분류/도로형태/당사자종별_1당_대분류/당사자종별_2당_대분류']['val_data_loc']

In [29]:
# Show the test rows at the positions held in debug_log.
test_data.iloc[debug_log]

Unnamed: 0,주야,요일,사망자수,사상자수,중상자수,경상자수,부상신고자수,발생지시도,발생지시군구,사고유형_대분류,사고유형_중분류,법규위반,도로형태_대분류,도로형태,당사자종별_1당_대분류,당사자종별_2당_대분류
10,주간,수,1.0,2.0,1.0,0.0,0.0,경남,의령군,,,,교차로,교차로내,이륜차,화물차
11,야간,화,2.0,3.0,1.0,0.0,0.0,대전,중구,,,,단일로,기타단일로,원동기장치자전거,없음
12,야간,화,1.0,1.0,0.0,0.0,0.0,대구,북구,,,,단일로,기타단일로,승용차,화물차
13,야간,화,1.0,1.0,0.0,0.0,0.0,울산,울주군,,,,단일로,기타단일로,승용차,보행자
14,주간,화,1.0,5.0,1.0,3.0,0.0,경기,수원시,,,,단일로,교량위,화물차,승용차
15,주간,화,1.0,1.0,0.0,0.0,0.0,전북,완주군,,,,교차로,교차로횡단보도내,승합차,보행자
16,주간,화,1.0,6.0,3.0,2.0,0.0,경남,거창군,,,,교차로,교차로내,승용차,승합차
17,야간,화,1.0,1.0,0.0,0.0,0.0,충남,천안시,,,,단일로,기타단일로,승용차,보행자
18,주간,화,1.0,11.0,2.0,8.0,0.0,충남,서산시,,,,단일로,기타단일로,화물차,승용차
19,주간,화,1.0,1.0,0.0,0.0,0.0,전남,곡성군,,,,교차로,교차로내,승합차,보행자
