In [35]:
# data load
import warnings
warnings.filterwarnings('ignore')
import json
import pickle
import datetime

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

RANDOM_SEED = 2018
n_core = 4

ori_data = pd.read_csv('dataset_kor/교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv', encoding='euc-kr', engine='python')
val_data = pd.read_csv('val_refine.csv', encoding='euc-kr', engine='python')
test_data = pd.read_csv('test_kor.csv', encoding='euc-kr', engine='python')

train_data = ori_data.dropna()
target_col_list = [i.strip() for i in 
                   '사상자수, 사망자수, 중상자수, 경상자수, 부상신고자수, 주야, 요일, 발생지시도, 발생지시군구, \
                   사고유형_대분류, 사고유형_중분류, 법규위반, 도로형태_대분류, 도로형태, \
                   당사자종별_1당_대분류, 당사자종별_2당_대분류'.split(',')]

In [36]:
# local function

from math import exp

def official_numerical_score(pred, real, B=1, s=1):
    return B * sum([exp(-((n-m)/s)**2) for n, m in zip(pred, real)])

def official_categorical_score(pred, real, C=1):
    return C * sum([1 if ci == di else 0 for ci, di in zip(pred, real)])

def refine_val(df, target_col, dependent_col_list):
    for t_col in target_col_list:
        df = df[[isinstance(x, float) and np.isnan(x) for x in df[t_col]]]
    for d_col in dependent_col_list:
        df = df[[not (isinstance(x, float) and np.isnan(x)) for x in df[d_col]]]
    return df

In [37]:
# templete function
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import ParameterGrid

def preprocessing(data):
    onehot_col = []

    for col in data.columns:
        if isinstance(data[col].values[0], str):
            onehot_col.append(col)

    data = pd.get_dummies(data, prefix=onehot_col)
    return data

class CustomHyperParameterTuner:
    def __init__(self, model, params, eval_fun, greater_is_better=True, cv=5):
        self.model = model
        self.params = params
        self.eval_fun = eval_fun
        self.greater_is_better = greater_is_better
        self.cv = cv
        self._best_model = None
        self.best_params_ = None
        self.best_score_ = None

    def fit(self, X, y, val_X=None, val_y=None):
        if val_X is None or val_y is None:
            self.model = GridSearchCV(self.model, self.params, cv=self.cv,
                                      scoring=make_scorer(self.eval_fun,
                                                          greater_is_better=self.greater_is_better,
                                                          needs_proba=False))
            self.model.fit(X, y)
            
        else:
            for param in ParameterGrid(self.params):
                self.model.set_params(**param)
                self.model.fit(X, y)
                val_pred = self.model.predict(val_X)
                score = self.eval_fun(val_pred, val_y)
                print(param, score)
                if self.best_params_ is None or ((self.best_score_ <= score) == self.greater_is_better):
                    self._best_model = self.model
                    self.best_params_ = param
                    self.best_score_ = score
            
            self.model = self._best_model
    
    def predict(self, X):
        return self.model.predict(X)
    
    def evaluation(self, test_y, pred):
        return self.model.evaluation(test_y, pred)

class CustomRegModel:
    def __init__(self):
        self.model = RandomForestRegressor(n_estimators=20, max_depth=5, random_state=RANDOM_SEED, n_jobs=n_core)
    def train(self, X, y, params=None, val_X=None, val_y=None, param_types={}, category_params_dic={}):
        if params is not None:
            self.model = CustomHyperParameterTuner(self.model, params, 
                                                   official_numerical_score, 
                                                   greater_is_better=True, cv=5,
                                                   param_types=param_types, category_params_dic=category_params_dic)
        self.model.fit(X, y, val_X, val_y)
    def predict(self, X):
        self.predReal = self.model.predict(X)
        self.pred = [round(x) for x in self.predReal]
        return self.pred
    def evaluation(self, test_y, pred):
        self.ev = {}
        self.ev['MSE'] = mean_squared_error(test_y, pred)
        self.ev['Official Numerical Score'] = official_numerical_score(test_y, pred)
        return self.ev
    def set_params(self, param):
        self.model.set_params(param)

class CustomClfModel:
    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=20, max_depth=5, random_state=RANDOM_SEED, n_jobs=n_core)
    def train(self, X, y, params=None, val_X=None, val_y=None, param_types={}, category_params_dic={}):
        if params is not None:
            self.model = CustomHyperParameterTuner(self.model, params, 
                                                   official_categorical_score, 
                                                   greater_is_better=True, cv=5,
                                                   param_types=param_types, category_params_dic=category_params_dic)
        self.model.fit(X, y, val_X, val_y)
    def predict(self, X):
        self.pred = self.model.predict(X)
        return self.pred
    def evaluation(self, test_y, pred):
        self.ev = {}
        self.ev['ACC'] = accuracy_score(test_y, pred)
        self.ev['Official Categorical Score'] = official_categorical_score(test_y, pred)
        return self.ev
    def set_params(self, param):
        self.model.set_params(param)

In [50]:
from queue import Queue
import random

class CustomHyperParameterTuner:
    def __init__(self, model, init_params, eval_fun, greater_is_better=True, cv=5, param_types={}, category_params_dic={}):
        self.model = model
        self.init_params = init_params
        self.eval_fun = eval_fun
        self.greater_is_better = greater_is_better
        self.cv = cv
        self._best_model = None
        self.best_params_ = None
        self.best_score_ = None
        self.exploration_rank = 1.1
        self.exploitation_rank = 3
        self.exploration_ratio = 0.9
        self.exploration_decay_ratio = 0.99
        self.random_rank = False
        self.max_step = 300
        self.param_types = param_types
        self.category_params_dic = category_params_dic
        
        self.step_history = set()

    def fit(self, X, y, val_X=None, val_y=None):
        if val_X is None or val_y is None:
            self.model = GridSearchCV(self.model, self.params, cv=self.cv,
                                      scoring=make_scorer(self.eval_fun,
                                                          greater_is_better=self.greater_is_better,
                                                          needs_proba=False))
            self.model.fit(X, y)
            
        else:
            param_queue = Queue()
            for param in ParameterGrid(self.init_params):
                param_queue.put(param)
            
            n_step = 0
            while((not param_queue.empty()) and n_step < self.max_step):
                param = param_queue.get()
                n_step += 1

                self.model.set_params(**param)
                try:
                    self.model.fit(X, y)
                except ValueError:
                    continue
                    
                val_pred = self.model.predict(val_X)
                score = self.eval_fun(val_pred, val_y)
                print(param, score)
                if self.best_params_ is None or ((self.best_score_ < score) == self.greater_is_better):
                    self._best_model = self.model
                    self.best_params_ = param
                    self.best_score_ = score
                    
                    for k in param:
                        new_param1 = param.copy()
                        new_param2 = param.copy()
                        
                        if self.param_types[k] == 'uint':
                            new_param1[k] = round(param[k] * self.exploration_rank)
                            if new_param1[k] == param[k]:
                                new_param1[k] += 1
                            new_param2[k] = round(param[k] / self.exploration_rank)
                            if new_param2[k] == param[k]:
                                new_param2[k] -= 1
                        elif self.param_types[k] == 'float':
                            new_param1[k] = round(param[k] * self.exploration_rank)
                            new_param2[k] = round(param[k] / self.exploration_rank)
                        elif self.param_types[k] == 'category':
                            tmp_category = self.category_params_dic[k].copy()
                            tmp_category.remove(param[k])
                            new_param1[k] = random.choice(tmp_category)

                        key1 = '/'.join([str(ob) for ob in new_param1.values()])
                        key2 = '/'.join([str(ob) for ob in new_param2.values()])
                            
                        if key1 not in self.step_history:
                            param_queue.put(new_param1)
                            self.step_history.add(key1)
                        if key2 not in self.step_history:
                            param_queue.put(new_param2)
                            self.step_history.add(key2)

                        if random.random() < self.exploration_ratio:
                            new_param1 = param.copy()
                            new_param2 = param.copy()
                            
                            if self.param_types[k] == 'uint':
                                new_param1[k] = round(param[k] * self.exploitation_rank)
                                if new_param1[k] == param[k]:
                                    new_param1[k] += 1
                                new_param2[k] = round(param[k] / self.exploitation_rank)
                                if new_param2[k] == param[k]:
                                    new_param2[k] -= 1
                            elif self.param_types[k] == 'float':
                                new_param1[k] = round(param[k] * self.exploitation_rank)
                                new_param2[k] = round(param[k] / self.exploitation_rank)
                        
                            key1 = '/'.join([str(ob) for ob in new_param1.values()])
                            key2 = '/'.join([str(ob) for ob in new_param2.values()])
                            
                            if key1 not in self.step_history:
                                param_queue.put(new_param1)
                                self.step_history.add(key1)
                            if key2 not in self.step_history:
                                param_queue.put(new_param2)
                                self.step_history.add(key2)
                
                self.exploration_ratio *= self.exploration_decay_ratio    
            
            self.model = self._best_model
    
    def predict(self, X):
        return self.model.predict(X)
    
    def evaluation(self, test_y, pred):
        return self.model.evaluation(test_y, pred)

In [51]:
def rule_based_model(df):
    df['요일'] = df['요일'].fillna('금')
    return df

In [52]:
test_data

Unnamed: 0,주야,요일,사망자수,사상자수,중상자수,경상자수,부상신고자수,발생지시도,발생지시군구,사고유형_대분류,사고유형_중분류,법규위반,도로형태_대분류,도로형태,당사자종별_1당_대분류,당사자종별_2당_대분류
0,야간,금,,,0.0,,0.0,경기,화성시,차대차,측면충돌,중앙선 침범,단일로,기타단일로,승용차,승합차
1,야간,금,,,0.0,,0.0,전남,영암군,차대사람,차도통행중,과속,단일로,기타단일로,승용차,보행자
2,야간,월,1.0,,,0.0,,전남,곡성군,차량단독,전도전복,안전운전 의무 불이행,단일로,기타단일로,자전거,없음
3,야간,일,2.0,,,1.0,,대구,달성군,차대차,측면충돌,중앙선 침범,단일로,기타단일로,승용차,승합차
4,주간,목,1.0,,,,0.0,전남,고흥군,차대차,정면충돌,중앙선 침범,단일로,기타단일로,화물차,화물차
5,주간,목,2.0,,,,0.0,경북,영천시,차대차,추돌,안전운전 의무 불이행,단일로,기타단일로,화물차,화물차
6,야간,수,1.0,,,,0.0,충남,아산시,차대차,추돌,안전거리 미확보,단일로,기타단일로,승합차,화물차
7,주간,월,,,,5.0,0.0,충남,서천군,차대차,추돌,안전운전 의무 불이행,단일로,기타단일로,승용차,특수차
8,주간,일,,,,21.0,4.0,강원,평창군,차대차,측면충돌,기타(운전자법규위반),단일로,기타단일로,건설기계,승합차
9,주간,수,,,,0.0,0.0,충북,음성군,차대사람,횡단중,과속,교차로,교차로내,승용차,보행자


In [53]:
model_meta_dict = {}
col_record = {}

for _, row in test_data.iterrows():
    target_col_list = []
    dependent_col_list = []
    for key, value in row.items():
        if isinstance(value, float) and np.isnan(value):
            target_col_list.append(key)
        else:
            dependent_col_list.append(key)
    
    for t_col in target_col_list:
        col_key = t_col + ':' + '/'.join(dependent_col_list)
        if col_record.get(col_key):
            continue
        model_dict = {}
        model_dict['target_col'] = t_col
        model_dict['dependent_col_list'] = dependent_col_list

        if val_data is not None:
            tmp_test_data = test_data.copy()
            tmp_test_data = refine_val(tmp_test_data, target_col_list, dependent_col_list)
            model_dict['val_data_loc'] = tmp_test_data.index
    
        model_meta_dict[col_key] = model_dict
        col_record[col_key] = 1

In [None]:
%%time

# main


save_dict = {}
loss_all_val1 = 0
if val_data is not None:
    loss_all_val2 = 0

tuned_parameters = {'n_estimators' : [20], 
                    'max_depth' : [5],
                    'max_features' : ['auto'],
                    'min_samples_split' : [2],
                    'min_samples_leaf' : [5],
                    'bootstrap' : [False]}

#tuned_parameters = {'n_estimators' : [10, 30, 100, 300, 1000, 3000]}
#tuned_parameters = None

param_types = {'n_estimators' : 'uint', 
               'max_depth' : 'uint',
               'max_features' : 'category',
               'min_samples_split' : 'uint',
               'min_samples_leaf' : 'uint',
               'bootstrap' : 'category'}

category_params_dic = {'max_features' : ['auto', 'sqrt'],
                       'bootstrap' : [True, False]}

for i, model_dict_key in enumerate(model_meta_dict):
    data = ori_data.copy()
    model_dict = model_meta_dict[model_dict_key]
    target_col = model_dict['target_col']
    dependent_col_list = model_dict['dependent_col_list']
    if val_data is not None:
        tmp_val_data = val_data.iloc[model_dict['val_data_loc']]
    result = {
        'target_col' : target_col,
        'dependent_col' : dependent_col_list
    }
    print(str(i+1) + '/' + str(len(model_meta_dict)))
    print('key : ', model_dict_key)
    print('loc : ', model_dict['val_data_loc'].values)
    print('tar_col : ', target_col)
    print('dep_col : ', dependent_col_list)

    data['val'] = 0

    if tmp_val_data is not None:
        tmp_val_data['val'] = 1
        data = data.append(tmp_val_data)

    X = data[dependent_col_list + ['val']]
    y = data[[target_col, 'val']]

    X = preprocessing(X)

    if tmp_val_data is not None:
        val_X = X[X['val'] == 1]
        val_y = y[y['val'] == 1]
        X = X[X['val'] == 0]
        y = y[y['val'] == 0]

        val_X = val_X.drop(['val'], axis=1)
        val_y = val_y[target_col].values

    X = X.drop(['val'], axis=1)
    y = y[target_col].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM_SEED)
    
    if isinstance(y_test[0], str): # clf
        model = CustomClfModel()
    else:
        model = CustomRegModel()
    
    model.train(X_train, y_train, tuned_parameters, val_X, val_y, param_types, category_params_dic)
    print('best parameter : ', model.model.best_params_)
    pred = model.predict(X_test)
    val_eval = model.evaluation(pred, y_test)
    print('val1 : ', val_eval)
    result['val_1'] = val_eval
    official_score_key = [v for v in val_eval if 'Official' in v][0]
    loss_all_val1 += val_eval[official_score_key]

    if tmp_val_data is not None:
        val_pred = model.predict(val_X)
        val_eval = model.evaluation(val_pred, val_y)
        print('val2 : ', val_eval)
        result['val_2'] = val_eval
        official_score_key = [v for v in val_eval if 'Official' in v][0]
        loss_all_val2 += val_eval[official_score_key]

    print()
    save_dict[i] = result
    
    model_meta_dict[model_dict_key]['model'] = model

save_dict['loss_all_val1'] = loss_all_val1
save_dict['loss_all_val2'] = loss_all_val2

now_string = str(datetime.datetime.now())
now_string = now_string.replace(':', '-')
now_string = str(round(loss_all_val2, 4)).zfill(8) + ' ' + now_string
with open('result/' + now_string + '.json', 'w') as f:
    json.dump(save_dict, f)
with open('result/' + now_string + '.p', 'wb') as f:
    pickle.dump(model_dict, f)

1/48
key :  사망자수:주야/요일/중상자수/부상신고자수/발생지시도/발생지시군구/사고유형_대분류/사고유형_중분류/법규위반/도로형태_대분류/도로형태/당사자종별_1당_대분류/당사자종별_2당_대분류
loc :  [0 1]
tar_col :  사망자수
dep_col :  ['주야', '요일', '중상자수', '부상신고자수', '발생지시도', '발생지시군구', '사고유형_대분류', '사고유형_중분류', '법규위반', '도로형태_대분류', '도로형태', '당사자종별_1당_대분류', '당사자종별_2당_대분류']
{'bootstrap': False, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 20} 1.9949825916323753
{'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 20} 1.997749262203663
{'bootstrap': False, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 20} 1.9949825916323753
{'bootstrap': False, 'max_depth': 6, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 20} 1.9960492544458563
{'bootstrap': False, 'max_depth': 4, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 20} 1

In [None]:
# check save file
with open('result/' + now_string + '.json', 'r') as f:
    json_ = json.load(f)
    
with open('result/' + now_string + '.p', 'rb') as f:
    model_ = pickle.load(f)

In [None]:
debug_log = model_meta_dict['사고유형_대분류:주야/요일/사망자수/사상자수/중상자수/경상자수/부상신고자수/발생지시도/발생지시군구/도로형태_대분류/도로형태/당사자종별_1당_대분류/당사자종별_2당_대분류']['val_data_loc']

In [47]:
model.model.model.get_params()

{'bootstrap': False,
 'criterion': 'mse',
 'max_depth': 5,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 20,
 'n_jobs': 4,
 'oob_score': False,
 'random_state': 2018,
 'verbose': 0,
 'warm_start': False}

In [34]:
save_dict

{0: {'dependent_col': ['주야',
   '요일',
   '중상자수',
   '부상신고자수',
   '발생지시도',
   '발생지시군구',
   '사고유형_대분류',
   '사고유형_중분류',
   '법규위반',
   '도로형태_대분류',
   '도로형태',
   '당사자종별_1당_대분류',
   '당사자종별_2당_대분류'],
  'target_col': '사망자수',
  'val_1': {'MSE': 0.070487220447284352,
   'Official Numerical Score': 4845.52826354399},
  'val_2': {'MSE': 0.0, 'Official Numerical Score': 2.0}},
 1: {'dependent_col': ['주야',
   '요일',
   '중상자수',
   '부상신고자수',
   '발생지시도',
   '발생지시군구',
   '사고유형_대분류',
   '사고유형_중분류',
   '법규위반',
   '도로형태_대분류',
   '도로형태',
   '당사자종별_1당_대분류',
   '당사자종별_2당_대분류'],
  'target_col': '사상자수',
  'val_1': {'MSE': 0.93230830670926512,
   'Official Numerical Score': 4277.32209958561},
  'val_2': {'MSE': 0.5, 'Official Numerical Score': 1.3678794411714423}},
 2: {'dependent_col': ['주야',
   '요일',
   '중상자수',
   '부상신고자수',
   '발생지시도',
   '발생지시군구',
   '사고유형_대분류',
   '사고유형_중분류',
   '법규위반',
   '도로형태_대분류',
   '도로형태',
   '당사자종별_1당_대분류',
   '당사자종별_2당_대분류'],
  'target_col': '경상자수',
  'val_1': {'MSE': 0.93130990415335