In [8]:
# define function

from math import exp

def official_numerical_score(pred, real, B=1, s=1):
    return B * sum([exp(-((n-m)/s)**2) for n, m in zip(pred, real)]) / len(pred)

def official_categorical_score(pred, real, C=1):
    return C * sum([1 if ci == di else 0 for ci, di in zip(pred, real)]) / len(pred)

# data load
import warnings
warnings.filterwarnings('ignore')
import json
import pickle
import datetime

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

data = pd.read_csv('dataset_kor/교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv', encoding='euc-kr', engine='python')
val_data = pd.read_csv('val_refine.csv', encoding='euc-kr', engine='python')
test = pd.read_csv('test_kor.csv', encoding='euc-kr', engine='python')

train_data = data.dropna()
target_col_list = [i.strip() for i in 
                   '사상자수, 사망자수, 중상자수, 경상자수, 부상신고자수, 주야, 요일, 발생지시도, 발생지시군구, \
                   사고유형_대분류, 사고유형_중분류, 법규위반, 도로형태_대분류, 도로형태, \
                   당사자종별_1당_대분류, 당사자종별_2당_대분류'.split(',')]

In [9]:
def preprocessing(data):
    onehot_col = []

    for col in data.columns:
        if isinstance(data[col].values[0], str):
            onehot_col.append(col)

    data = pd.get_dummies(data, prefix=onehot_col)
    return data

class CustomRegModel:
    def __init__(self):
        self.model = RandomForestRegressor(n_estimators=20, max_depth=5, random_state=0)
    def train(self, X, y):
        self.model.fit(X, y)
    def predict(self, X):
        self.predReal = self.model.predict(X)
        self.pred = [round(x) for x in self.predReal]
        return self.pred
    def evaluation(self, test_y, pred):
        self.ev = {}
        self.ev['MSE'] = mean_squared_error(test_y, pred)
        self.ev['Official Numerical Score'] = official_numerical_score(test_y, pred)
        return self.ev

class CustomClfModel:
    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=20, max_depth=5, random_state=0)
    def train(self, X, y):
        self.model.fit(X, y)
    def predict(self, X):
        self.pred = self.model.predict(X)
        return self.pred
    def evaluation(self, test_y, pred):
        self.ev = {}
        self.ev['ACC'] = accuracy_score(test_y, pred)
        self.ev['Official Categorical Score'] = official_categorical_score(test_y, pred)
        return self.ev

In [16]:
# 필요 모델 리스팅
model_dict = {}

for i, row in test.iterrows():
    target_col_list = []
    dependent_col_list = []
    for key, value in row.items():
        if isinstance(value, float) and np.isnan(value):
            target_col_list.append(key)
        else:
            dependent_col_list.append(key)
    
    for t_col in target_col_list:
        model_name = t_col + ':' + '/'.join(dependent_col_list)
        model_dict[model_name] = ''

save_dict = {}
loss_all_val1 = 0
if val_data is not None:
    loss_all_val2 = 0

for i, model_name in enumerate(model_dict):
    target_col, dependent_col_list_string = model_name.split(':')
    dependent_col_list = dependent_col_list_string.split('/')
    result = {
        'target_col' : target_col,
        'dependent_col' : dependent_col_list
    }
    print('tar_col : ', target_col)
    print('dep_col : ', dependent_col_list)

    data['val'] = 0

    if val_data is not None:
        val_data['val'] = 1
        data = data.append(val_data)

    X = data[dependent_col_list + ['val']]
    y = data[[target_col, 'val']]

    X = preprocessing(X)

    if val_data is not None:
        val_X = X[X['val'] == 1]
        val_y = y[y['val'] == 1]
        X = X[X['val'] == 0]
        y = y[y['val'] == 0]

        val_X = val_X.drop(['val'], axis=1)
        val_y = val_y.drop(['val'], axis=1)

    X = X.drop(['val'], axis=1)
    y = y.drop(['val'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    if isinstance(y_test[target_col].values[0], str): # clf
        model = CustomClfModel()
    else:
        model = CustomRegModel()
    model.train(X_train, y_train)
    pred = model.predict(X_test)
    val_eval = model.evaluation(pred, y_test.values)
    print('val1 : ', val_eval)
    result['val_1'] = val_eval
    loss_all_val1 = val_eval

    if val_data is not None:
        val_pred = model.predict(val_X)
        val_eval = model.evaluation(val_pred, val_y.values)
        print('val2 : ', val_eval)
        result['val_2'] = val_eval
        loss_all_val2 = val_eval

    print()
    save_dict[i] = result
    model_dict[model_name] = model

save_dict['loss_all_val1'] = loss_all_val1
save_dict['loss_all_val2'] = loss_all_val2

now_string = str(datetime.datetime.now())
with open('result/' + now_string + '.json', 'w') as f:
    json.dump(save_dict, f)
with open('result/' + now_string + '.p', 'wb') as f:
    pickle.dump(model_dict, f)

tar_col :  사망자수
dep_col :  ['주야', '요일', '중상자수', '부상신고자수', '발생지시도', '발생지시군구', '사고유형_대분류', '사고유형_중분류', '법규위반', '도로형태_대분류', '도로형태', '당사자종별_1당_대분류', '당사자종별_2당_대분류']
val1 :  {'MSE': 0.055366004962779158, 'Official Numerical Score': 0.9809006664253471}
val2 :  {'MSE': 0.02, 'Official Numerical Score': 0.9873575888234288}

tar_col :  사상자수
dep_col :  ['주야', '요일', '중상자수', '부상신고자수', '발생지시도', '발생지시군구', '사고유형_대분류', '사고유형_중분류', '법규위반', '도로형태_대분류', '도로형태', '당사자종별_1당_대분류', '당사자종별_2당_대분류']
val1 :  {'MSE': 2.033756580984825, 'Official Numerical Score': 0.7898950101692348}
val2 :  {'MSE': 0.95999999999999996, 'Official Numerical Score': 0.6416282197301032}

tar_col :  경상자수
dep_col :  ['주야', '요일', '중상자수', '부상신고자수', '발생지시도', '발생지시군구', '사고유형_대분류', '사고유형_중분류', '법규위반', '도로형태_대분류', '도로형태', '당사자종별_1당_대분류', '당사자종별_2당_대분류']
val1 :  {'MSE': 1.0777674706246134, 'Official Numerical Score': 0.8544114016268608}
val2 :  {'MSE': 0.47999999999999998, 'Official Numerical Score': 0.7584540467304108}

tar_col :  사상자수
dep_c

In [5]:
save_dict

{0: {'dependent_col': ['주야',
   '요일',
   '중상자수',
   '부상신고자수',
   '발생지시도',
   '발생지시군구',
   '사고유형_대분류',
   '사고유형_중분류',
   '법규위반',
   '도로형태_대분류',
   '도로형태',
   '당사자종별_1당_대분류',
   '당사자종별_2당_대분류'],
  'target_col': '사망자수',
  'val_1': {'MSE': 0.061953352769679303,
   'Official Numerical Score': 0.9780375489674865},
  'val_2': {'MSE': 0.040000000000000001,
   'Official Numerical Score': 0.9747151776468577}},
 1: {'dependent_col': ['주야',
   '요일',
   '중상자수',
   '부상신고자수',
   '발생지시도',
   '발생지시군구',
   '사고유형_대분류',
   '사고유형_중분류',
   '법규위반',
   '도로형태_대분류',
   '도로형태',
   '당사자종별_1당_대분류',
   '당사자종별_2당_대분류'],
  'target_col': '사상자수',
  'val_1': {'MSE': 2.3761367770098216,
   'Official Numerical Score': 0.8204424092030121},
  'val_2': {'MSE': 1.74, 'Official Numerical Score': 0.6269130443342269}},
 2: {'dependent_col': ['주야',
   '요일',
   '중상자수',
   '부상신고자수',
   '발생지시도',
   '발생지시군구',
   '사고유형_대분류',
   '사고유형_중분류',
   '법규위반',
   '도로형태_대분류',
   '도로형태',
   '당사자종별_1당_대분류',
   '당사자종별_2당_대분류'],
  'target_col': '경상자

In [17]:
with open('result/' + now_string + '.json', 'r') as f:
    json_ = json.load(f)
    
with open('result/' + now_string + '.p', 'rb') as f:
    model_ = pickle.load(f)
