In [1]:
import import_ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder

In [2]:
# Bucket a numeric age into a named age group.
def get_category_age(age):
    """Return the age-group label for `age`.

    Upper bounds are exclusive: under 20 -> '미성년', under 35 -> '청년',
    under 50 -> '장년', under 65 -> '중년'; every other value -> '노년'.
    """
    brackets = ((20, '미성년'), (35, '청년'), (50, '장년'), (65, '중년'))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '노년'

In [3]:
def get_category_season(date):
    """Return the season name for a date string.

    The month is read from ``date[4:6]`` (presumably a ``YYYYMMDD...`` string;
    confirm against the data). '12', '01', '02' -> 'Winter'; '03'-'05' ->
    'Spring'; '06'-'08' -> 'Summer'; any other value -> 'Autumn'.
    """
    month = date[4:6]
    if month in ('12', '01', '02'):
        return 'Winter'
    if month in ('03', '04', '05'):
        return 'Spring'
    if month in ('06', '07', '08'):
        return 'Summer'
    return 'Autumn'

In [4]:
def get_category_time(time):
    """Return the part of day for a time string.

    Only the first two characters are inspected: '00'-'05' -> 'Dawn',
    '18'-'23' -> 'Night', any other value -> 'Day'.
    """
    dawn_hours = ('00', '01', '02', '03', '04', '05')
    night_hours = ('18', '19', '20', '21', '22', '23')

    hour = time[:2]
    if hour in dawn_hours:
        return 'Dawn'
    if hour in night_hours:
        return 'Night'
    return 'Day'

In [2]:
# Remove the features that are not useful.
def drop_features(df):
    """Drop a fixed set of columns from `df` and return the same frame.

    The drop is done in place, so the caller's frame is modified; the returned
    object is `df` itself.
    """
    unused_columns = [
        'place_gu', 'acc_type_c', 'road_condition_a', 'place_dong',
        'victim_injury', 'offender_injury', 'acc_type_a', 'road_type_a',
    ]
    df.drop(columns=unused_columns, inplace=True)
    return df

In [6]:
# Drop rows holding unknown / none / null values or fold rare values into
# "기타" (other); bucket ages into age groups.
# Accident detail "사망" (death) is treated as "중상" (serious injury).
def _replace_literals(series, olds, new):
    """Replace every literal occurrence of each string in `olds` with `new`.

    The strings are applied in the order given. ``regex=False`` is passed
    explicitly: the default of ``Series.str.replace`` differs between pandas
    versions, and some strings handled here contain parentheses
    (e.g. "개인형이동수단(PM)") that a regex would read as a group instead of
    literal characters, so the value would never match.
    """
    for old in olds:
        series = series.str.replace(old, new, regex=False)
    return series


def _drop_rows_equal(df, column, value):
    """Return an independent copy of `df` without rows where `column` == `value`."""
    # .copy() keeps later column assignments from raising SettingWithCopyWarning.
    return df[df[column] != value].copy()


def cleansing(df):
    """Clean the accident frame and turn date, time and age into categories.

    The input frame is not modified; a new frame is returned with the
    surviving rows (index labels are kept, not reset). Steps:

    * drop rows containing any null value;
    * acc_details: "사망" -> "중상", rows equal to "부상신고" are dropped;
    * occur_date -> season (get_category_season), occur_time -> part of day
      (get_category_time);
    * acc_type_b: "전도", "전복", "전도전복" -> "전도/전복";
      "주/정차차량 충돌", "도로이탈" -> "기타";
    * road_condition_b, road_type_b, offender_vehicle, victim_vehicle: rare or
      unknown values -> "기타", then rows equal to "기타" are dropped;
    * wheather_status: "안개" -> "흐림", rows equal to "기타/불명" are dropped;
    * rows with unknown sex / age ("기타불명", "없음", "불명") are dropped;
    * offender_age / victim_age: the "세" suffix is removed, the value is cast
      to int and bucketed with get_category_age.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above; the replaced columns are
        accessed through ``.str`` and therefore have to hold strings.

    Returns
    -------
    pandas.DataFrame
    """
    df = df.dropna(axis=0).copy()

    df['acc_details'] = _replace_literals(df['acc_details'], ["사망"], "중상")
    df = _drop_rows_equal(df, 'acc_details', '부상신고')

    df['occur_date'] = df['occur_date'].astype(str).apply(get_category_season)
    df['occur_time'] = df['occur_time'].apply(get_category_time)

    # One pass with an alternation (longest spelling first). Chaining
    # replace("전도") and replace("전복") re-replaces the text that was just
    # inserted ("전도" -> "전도/전복" -> "전도/전도/전복"), so the three spellings
    # never ended up as the same category. The leading "전도/전복" alternative
    # leaves already-merged values untouched.
    df['acc_type_b'] = df['acc_type_b'].str.replace(
        r"전도/전복|전도전복|전도|전복", "전도/전복", regex=True)
    df['acc_type_b'] = _replace_literals(
        df['acc_type_b'], ["주/정차차량 충돌", "도로이탈"], "기타")

    df['road_condition_b'] = _replace_literals(
        df['road_condition_b'], ["침수", "해빙"], "기타")
    df = _drop_rows_equal(df, 'road_condition_b', '기타')

    df['wheather_status'] = _replace_literals(df['wheather_status'], ["안개"], "흐림")
    df = _drop_rows_equal(df, 'wheather_status', '기타/불명')

    # "기타/불명" must be handled before the bare "불명".
    df['road_type_b'] = _replace_literals(
        df['road_type_b'], ["기타/불명", "철길건널목", "불명"], "기타")
    df = _drop_rows_equal(df, 'road_type_b', '기타')

    rare_vehicles = ["불명", "개인형이동수단(PM)", "농기계", "사륜오토바이(ATV)"]

    df['offender_vehicle'] = _replace_literals(
        df['offender_vehicle'], rare_vehicles, "기타")
    df = _drop_rows_equal(df, 'offender_vehicle', '기타')

    df = _drop_rows_equal(df, 'offender_sex', '기타불명')
    df = _drop_rows_equal(df, 'offender_age', '불명')

    df['victim_vehicle'] = _replace_literals(
        df['victim_vehicle'], rare_vehicles + ["열차"], "기타")
    df = _drop_rows_equal(df, 'victim_vehicle', '기타')

    df = _drop_rows_equal(df, 'victim_sex', '없음')
    df = _drop_rows_equal(df, 'victim_sex', '기타불명')
    df = _drop_rows_equal(df, 'victim_age', '불명')

    # Categorize the ages (minor, young adult, middle-aged, elderly, ...).
    for age_column in ('offender_age', 'victim_age'):
        years = df[age_column].str.replace("세", "", regex=False).astype(int)
        df[age_column] = years.apply(get_category_age)

    return df

In [7]:
# Label encoding
def encode_features(org_df):
    """Return a copy of `org_df` with every column label-encoded.

    A fresh LabelEncoder is fitted for each column, and the classes it found
    are printed so the integer codes can be mapped back to the labels.
    `org_df` itself is left untouched.
    """
    encoded = org_df.copy()

    for column in encoded.columns:
        encoder = LabelEncoder().fit(encoded[column])
        print(column, '인코딩 클래스:', encoder.classes_)
        encoded[column] = encoder.transform(encoded[column])

    return encoded

In [8]:
# Encode the dataframe, split it into X and Y, and return them as ndarrays.
def conv2XYarr(df):
    """Split `df` into one-hot encoded feature and target arrays.

    The first column of `df` is the target; every other column is a feature.
    `df` is not modified.

    Returns
    -------
    X : numpy.ndarray
        One-hot encoding of the feature columns only.
    Y : numpy.ndarray
        One-hot encoding of the target, one column per class, classes in
        sorted order.
    copy_df : pandas.DataFrame
        One-hot encoding of the whole frame (target included).
    """
    copy_df = df.copy()
    target = copy_df.iloc[:, 0]
    features = copy_df.iloc[:, 1:]

    # get_dummies orders the indicator columns by sorted class value, which is
    # the column order the previous LabelEncoder + get_dummies round trip gave.
    Y = pd.get_dummies(target).values

    # Encode the features on their own instead of slicing "everything after
    # the first two columns" off the fully encoded frame: that slice is only
    # right for a two-class target in an all-categorical frame. With three or
    # more classes it leaks target indicators into X, and numeric columns
    # (which get_dummies puts first) were dropped in place of the target.
    if features.shape[1]:
        X = pd.get_dummies(features, drop_first=False).values
    else:
        # Nothing to encode; keep an (n_rows, 0) array.
        X = features.values

    copy_df = pd.get_dummies(copy_df, drop_first=False)
    return X, Y, copy_df

In [9]:
def transform_dataframe(df):
    """Run the preprocessing pipeline: drop unused columns, then cleanse.

    The pipeline runs on a copy of `df`. drop_features removes columns in
    place, so passing the caller's frame directly would strip its columns and
    make a second call on the same frame fail with KeyError (the columns to
    drop are already gone). With the copy, the call can be repeated safely.

    Returns the frame produced by cleansing.
    """
    return cleansing(drop_features(df.copy()))

In [10]:
def bar_chart(df, feature):
    """Draw a bar chart of `feature` value counts per accident severity.

    One row is built for each `acc_details` value ('경상', '중상'), holding
    the value counts of `feature` among the rows with that severity; the
    resulting two-row table is plotted with pandas' bar plot.
    """
    severities = ['경상', '중상']
    counts_per_severity = [
        df.loc[df['acc_details'] == severity, feature].value_counts()
        for severity in severities
    ]
    table = pd.DataFrame(counts_per_severity)
    table.index = severities
    table.plot(kind='bar')

In [11]:
def pie_chart(df, feature):
    """Draw pie charts for `feature`: overall shares, then severity per category.

    The first figure shows the share of every category of `feature` over all
    rows. The second figure holds one pie per category, splitting that
    category's rows into `acc_details` == '경상' and `acc_details` == '중상'.
    """
    feature_ratio = df[feature].value_counts(sort=False)
    feature_size = feature_ratio.size
    feature_index = feature_ratio.index

    def counts_for(severity):
        # Align to feature_index so a category without any row of this
        # severity counts as 0 instead of raising KeyError when looked up.
        matching = df.loc[df['acc_details'] == severity, feature]
        return matching.value_counts().reindex(feature_index, fill_value=0)

    slight = counts_for('경상')
    serious = counts_for('중상')

    plt.pie(feature_ratio, labels=feature_index, autopct='%1.1f%%')
    plt.title(feature + '\'s ratio in total')
    plt.show()

    for i, index in enumerate(feature_index):
        # The grid keeps feature_size + 1 columns, as before, so the layout of
        # the per-category pies is unchanged.
        plt.subplot(1, feature_size + 1, i + 1, aspect='equal')
        pair = [slight.loc[index], serious.loc[index]]
        # A category whose rows are neither '경상' nor '중상' has nothing to
        # draw; leave its slot empty rather than plotting an all-zero pie.
        if sum(pair) > 0:
            plt.pie(pair, labels=['경상', '중상'], autopct='%1.1f%%')
        plt.title(str(index))

    plt.show()