# Library

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from statsmodels.formula.api import glm
from statsmodels.genmod.families.family import Binomial
from sklearn.ensemble import RandomForestClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Data

In [60]:
# Load the project splits from the working directory.
# `val` and `test` must be loaded here: the N/Y -> 0/1 encoding cell below
# maps columns on both frames, so leaving these reads commented out makes the
# notebook fail with NameError on a fresh kernel (Restart & Run All).
train = pd.read_csv('PJT002_train.csv')
val = pd.read_csv('PJT002_validation.csv')
test = pd.read_csv('PJT002_test.csv')
# Not referenced by the cells shown in this section; left disabled.
# sub = pd.read_csv('PJT002_submission.csv')

# train 데이터의 모든 열(컬럼)의 null 값 개수

In [5]:
 # Null count per column, sorted from most to fewest missing values and
 # shown as an array of [column name, count] pairs.
 (
     train.isna()
     .sum()
     .reset_index()
     .sort_values(by=0, ascending=False)
     .values
 )

array([['lw_13121011', 58926],
       ['lw_13121010', 58926],
       ['lw_13111110', 58918],
       ['lw_13101410', 58916],
       ['lw_13101110', 58908],
       ['lw_13141011', 58908],
       ['lw_13101310', 58907],
       ['lw_13141010', 58907],
       ['lw_13131010', 58903],
       ['lw_13131110', 58903],
       ['lw_13101210', 58902],
       ['lw_13101211', 58901],
       ['lw_13111010', 58899],
       ['lw_13101010', 58897],
       ['gas_engry_us_201802', 53825],
       ['gas_engry_us_201603', 53825],
       ['gas_engry_us_201712', 53825],
       ['gas_engry_us_201602', 53825],
       ['gas_engry_us_201601', 53825],
       ['gas_engry_us_201801', 53825],
       ['gas_engry_us_201512', 53825],
       ['gas_engry_us_201511', 53825],
       ['gas_engry_us_201609', 53825],
       ['gas_engry_us_201510', 53825],
       ['gas_engry_us_201509', 53825],
       ['gas_engry_us_201803', 53825],
       ['gas_engry_us_201508', 53825],
       ['gas_engry_us_201507', 53825],
       ['gas_engry_u

- ['bldng_us_clssfctn', 29343]건물용도분류명(주거용)   
- ['bldng_us', 27677]건물용도(단독주택)  
- ['bldng_archtctr', 27665]건물구조(벽돌구조)  
- ['dt_of_athrztn', 27581]건물승인일자(19850417)  
- ['bldng_ar_prc', 21895]단위 면적당 건물 가격_2019년 기준(312411)  
- ['ttl_dwn_flr', 11005]건물들의 지하 층수의 합(0)  
- ['ttl_grnd_flr', 10210]건물들의 지상 층수의 합(4)  
- ['lnd_us_sttn_nm', 1776]토지 이용상황명(전)
- ['rd_sd_nm', 1491]도로측면명(맹지)  
- ['rgnl_ar_nm2', 1491]용도지역지구명2(지정되지 않음)  
- ['rgnl_ar_nm', 1491]용도지역지구명(개발제한구역)  
- ['hm_cnt', 701]행정구역 인구(27312)  
- ['wnd_drctn', 221]풍향(270)
- ['fr_mn_cnt', 41]관할 소방서 인원(224)  
- ['wnd_spd', 33]풍속(2.2)
- ['hmdt', 22]습도(77)
- ['tmprtr', 13]온도_c(4.4)  
- ['emd_nm', 4]행정구역명(경상남도 진주시 판문동)  
__________________________________________________
- ['bldng_cnt_in_50m', 0]반경 50M 이내의 건물 수(23)  
- ['no_tbc_zn_dstnc', 0]금연구역과의 최소 거리(235)  
- ['fr_wthr_fclt_in_100m', 0]반경 100M 이내 소방용수 시설 수(2)  
- ['ahsm_dstnc', 0]자동 심장 충격기와의 최소 거리(234)  
- ['sft_emrgnc_bll_dstnc', 0]안전 비상벨과의 최소 거리(233)  
- ['tbc_rtl_str_dstnc', 0]담배 소매점과의 최소 거리(232)  
- ['cctv_in_100m', 0]반경 100M 이내 공공 CCTV(1)  
- ['mlt_us_yn', 0]다중이용시설 포함여부(Y)  
- ['cctv_dstnc', 0]공공 CCTV와의 최소 거리(232)  
- ['fr_wthr_fclt_dstnc', 0]소방용수시설(소화전 등)과의 최소 거리(223)  
- ['fr_sttn_dstnc', 0]119안전센터와의 거리(3222)  
- ['jmk', 0]지적상 지목(답)  
- ['dt_of_fr', 0]화재발생일시(2014-03-18 15:23:18)  
- ['lnd_ar', 0]토지면적(390)  
- ['ttl_ar', 0]건물연면적_건물층별합계전체면적(130.845)  
- ['bldng_ar', 0]건물건축면적(130.845)  
- ['bldng_cnt', 0]건물채수(4)  

# 주소 없는 4개 행 삭제

In [61]:
# Keep only the rows that have an address (`emd_nm`); every later address
# split relies on this column being present.
train = train[train['emd_nm'].notna()]

# 주소 정리

In [62]:
# Split each address (`emd_nm`) into whitespace-separated tokens once and
# reuse them for all derived columns. Indexing a token that does not exist
# still raises IndexError, as before.
address_tokens = train['emd_nm'].map(lambda name: str(name).split())
# Second token, untruncated: needed to detect names longer than three characters.
district_token = address_tokens.map(lambda parts: parts[1])

train['시도'] = address_tokens.map(lambda parts: parts[0])
train['시군구'] = district_token.str[:3]
train['구시동면읍'] = address_tokens.map(lambda parts: parts[2])

# Changwon (창원시) is handled separately: when the second token is longer
# than three characters, the part after the first three characters is used
# as the sub-district (presumably the city and district names are written
# without a space in the data; confirm against the raw values).
# The length test has to use the untruncated token. Testing the `시군구`
# column, which is already cut to three characters, can never be true, so
# the override would silently never apply.
has_fused_district = district_token.str.len() > 3
train.loc[has_fused_district, '구시동면읍'] = district_token.str[3:]

# Full grouping key: province + city/county + sub-district.
train['주소'] = train['시도'] + ' ' + train['시군구'] + ' ' + train['구시동면읍']

# N, Y -> 0, 1

In [63]:
# Lookup used to encode the N/Y flag columns as 0/1.
# Series.map turns any value that is not a key of this dict into NaN, so
# running this cell a second time on already-encoded columns blanks them.
binary_y = {'N': 0, 'Y': 1}

# (frame, columns to encode) pairs; only `mlt_us_yn` is encoded on `test`.
yn_targets = (
    (train, ('fr_yn', 'mlt_us_yn')),
    (val, ('fr_yn', 'mlt_us_yn')),
    (test, ('mlt_us_yn',)),
)
for frame, yn_columns in yn_targets:
    for yn_column in yn_columns:
        frame[yn_column] = frame[yn_column].map(binary_y)

# 습도(hmdt) 채우기 위해 corr 확인

In [64]:
# Pairwise correlation matrix over the numeric/bool columns only.
# `train` also holds string columns (addresses, timestamps), and pandas >= 2.0
# raises on those because DataFrame.corr no longer drops them by default.
# Selecting the dtypes explicitly gives the same matrix on old and new pandas.
corr = train.select_dtypes(include=['number', 'bool']).corr()

In [65]:
# Columns whose correlation with humidity (`hmdt`) is below -0.2.
hmdt_corr = corr['hmdt']
hmdt_corr[hmdt_corr < -0.2]

wnd_spd     -0.402695
wnd_drctn   -0.253174
Name: hmdt, dtype: float64

# 년-월 열 추가

In [66]:
# 'YYYY-MM' prefix of the fire timestamp string (first 7 characters).
# The vectorised .str slice replaces a per-row Python lambda and leaves a
# missing timestamp as NaN instead of raising.
train['year-month'] = train['dt_of_fr'].str[:7]

# 년-월-일 열 추가

In [67]:
# 'YYYY-MM-DD' prefix of the fire timestamp string (first 10 characters).
# The vectorised .str slice replaces a per-row Python lambda and leaves a
# missing timestamp as NaN instead of raising.
train['year-month-day'] = train['dt_of_fr'].str[:10]

# 풍속 null 값 채우기(1차, 2차)

In [68]:
# Pass 1: fill missing wind speed with the median of rows sharing the same
# address and calendar day. The built-in 'median' aggregation replaces a
# per-group Python lambda. Filling from a row-aligned median series only
# touches NaN cells, so rows whose group has no observed value (or whose key
# is missing) keep their current value for the next pass.
wnd_spd_day_median = train.groupby(['주소', 'year-month-day'])['wnd_spd'].transform('median')
train['wnd_spd'] = train['wnd_spd'].fillna(wnd_spd_day_median)

In [70]:
# Pass 2: fill what is still missing with the median of rows sharing the same
# address and calendar month. Built-in 'median' replaces a per-group lambda;
# only NaN cells are written, so all other rows keep their current value.
wnd_spd_month_median = train.groupby(['주소', 'year-month'])['wnd_spd'].transform('median')
train['wnd_spd'] = train['wnd_spd'].fillna(wnd_spd_month_median)

In [71]:
# Number of wind-speed values still missing after the fill passes.
train['wnd_spd'].isna().sum()

0

# 풍향 null 값 채우기(1차, 2차, 3차)

In [72]:
# Pass 1: fill missing wind direction with the median of rows sharing the same
# address and calendar day. Built-in 'median' replaces a per-group lambda;
# only NaN cells are written, so all other rows keep their current value.
wnd_drctn_day_median = train.groupby(['주소', 'year-month-day'])['wnd_drctn'].transform('median')
train['wnd_drctn'] = train['wnd_drctn'].fillna(wnd_drctn_day_median)

In [76]:
# Pass 2: fill what is still missing with the median of rows sharing the same
# address and calendar month. Built-in 'median' replaces a per-group lambda;
# only NaN cells are written, so all other rows keep their current value.
wnd_drctn_month_median = train.groupby(['주소', 'year-month'])['wnd_drctn'].transform('median')
train['wnd_drctn'] = train['wnd_drctn'].fillna(wnd_drctn_month_median)

In [78]:
# Pass 3: fill what is still missing with the median over all rows of the same
# calendar day, regardless of address. Built-in 'median' replaces a per-group
# lambda; only NaN cells are written, so all other rows keep their current value.
wnd_drctn_daily_median = train.groupby(['year-month-day'])['wnd_drctn'].transform('median')
train['wnd_drctn'] = train['wnd_drctn'].fillna(wnd_drctn_daily_median)

In [79]:
# Number of wind-direction values still missing after the fill passes.
train['wnd_drctn'].isna().sum()

0

# 습도 null 값 채우기(1차, 2차, 3차)

In [80]:
# Pass 1: fill missing humidity with the median of rows that have the same
# wind speed and wind direction (the two columns found above to correlate
# negatively with hmdt). Built-in 'median' replaces a per-group lambda; only
# NaN cells are written, so all other rows keep their current value.
hmdt_wind_median = train.groupby(['wnd_spd', 'wnd_drctn'])['hmdt'].transform('median')
train['hmdt'] = train['hmdt'].fillna(hmdt_wind_median)

In [84]:
# Pass 2: fill what is still missing with the median of rows that have the
# same wind direction. Built-in 'median' replaces a per-group lambda; only
# NaN cells are written, so all other rows keep their current value.
hmdt_drctn_median = train.groupby(['wnd_drctn'])['hmdt'].transform('median')
train['hmdt'] = train['hmdt'].fillna(hmdt_drctn_median)

In [93]:
# Pass 3: fill what is still missing with the median over all rows of the
# same calendar day. Built-in 'median' replaces a per-group lambda; only
# NaN cells are written, so all other rows keep their current value.
hmdt_daily_median = train.groupby(['year-month-day'])['hmdt'].transform('median')
train['hmdt'] = train['hmdt'].fillna(hmdt_daily_median)

In [94]:
# Number of humidity values still missing after the fill passes.
train['hmdt'].isna().sum()

0

# 온도 null 값 채우기

In [105]:
# Columns whose correlation with temperature (`tmprtr`) exceeds 0.1.
tmprtr_corr = corr['tmprtr']
tmprtr_corr[tmprtr_corr > 0.1]

tmprtr         1.000000
wnd_spd        0.106351
hmdt           0.125646
lw_13101110    0.145238
lw_13101210    0.118775
lw_13101211    0.136644
lw_13111010    0.116014
Name: tmprtr, dtype: float64

In [102]:
# Pass 1: fill missing temperature with the median of rows that have the same
# wind speed and humidity. Built-in 'median' replaces a per-group lambda; only
# NaN cells are written, so all other rows keep their current value.
tmprtr_wind_hmdt_median = train.groupby(['wnd_spd', 'hmdt'])['tmprtr'].transform('median')
train['tmprtr'] = train['tmprtr'].fillna(tmprtr_wind_hmdt_median)

In [106]:
# Pass 2: fill what is still missing with the median of rows that have the
# same humidity on the same calendar day. Built-in 'median' replaces a
# per-group lambda; only NaN cells are written, so all other rows keep their
# current value.
tmprtr_hmdt_day_median = train.groupby(['hmdt', 'year-month-day'])['tmprtr'].transform('median')
train['tmprtr'] = train['tmprtr'].fillna(tmprtr_hmdt_day_median)

In [108]:
# Pass 3: fill what is still missing with the median of rows that have the
# same wind speed on the same calendar day. Built-in 'median' replaces a
# per-group lambda; only NaN cells are written, so all other rows keep their
# current value.
tmprtr_wind_day_median = train.groupby(['wnd_spd', 'year-month-day'])['tmprtr'].transform('median')
train['tmprtr'] = train['tmprtr'].fillna(tmprtr_wind_day_median)

In [110]:
# Pass 4: fill what is still missing with the median of rows that have the
# same humidity in the same calendar month. Built-in 'median' replaces a
# per-group lambda; only NaN cells are written, so all other rows keep their
# current value.
tmprtr_hmdt_month_median = train.groupby(['hmdt', 'year-month'])['tmprtr'].transform('median')
train['tmprtr'] = train['tmprtr'].fillna(tmprtr_hmdt_month_median)

In [111]:
# Number of temperature values still missing after the fill passes.
train['tmprtr'].isna().sum()

0