In [54]:
import numpy as np
import pandas as pd

url = 'https://www.data.go.kr/download/15034532/standard.do?dataType=csv'
raw_df = pd.read_csv(url, encoding='cp949')
column_select = ['시도명', '제한속도', 'CCTV설치여부', 'CCTV설치대수', '보호구역도로폭']
pd_data = raw_df[column_select]

In [55]:
print(pd_data.shape)
print(pd_data.dtypes)

(2518, 5)
시도명          object
제한속도          int64
CCTV설치여부     object
CCTV설치대수    float64
보호구역도로폭      object
dtype: object


In [56]:
print(pd_data['시도명'].hasnans)
print(pd_data['시도명'].unique())
# 결측치 / 오류치 / 이상치 없음


False
['서울특별시' '경상북도' '충청남도' '강원도' '경기도' '전라북도' '대전광역시' '경상남도' '인천광역시' '울산광역시'
 '부산광역시' '대구광역시' '충청북도' '전라남도' '광주광역시' '세종특별자치시' '제주특별자치도']


In [57]:
print(pd_data['제한속도'].hasnans)
print(pd_data['제한속도'].unique())
# 결측치 /오류치 / 이상치 없음


False
[30 50 60 40 38 39 41 42 43 44 45 46 47 48 49 51 52 53 54 55 31 32 33 34
 35 36 37 20]


In [58]:
print(pd_data['CCTV설치여부'].hasnans)
print(pd_data['CCTV설치여부'].unique())
# 결측치 /오류치 / 이상치 없음

False
['Y' 'N']


In [59]:
print(pd_data['CCTV설치대수'].hasnans)
print(pd_data['CCTV설치대수'].unique())

filter1 = (pd_data['CCTV설치여부'] == 'Y') & (pd_data['CCTV설치대수'].isna())
filter2 = (pd_data['CCTV설치여부'] == 'N') & (pd_data['CCTV설치대수'].isna())

pd_data.loc[filter1, 'CCTV설치대수'] = 1.0
pd_data.loc[filter2, 'CCTV설치대수'] = 0.0
# 결측치 : 설치대수가 nan 값이고 설치여부가 y , n으로 해서 1, 0으로 수정
#/오류치 / 이상치 없음

print(pd_data['CCTV설치대수'].hasnans)
print(pd_data['CCTV설치대수'].unique())

True
[ 1. nan  0.  2.  4.  3.  5.  8.  9.]
False
[1. 0. 2. 4. 3. 5. 8. 9.]


In [60]:
print(pd_data['보호구역도로폭'].hasnans)
print(pd_data['보호구역도로폭'].unique())
print(pd_data['보호구역도로폭'].isna().value_counts())

def myfn1(x):
    if type(x) == type(''):
        if '~' in x:
            m = np.array(x.split('~')).astype(np.float64).mean()
            return str(m)
        
y = pd_data.loc[:, '보호구역도로폭'].apply(myfn1).astype(np.float64)
pd_data.loc[:, '보호구역도로폭'] = y
pd_data.loc[pd_data['보호구역도로폭'].isna(), '보호구역도로폭'] = y.mean()


print(pd_data['보호구역도로폭'].hasnans)
print(pd_data['보호구역도로폭'].unique())
print(pd_data['보호구역도로폭'].isna().value_counts())
print(y.mean())

True
['3' '6' '7' '8' '9' nan '7.5' '6.5' '15' '11' '4' '20' '12' '13' '28'
 '20~34' '10' '15~20' '24' '25' '5' '14' '6~12' '16' '17~22' '4.5~5' '4.5'
 '20~23' '6~8' '33' '30' '12~13' '10~12' '8~9' '9~10' '4~30' '7~12' '35'
 '9.6~14.6' '14~25' '12~20' '21' '5~20' '1' '6~14' '5~7' '5~6' '7~11'
 '8~20' '4~8' '4~7' '18' '16~20' '5.5' '23' '6~10' '10~30' '6~7' '8~10'
 '36' '6~25' '22' '5.2' '10~15' '8.5' '6.5~12' '5~10' '0' '29' '5~8' '5~9'
 '33~35' '8 ~25' '7~9' '13~14' '5~11' '5~30' '5~15' '10~14' '26' '7~8'
 '4.5~6.5' '18~36' '5~16' '17' '10~35' '19' '8~12' '6~9' '7.5~12' '4~8.5'
 '5.0~11' '7.5~8' '8.5~25' '8.5~11' '6~13' '7~10' '40' '8~14' '20~30'
 '3~5' '8~35' '12~16' '13~15' '7~15' '5.7' '8.4' '3~4' '6.3~14.8' '2']
False    1484
True     1034
Name: 보호구역도로폭, dtype: int64
False
[10.00570866 27.         17.5         9.         19.5         4.75
 21.5         7.         12.5        11.          8.5         9.5
 17.         12.1        16.         10.          6.          5.5
 14.        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_data.loc[:, '보호구역도로폭'] = y
  pd_data.loc[:, '보호구역도로폭'] = y


In [61]:
q1, q3 = pd_data['보호구역도로폭'].quantile([0.25, 0.75])
iqr = q3 - q1
upper = q3 + 1.5*iqr
lower = q1 - 1.5*iqr
print(lower, upper)
# filter1 = pd_data['보호구역도로폭'] > upper
# print(filter1.value_counts())
# select_df.loc[filter1, '보호구역도로폭'] = np.nan

# print(select_df['보호구역도로폭'].unique())
# print(select_df['보호구역도로폭'].hasnans)

10.005708661417323 10.005708661417323


In [62]:
pd_data.to_csv('전국노인장애인도로표준데이터', encoding='cp949')