In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

pd_rawdata = pd.read_csv('전국노인장애인보호구역표준데이터.csv', encoding = 'cp949')
column_selection = ['시도명', '제한속도', 'CCTV설치여부', 'CCTV설치대수', '보호구역도로폭']
pd_data = pd_rawdata[column_selection]

In [96]:
print(pd_data.shape)
print(pd_data.dtypes)

for e in pd_data:
    print(e, pd_data[e].hasnans)

(2518, 5)
시도명          object
제한속도          int64
CCTV설치여부     object
CCTV설치대수    float64
보호구역도로폭      object
dtype: object
시도명 False
제한속도 False
CCTV설치여부 False
CCTV설치대수 True
보호구역도로폭 True


In [97]:
# 시도명

print(pd_data['시도명'].unique())
print(pd_data['시도명'].hasnans)

# 결측치, 오류치, 이상치: 없음

['서울특별시' '경상북도' '충청남도' '강원도' '경기도' '전라북도' '대전광역시' '경상남도' '인천광역시' '울산광역시'
 '부산광역시' '대구광역시' '충청북도' '전라남도' '광주광역시' '세종특별자치시' '제주특별자치도']
False


In [98]:
# 제한속도

print(pd_data['제한속도'].unique())
print(pd_data['제한속도'].hasnans)

# 결측치, 오류치, 이상치: 없음

[30 50 60 40 38 39 41 42 43 44 45 46 47 48 49 51 52 53 54 55 31 32 33 34
 35 36 37 20]
False


In [99]:
# CCTV 설치여부

print(pd_data['CCTV설치여부'].unique())
print(pd_data['CCTV설치여부'].hasnans)

# 결측치, 오류치, 이상치: 없음

['Y' 'N']
False


In [100]:
# CCTV설치대수

print(pd_data['CCTV설치대수'].unique())
print(pd_data['CCTV설치대수'].hasnans)

# 결측치: np.NaN --> 0
na_filter = pd_data['CCTV설치대수'].isna()
print(na_filter.value_counts())
pd_data.loc[na_filter, 'CCTV설치대수'] = 0

# 오류치, 이상치: 없음

print(pd_data['CCTV설치대수'].unique())
print(pd_data['CCTV설치대수'].hasnans)

[ 1. nan  0.  2.  4.  3.  5.  8.  9.]
True
True     1551
False     967
Name: CCTV설치대수, dtype: int64
[1. 0. 2. 4. 3. 5. 8. 9.]
False


In [101]:
# 보호구역도로폭

print(pd_data['보호구역도로폭'].unique())
print(pd_data['보호구역도로폭'].hasnans)

# 결측치: np.NaN -> 0
na_filter = pd_data['보호구역도로폭'].isna()
print(na_filter.value_counts())
pd_data.loc[na_filter, '보호구역도로폭'] = '0'

print(pd_data['보호구역도로폭'].unique())
print(pd_data['보호구역도로폭'].hasnans)

def myfn1(x):
    if type(x) == type(' '):
        if '~' in x:
            m = np.array(x.split('~')).astype(np.float64).mean()
            return str(m)

    
# 오류치: '숫자a ~ 숫자b' 형태의 문자열 --> '숫자a'와 '숫자b'의 평균치(np.float64) 값으로 대체
print(pd_data['보호구역도로폭'].value_counts())
y = pd_data['보호구역도로폭'].apply(myfn1)
print(y.dtype)
print(y.value_counts())

y = y.astype(np.float64)
y = y.replace(np.NaN, y.mean())

pd_data['보호구역도로폭'] = y
na_filter = pd_data['보호구역도로폭'].isna()
print(na_filter.value_counts())

# 이상치: upper 보다 큰 값을 np.NaN으로 교체, 후에 dropna() 이용 샘플 삭제
q1, q3 = pd_data['보호구역도로폭'].quantile([0.25, 0.75])
iqr = q3 - q1
upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr
filter1 = pd_data['보호구역도로폭'] > upper
print(filter1.value_counts())
pd_data.loc[filter1, '보호구역도로폭'] = np.NaN
print(lower, upper)

print(pd_data['보호구역도로폭'].unique())
print(pd_data['보호구역도로폭'].hasnans)

['3' '6' '7' '8' '9' nan '7.5' '6.5' '15' '11' '4' '20' '12' '13' '28'
 '20~34' '10' '15~20' '24' '25' '5' '14' '6~12' '16' '17~22' '4.5~5' '4.5'
 '20~23' '6~8' '33' '30' '12~13' '10~12' '8~9' '9~10' '4~30' '7~12' '35'
 '9.6~14.6' '14~25' '12~20' '21' '5~20' '1' '6~14' '5~7' '5~6' '7~11'
 '8~20' '4~8' '4~7' '18' '16~20' '5.5' '23' '6~10' '10~30' '6~7' '8~10'
 '36' '6~25' '22' '5.2' '10~15' '8.5' '6.5~12' '5~10' '0' '29' '5~8' '5~9'
 '33~35' '8 ~25' '7~9' '13~14' '5~11' '5~30' '5~15' '10~14' '26' '7~8'
 '4.5~6.5' '18~36' '5~16' '17' '10~35' '19' '8~12' '6~9' '7.5~12' '4~8.5'
 '5.0~11' '7.5~8' '8.5~25' '8.5~11' '6~13' '7~10' '40' '8~14' '20~30'
 '3~5' '8~35' '12~16' '13~15' '7~15' '5.7' '8.4' '3~4' '6.3~14.8' '2']
True
False    1484
True     1034
Name: 보호구역도로폭, dtype: int64
['3' '6' '7' '8' '9' '0' '7.5' '6.5' '15' '11' '4' '20' '12' '13' '28'
 '20~34' '10' '15~20' '24' '25' '5' '14' '6~12' '16' '17~22' '4.5~5' '4.5'
 '20~23' '6~8' '33' '30' '12~13' '10~12' '8~9' '9~10' '4~30' '7~12' '35

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_data['보호구역도로폭'] = y


In [104]:
pd_data_f = pd_data.dropna()
print(pd_data.shape)
print(pd_data_f.shape)
pd_data_f.to_pickle('old.pkl')


(2518, 5)
(2453, 5)
