# 데이터 전처리 이해와 실무

### 데이터 전처리 과정 내 주의 사항
* 데이터 전처리 과정 진행 시에는 원본 데이터 Copy 필수
* Python에서 `=` 대입은 객체를 복사하지 않고 같은 객체를 참조하므로, `copy()`를 사용하지 않으면 전처리 결과가 원본 데이터에도 그대로 반영됨
* 만일 전처리 내역이 변경되는 경우, 데이터 로딩부터 모든 과정을 다시 시작해야함
* 따라서 원본 데이터와 전처리 과정을 진행할 데이터를 구분하여 작업 수행

In [1]:
# 필요 라이브러리 로딩
import numpy as np
import pandas as pd

In [3]:
# Load the raw data file and display it for a quick overview.
# header=None tells pandas the file has no header row.
cancer = pd.read_csv('../data/wdbc.data', header=None)
cancer

# The data file carries no column names; see "7. Attribute information" on the dataset page:
# https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic/

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [None]:
# Print the dataset description file, which documents the column names.
# The context manager closes the file handle even if reading fails.
with open('../data/wdbc.names') as cancer_name:
    name_list = cancer_name.readlines()

# Each entry of name_list keeps its trailing newline, so suppress print's own
# newline to avoid emitting a blank line after every line of the file.
for line in name_list:
    print(line, end='')

In [11]:
# Naming every column is skipped here because there are too many of them.
# cancer.columns = ['id', 'diagnosis', 'radius_mean', 'texture_mean'...]

# Use the ID column (label 0) as the row index.
cancer = cancer.set_index(0) # with named columns this would be cancer.set_index('id')
cancer

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [15]:
# Work on a copy so that the loaded frame `cancer` stays untouched.
# For the missing-value exercise only a small subset is kept:
#   rows    -> the first 30 records
#   columns -> the five columns labelled 1 through 5 (list-based selection)
cancer_data = cancer.copy()[:30][list(range(1, 6))]
cancer_data

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,M,19.69,21.25,130.0,1203.0
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,M,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


In [18]:
# Inject missing values into six records.
# Each entry is (row position, column position(s)) as understood by .iloc.
missing_cells = [
    (2, slice(None)),    # 3rd row: every column
    (5, 0),              # 6th row: 1st column
    (10, [3, 4]),        # 11th row: 4th and 5th columns
    (12, slice(2, 4)),   # 13th row: 3rd and 4th columns
    (15, [0, 3]),        # 16th row: 1st and 4th columns
    (24, 4),             # 25th row: 5th column
]
for row_pos, col_pos in missing_cells:
    cancer_data.iloc[row_pos, col_pos] = np.nan

cancer_data

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


### 결측치 제거하기

#### listwise

In [19]:
# Listwise deletion: a row is removed when any one of its variables is N/A (missing).
# Summarize the frame first to see the non-null count of each column.
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       27 non-null     object 
 1   2       29 non-null     float64
 2   3       28 non-null     float64
 3   4       26 non-null     float64
 4   5       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB


In [21]:
# Perform listwise deletion.
# 6 of the 30 records contain at least one missing value.
cancer_copy = cancer_data.copy()
cancer_copy = cancer_copy.dropna()

# Summary: 6 of the 30 records were dropped.
# info() writes the summary itself and returns None, so it is called directly
# instead of being wrapped in print(), which would append a stray "None" line.
cancer_copy.info()

# Check the dimensions.
cancer_copy.shape # np.shape(cancer_copy) works as well

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       24 non-null     object 
 1   2       24 non-null     float64
 2   3       24 non-null     float64
 3   4       24 non-null     float64
 4   5       24 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.1+ KB
None


(24, 5)

#### pairwise

In [24]:
# Drop a row only when every one of its columns is missing, then summarize.
cancer_copy2 = cancer_data.copy().dropna(how="all")
cancer_copy2.info() # only one row is removed

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       27 non-null     object 
 1   2       29 non-null     float64
 2   3       28 non-null     float64
 3   4       26 non-null     float64
 4   5       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB


In [25]:
# Dimensions (rows, columns) of the frame after dropping all-missing rows.
cancer_copy2.shape

(29, 5)

### 결측치 대체하기

In [29]:
# Start again from a fresh copy of the frame that still contains the missing values
# and display it.
cancer_copy = cancer_data.copy()
cancer_copy

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


#### 일정 값 대체

In [30]:
# Constant-value imputation.
# Missing values in the diagnosis column are all replaced with the category 'C'.
# Column names were never assigned, so the column is addressed by its label 1.
cancer_copy[1] = cancer_copy[1].fillna('C')
cancer_copy.head(10) # the 3rd and 6th rows are filled

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,C,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,C,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


In [31]:
# Missing values in the numeric radius_mean column (label 2) are replaced with the constant 65.
cancer_copy[2] = cancer_copy[2].fillna(65)
cancer_copy.head(10)

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,C,65.0,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,C,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


In [32]:
# Summarize the frame again.
cancer_copy.info()

# Columns 1 and 2 were imputed above, so they no longer contain missing values.
# Check how many missing values remain in the other columns.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       30 non-null     object 
 1   2       30 non-null     float64
 2   3       28 non-null     float64
 3   4       26 non-null     float64
 4   5       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB


In [38]:
# Instead of a hand-picked constant, impute with a statistic of the column
# (mean, median, min, max, mode, ...).
# Here the gaps in texture_mean (label 3) are filled with the mean of that Series.

column_mean = cancer_copy[3].mean()
cancer_copy[3] = cancer_copy[3].replace(np.nan, column_mean)
cancer_copy.head(10)

# fillna() gives the same result:
# cancer_copy[3] = cancer_copy[3].fillna(cancer_copy[3].mean())

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,C,65.0,19.397143,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,C,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


#### 선형 값 대체

In [39]:
# Linear interpolation fills a missing value from the records before and after it.
# Start again from a fresh copy of the frame that still contains the missing values.
cancer_copy = cancer_data.copy()
cancer_copy.head()

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0


In [40]:
# Linear interpolation.
# It is only meaningful for continuous data, not for categorical data
# (Male, Female, ...), so it is applied to the numeric columns only.
# Object columns are left untouched; newer pandas releases deprecate calling
# interpolate() on object-dtype data.
numeric_columns = cancer_copy.select_dtypes(include='number').columns
cancer_copy[numeric_columns] = cancer_copy[numeric_columns].interpolate()
cancer_copy.head()

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,15.995,19.075,105.24,856.05
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0


In [41]:
# Verify the interpolated value: it should equal the midpoint of the values
# at row positions 1 and 3 in column position 1.
value_before = cancer_copy.iloc[1, 1]
value_after = cancer_copy.iloc[3, 1]
print((value_before + value_after) / 2)

15.995000000000001
