## 데이터 전처리

### 1. 결측치 처리
- 제거
- 채우기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy score table used by the missing-value examples below;
# np.nan marks the cells that count as missing.
d = dict(
    score1=[100, 90, np.nan, 95],
    score2=[30, np.nan, 45, 56],
    score3=[50, 40, 80, 98],
    score4=[np.nan, np.nan, np.nan, 65],
)

In [3]:
# Turn the score dictionary into a DataFrame (one column per key)
# and preview its first rows.
df = pd.DataFrame(data=d)
df.head()

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,65.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   score1  3 non-null      float64
 1   score2  3 non-null      float64
 2   score3  4 non-null      int64  
 3   score4  1 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [5]:
df.isnull().sum()   # count the missing values in each column

score1    1
score2    1
score3    0
score4    3
dtype: int64

In [6]:
# Show only the rows whose 'score1' value is missing.
df.loc[df['score1'].isna()]

Unnamed: 0,score1,score2,score3,score4
2,,45.0,80,


#### 행을 기준으로 삭제

In [7]:
df.dropna()

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


In [8]:
df.dropna(axis=0, how='any')    # how='any': drop every row that has at least one missing value

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


#### 열을 기준으로 삭제

In [9]:
# Drop every column that contains at least one missing value
# (only 'score3' survives in the output).
df.dropna(axis='columns')

Unnamed: 0,score3
0,50
1,40
2,80
3,98


#### 행의 전체값이 결측치인 행을 삭제

In [10]:
# Second toy table: position 1 is missing in every column and 'score2'
# is missing everywhere, which the how='all' / thresh examples rely on.
d2 = dict(
    score1=[100, np.nan, np.nan, 95],
    score2=[np.nan, np.nan, np.nan, np.nan],
    score3=[52, np.nan, 80, 98],
    score4=[np.nan, np.nan, np.nan, 65],
)

In [11]:
df2 = pd.DataFrame(d2)

In [12]:
df2

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
1,,,,
2,,,80.0,
3,95.0,,98.0,65.0


In [13]:
df2.dropna(how='all')          # how='all': drop only the rows in which every column is NaN

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
2,,,80.0,
3,95.0,,98.0,65.0


#### 임계치 설정해서 제거

In [14]:
df2.dropna(thresh=2)           # thresh=2: keep only the rows that have at least 2 non-NaN values

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
3,95.0,,98.0,65.0


적당한 임계치는 분석가가 판단해서 잘 설정해야함

#### 특정한 열 안에서만 삭제

In [15]:
# Only 'score2' and 'score4' are inspected: a row is dropped when either
# of those two columns is missing; NaNs in other columns are ignored.
df.dropna(
    subset=['score2', 'score4'],
)

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


### 결측치 채우기

In [16]:
# Fill every missing cell with one constant value (0 here).
# A new frame is returned; df itself keeps its NaNs.
df.fillna(value=0)

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,0.0
1,90.0,0.0,40,0.0
2,0.0,45.0,80,0.0
3,95.0,56.0,98,65.0


#### 결측치 바로 이전의 값으로 채우기

In [17]:
# Forward fill: each NaN takes the last valid value above it in the same
# column; NaNs at the top of a column have nothing above them and stay NaN.
# DataFrame.ffill() replaces fillna(method='pad'), whose `method` argument
# is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,
1,90.0,30.0,40,
2,90.0,45.0,80,
3,95.0,56.0,98,65.0


- 각 nan값들이 이전의 값들로 채워짐
- 앞의 행이 nan값이면 채워지지 않고 그대로 nan값을 가짐

#### 결측치 바로 뒤의 값으로 채우기

In [18]:
# Backward fill: each NaN takes the next valid value below it in the same
# column.  DataFrame.bfill() replaces fillna(method='bfill'), whose `method`
# argument is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,65.0
1,90.0,45.0,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### 결측치를 각 열의 평균값으로 채우기

In [19]:
# df.mean() yields one mean per column; fillna matches them by column
# label, so each NaN is replaced by the mean of its own column.
df.fillna(value=df.mean())

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,65.0
1,90.0,43.666667,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### 결측치를 각 열의 중앙값, 최솟값, 최댓값으로 채우기

In [20]:
# Median: replace each NaN with the median of its own column.
df.fillna(value=df.median())

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,65.0
1,90.0,45.0,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


In [21]:
# Minimum: replace each NaN with the smallest value of its own column.
df.fillna(value=df.min())

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,65.0
1,90.0,30.0,40,65.0
2,90.0,45.0,80,65.0
3,95.0,56.0,98,65.0


In [22]:
# Maximum: replace each NaN with the largest value of its own column.
df.fillna(value=df.max())

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,65.0
1,90.0,56.0,40,65.0
2,100.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### replace() 함수로 결측치 채우기

In [23]:
# replace() swaps one value for another; here every NaN becomes 0,
# which gives the same result as fillna(0).
df.replace(np.nan, 0)

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,0.0
1,90.0,0.0,40,0.0
2,0.0,45.0,80,0.0
3,95.0,56.0,98,65.0


- 지정한 값으로 결측치들이 모두 채워지게 됨(이 부분은 fillna랑 같음)
- replace 함수는 nan 값 대신에 특정한 문자를 다른 문자로 바꾸고 싶을 때 유용하게 사용됨

#### interpolate() 함수로 결측치 채우기

In [24]:
# Linear interpolation: a NaN lying between two valid values of its column
# is replaced by a point on the straight line joining them (score1 row 2
# becomes 92.5, halfway between 90 and 95).  With limit_direction='forward'
# the NaNs at the top of score4 are left unfilled, as the output shows.
df.interpolate(method='linear', limit_direction='forward')

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,
1,90.0,37.5,40,
2,92.5,45.0,80,
3,95.0,56.0,98,65.0


In [25]:
# Forward fill, filling at most 2 consecutive NaNs per gap.
# ffill(limit=2) replaces interpolate(method='pad', limit=2): passing a
# fill method such as 'pad' to interpolate() is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,50,
1,90.0,30.0,40,
2,90.0,45.0,80,
3,95.0,56.0,98,65.0


### 2. 범주형 데이터를 원핫인코딩으로 변경
- 모든 데이터를 0과 1 로 변환
- 기계는 모든 데이터를 이진법으로 처리하기 때문에 레이블 인코딩보다 원핫인코딩 사용

In [26]:
df = pd.read_csv('mushrooms.csv')
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [27]:
# One-hot encode the categorical (string) columns of df.  dtype is pinned to
# uint8 so the indicator columns hold 0/1 on every pandas version; since
# pandas 2.0 the default dtype is bool, which would display True/False.
one = pd.get_dummies(df, dtype=np.uint8)

In [28]:
one

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
8121,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


- get dummies의 문제점

   train 데이터에만 있고 test 데이터에는 없는 카테고리를 원핫인코딩된 컬럼으로 바꿔주지 못함

- sklearn OneHotEncoder 사용

In [29]:
# Features: every column except the one at position 0 ('class').
x = df.iloc[:, slice(1, None)]
# Target: the 'class' label column.
y = df.loc[:, 'class']

In [30]:
x

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


In [31]:
y

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

In [32]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense ndarray instead of a SciPy sparse matrix.  The keyword was
# renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old
# spelling was removed in 1.4, so try the new name first and fall back to
# the old one on older installations.
try:
    one = OneHotEncoder(sparse_output=False)
except TypeError:
    one = OneHotEncoder(sparse=False)
# Learn the category set of the single 'cap-shape' column; the double
# brackets select a one-column DataFrame, keeping the input 2-D.
one.fit(x[['cap-shape']])

OneHotEncoder(sparse=False)

In [33]:
train_cap = one.transform(x[['cap-shape']])

In [34]:
train_cap

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [35]:
one.categories_

[array(['b', 'c', 'f', 'k', 's', 'x'], dtype=object)]

In [36]:
# Wrap the encoded array in a DataFrame with readable column names such as
# 'cap-shape_b'.  Reusing x's index keeps the rows matched to x when the two
# frames are combined column-wise; a fresh 0..n-1 index would silently
# misalign them whenever x is indexed differently.
o = pd.DataFrame(
    train_cap,
    columns=[f'cap-shape_{col}' for col in one.categories_[0]],
    index=x.index,
)

In [37]:
o

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
8119,0.0,0.0,0.0,1.0,0.0,0.0
8120,0.0,0.0,0.0,0.0,0.0,1.0
8121,0.0,0.0,1.0,0.0,0.0,0.0
8122,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
# Swap the raw 'cap-shape' column for its one-hot columns: drop it from x
# and append the encoded frame side by side (rows matched by index label).
one_x = pd.concat(
    objs=[x.drop('cap-shape', axis='columns'), o],
    axis='columns',
)

In [39]:
one_x

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
0,s,n,t,p,f,c,n,k,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
1,s,y,t,a,f,c,b,k,e,c,...,p,n,n,g,0.0,0.0,0.0,0.0,0.0,1.0
2,s,w,t,l,f,c,b,n,e,c,...,p,n,n,m,1.0,0.0,0.0,0.0,0.0,0.0
3,y,w,t,p,f,c,n,n,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
4,s,g,f,n,f,w,b,k,t,e,...,e,n,a,g,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,s,n,f,n,a,c,b,y,e,?,...,p,b,c,l,0.0,0.0,0.0,1.0,0.0,0.0
8120,s,n,f,n,a,c,b,y,e,?,...,p,b,v,l,0.0,0.0,0.0,0.0,0.0,1.0
8121,s,n,f,n,a,c,b,n,e,?,...,p,b,c,l,0.0,0.0,1.0,0.0,0.0,0.0
8122,y,n,f,y,f,c,n,b,t,?,...,e,w,v,l,0.0,0.0,0.0,1.0,0.0,0.0


In [40]:
from sklearn.model_selection import train_test_split

# Split features and target together; random_state=0 makes the shuffle
# reproducible.  No test_size is passed, and the outputs show the result is
# 6093 training rows and 2031 test rows (75% / 25% of 8124).
(x_train, x_test,
 y_train, y_test) = train_test_split(one_x, y, random_state=0)

In [41]:
x_train

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
5832,y,y,f,f,f,c,b,p,e,b,...,l,h,y,g,0.0,0.0,1.0,0.0,0.0,0.0
601,y,n,t,l,f,c,b,w,e,r,...,p,n,y,g,0.0,0.0,0.0,0.0,0.0,1.0
1601,s,g,f,n,f,w,b,k,t,e,...,e,k,a,g,0.0,0.0,1.0,0.0,0.0,0.0
4941,f,g,f,f,f,c,b,g,e,b,...,l,h,v,p,0.0,0.0,0.0,0.0,0.0,1.0
7492,y,n,f,f,f,c,n,b,t,?,...,e,w,v,d,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4931,y,e,t,n,f,c,b,e,e,?,...,e,w,c,w,0.0,0.0,0.0,0.0,0.0,1.0
3264,f,g,f,f,f,c,b,h,e,b,...,l,h,y,p,0.0,0.0,0.0,0.0,0.0,1.0
1653,s,g,f,n,f,w,b,h,t,e,...,e,n,s,g,0.0,0.0,0.0,0.0,0.0,1.0
2607,f,n,t,n,f,c,b,n,t,b,...,p,n,v,d,0.0,0.0,1.0,0.0,0.0,0.0


In [42]:
y_train

5832    p
601     e
1601    e
4941    p
7492    p
       ..
4931    e
3264    p
1653    e
2607    e
2732    e
Name: class, Length: 6093, dtype: object

In [43]:
x_test

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
380,y,n,t,p,f,c,n,p,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
3641,y,g,t,n,f,c,b,p,t,b,...,p,k,y,d,0.0,0.0,1.0,0.0,0.0,0.0
273,y,y,t,a,f,c,b,k,e,c,...,p,n,n,m,0.0,0.0,0.0,0.0,0.0,1.0
1029,s,w,f,n,f,w,b,k,t,e,...,e,k,a,g,0.0,0.0,0.0,0.0,0.0,1.0
684,f,n,t,n,f,c,b,p,t,b,...,p,n,v,d,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1859,s,w,f,n,f,w,b,h,t,e,...,e,k,s,g,0.0,0.0,0.0,0.0,0.0,1.0
6181,s,n,f,s,f,c,n,b,t,?,...,e,w,v,d,0.0,0.0,1.0,0.0,0.0,0.0
6228,y,e,f,s,f,c,n,b,t,?,...,e,w,v,d,0.0,0.0,0.0,0.0,0.0,1.0
554,s,y,t,l,f,c,b,k,e,c,...,p,k,s,g,0.0,0.0,0.0,0.0,0.0,1.0


In [44]:
y_test

380     p
3641    e
273     e
1029    e
684     e
       ..
1859    e
6181    p
6228    p
554     e
7025    e
Name: class, Length: 2031, dtype: object

### 3. 범주 특성을 레이블 인코딩으로 변환
- 숫자의 크고 작음에 대한 특성이 작용함
- 회귀와 같이 연속된 실수를 다루는 알고리즘에서 레이블 인코딩을 사용하면 잘못된 결과값이 나올 수 있다.

In [45]:
one_x['cap-surface'].unique()

array(['s', 'y', 'f', 'g'], dtype=object)

In [46]:
# cap-surface의 유니크한 카테고리 데이터 개수 확인
one_x['cap-surface'].value_counts()

y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64

In [47]:
# 인코딩전
one_x['cap-surface']

0       s
1       s
2       s
3       y
4       s
       ..
8119    s
8120    s
8121    s
8122    y
8123    s
Name: cap-surface, Length: 8124, dtype: object

In [48]:
# After encoding: translate each surface letter into a hand-picked integer
# code with Series.map.  Running this cell a second time would map the
# already-encoded integers to NaN, because they are not keys of the dict.
# NOTE(review): x_train / x_test were split off before this cell, so they
# presumably still contain the original letters — confirm before modelling.
one_x['cap-surface'] = one_x['cap-surface'].map(
    {'y': 0, 's': 1, 'f': 2, 'g': 3}
)

In [49]:
one_x['cap-surface']

0       1
1       1
2       1
3       0
4       1
       ..
8119    1
8120    1
8121    1
8122    0
8123    1
Name: cap-surface, Length: 8124, dtype: int64

In [53]:
from sklearn.preprocessing import LabelEncoder

In [55]:
# Create a label encoder and learn the distinct values of 'cap-color'.
# NOTE(review): scikit-learn documents LabelEncoder as a tool for target
# labels (y); for input features OrdinalEncoder is the documented choice —
# consider switching.
encoder = LabelEncoder()
encoder.fit(one_x['cap-color'])

LabelEncoder()

In [57]:
# Overwrite 'cap-color' with the integer codes learned by the encoder.
# The codes appear to follow the alphabetical order of the letters
# (g -> 3, n -> 4, w -> 8, y -> 9 in the output below).
one_x['cap-color'] = encoder.transform(one_x['cap-color'])

In [58]:
one_x

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
0,1,4,t,p,f,c,n,k,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
1,1,9,t,a,f,c,b,k,e,c,...,p,n,n,g,0.0,0.0,0.0,0.0,0.0,1.0
2,1,8,t,l,f,c,b,n,e,c,...,p,n,n,m,1.0,0.0,0.0,0.0,0.0,0.0
3,0,8,t,p,f,c,n,n,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
4,1,3,f,n,f,w,b,k,t,e,...,e,n,a,g,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,4,f,n,a,c,b,y,e,?,...,p,b,c,l,0.0,0.0,0.0,1.0,0.0,0.0
8120,1,4,f,n,a,c,b,y,e,?,...,p,b,v,l,0.0,0.0,0.0,0.0,0.0,1.0
8121,1,4,f,n,a,c,b,n,e,?,...,p,b,c,l,0.0,0.0,1.0,0.0,0.0,0.0
8122,0,4,f,y,f,c,n,b,t,?,...,e,w,v,l,0.0,0.0,0.0,1.0,0.0,0.0


In [59]:
one_x['cap-color'].value_counts()

4    2284
3    1840
2    1500
9    1072
8    1040
0     168
5     144
1      44
7      16
6      16
Name: cap-color, dtype: int64