### 1. 결측치 처리
- 제거
- 채우기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy score table: four columns of four values each, with np.nan marking the
# missing entries that the following cells drop or fill.
d = dict(
    score1=[100, 90, np.nan, 95],
    score2=[30, np.nan, 45, 56],
    score3=[52, 40, 80, 98],
    score4=[np.nan, np.nan, np.nan, 65],
)

In [3]:
df = pd.DataFrame(d)
df.head()
# Build the example frame from the dict d and preview its first rows
# (head() shows up to 5 rows by default, so all 4 rows appear here).

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,65.0


In [4]:
df.info()
# Print a structural summary of the frame: index range, per-column non-null
# counts, dtypes and memory usage.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   score1  3 non-null      float64
 1   score2  3 non-null      float64
 2   score3  4 non-null      int64  
 3   score4  1 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [5]:
df.isnull().sum()
# Number of missing values in each column (feature).

score1    1
score2    1
score3    0
score4    3
dtype: int64

In [6]:
df[df.score1.isnull()]
# Boolean-mask selection: show only the rows whose score1 value is missing.
# The same pattern works for inspecting the missing entries of any column.

Unnamed: 0,score1,score2,score3,score4
2,,45.0,80,


#### 행기준 삭제

In [7]:
df.dropna()
# Row-wise removal: drop every row that contains at least one missing value.

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


In [8]:
df.dropna(axis=0,how='any')
# The row-wise drop with its parameters written out: axis=0 works on rows and
# how='any' drops a row as soon as one value is missing. The result is the
# same as the bare df.dropna() call.

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


#### 열 기준 삭제

In [9]:
df.dropna(axis=1)
# axis=1 switches to column-wise removal: every column that contains a
# missing value is dropped, so only score3 remains.

Unnamed: 0,score3
0,52
1,40
2,80
3,98


#### 행의 전체 값이 결측인 행을 삭제

In [10]:
# Second toy table: score2 is missing everywhere and row 1 is missing in
# every column, which lets the how='all' and thresh options be demonstrated.
d2 = dict(
    score1=[100, np.nan, np.nan, 95],
    score2=[np.nan] * 4,
    score3=[52, np.nan, 80, 98],
    score4=[np.nan, np.nan, np.nan, 60],
)

In [11]:
df2 = pd.DataFrame(d2)
df2
# Second example frame; row 1 and column score2 consist entirely of NaN.

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
1,,,,
2,,,80.0,
3,95.0,,98.0,60.0


In [12]:
df2.dropna(how='all')
# With how='all', only rows (or columns, when axis=1) whose values are ALL
# missing are dropped; here that is row 1.

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
2,,,80.0,
3,95.0,,98.0,60.0


#### 임계치 설정해서 제거
- 임계치(thresh) = 행(또는 열)을 남기기 위해 필요한 '결측이 아닌 값'의 최소 개수

In [13]:
df2.dropna(thresh=2) 
# thresh is the minimum number of NON-missing values a row needs to be kept:
# thresh=2 keeps rows with at least 2 non-NaN values and drops the rest.
# With 4 columns that happens to equal "drop rows with more than 2 NaNs".
# Picking a sensible threshold is a judgement call for the analyst.

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
3,95.0,,98.0,60.0


#### 특정 열 안에서 삭제

In [14]:
df.dropna(subset=['score2','score4'])
# Only score2 and score4 are checked: keep the rows in which neither of the
# two columns is NaN (missing values in other columns are ignored).

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


### 결측치 채우기

#### 특정한 단일값으로 채우기

In [15]:
df.fillna(0)
# df.fillna(n) replaces every missing value with the single value n.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,0.0
1,90.0,0.0,40,0.0
2,0.0,45.0,80,0.0
3,95.0,56.0,98,65.0


In [16]:
df.ffill()
# Forward fill: each missing value takes the value from the previous row.
# ffill() replaces fillna(method='pad'); the `method` argument of fillna is
# deprecated since pandas 2.1 and both spellings give the same result.
# score4 has no earlier value in rows 0-2, so those cells remain NaN.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,
1,90.0,30.0,40,
2,90.0,45.0,80,
3,95.0,56.0,98,65.0


In [17]:
df.bfill()
# Backward fill: each missing value takes the value from the next row.
# bfill() replaces fillna(method='bfill'); the `method` argument of fillna is
# deprecated since pandas 2.1 and both spellings give the same result.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,45.0,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### 결측치를 각 열의 평균값으로 채우기

In [18]:
df.fillna(df.mean())
# df.mean() gives one mean per column; fillna matches them by column label,
# so each NaN is replaced by the mean of its own column.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,43.666667,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### 결측치를 중앙값, 최소값, 최대값으로 채우기

In [19]:
df.describe()
# Per-column summary statistics; the 50% (median), min and max rows are the
# values used to fill the gaps in the cells that follow.

Unnamed: 0,score1,score2,score3,score4
count,3.0,3.0,4.0,1.0
mean,95.0,43.666667,67.5,65.0
std,5.0,13.051181,26.350206,
min,90.0,30.0,40.0,65.0
25%,92.5,37.5,49.0,65.0
50%,95.0,45.0,66.0,65.0
75%,97.5,50.5,84.5,65.0
max,100.0,56.0,98.0,65.0


In [20]:
df.fillna(df.median())
# Fill each NaN with the median of its own column.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,45.0,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


In [21]:
df.fillna(df.min())
# Fill each NaN with the minimum of its own column.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,30.0,40,65.0
2,90.0,45.0,80,65.0
3,95.0,56.0,98,65.0


In [22]:
df.fillna(df.max())
# Fill each NaN with the maximum of its own column.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,56.0,40,65.0
2,100.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### replace() 함수로 결측치 채우기

In [23]:
df.replace(to_replace = np.nan, value=10)
# df.replace(to_replace=n, value=m) substitutes m for every occurrence of n;
# here every NaN becomes 10.
# The replacement value may also be a string.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,10.0
1,90.0,10.0,40,10.0
2,10.0,45.0,80,10.0
3,95.0,56.0,98,65.0


#### interpolate() 함수로 결측치 채우기
- 선형 방법을 사용해서 결측값 채워줌
- 선형방식은 인덱스를 무시하고 값들을 같은 간격으로 처리함.

In [24]:
df.interpolate(method='linear',limit_direction='forward')
# Linear interpolation: a gap is filled from the known values before and
# after it; for a single missing row that is their midpoint (mean).
# limit_direction='forward' only fills after a known value, so the leading
# NaNs of score4 stay NaN.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,
1,90.0,37.5,40,
2,92.5,45.0,80,
3,95.0,56.0,98,65.0


In [25]:
df.ffill()
# Forward fill, giving the same result as the fillna-based forward fill.
# Written as ffill() because interpolate(method='pad') is deprecated since
# pandas 2.1; the padding "interpolation" is exactly a forward fill.

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,
1,90.0,30.0,40,
2,90.0,45.0,80,
3,95.0,56.0,98,65.0


### 2. 범주형 특성을 one-hot encoding으로 변환
- 모든 데이터를 0과 1로 변환
- 레이블 인코딩은 범주에 숫자의 크기·순서 관계를 부여해 모델이 이를 잘못 해석할 수 있으므로, 순서가 없는 범주형 특성에는 원핫인코딩을 주로 사용

In [26]:
df = pd.read_csv('mushrooms.csv')
# Load the mushroom dataset from the working directory. This rebinds df, so
# the small score frame used in the earlier cells is no longer reachable
# through the name df.

In [27]:
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [28]:
one = pd.get_dummies(df, dtype='uint8')
# One-hot encode every categorical column of df (one 0/1 column per category).
# dtype is pinned to uint8, the default of older pandas releases, because
# pandas >= 2.0 would otherwise return True/False columns instead of 0/1.

In [29]:
one

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
8121,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


- pandas.get_dummies의 문제점

train 데이터와 test 데이터에 각각 따로 적용하면, 한쪽에만 존재하는 카테고리 때문에 원핫인코딩된 칼럼 구성이 서로 달라진다. (학습 시 본 카테고리를 기억했다가 새 데이터에 동일하게 적용하지 못한다.)

### sklearn OneHotEncoder 사용

In [30]:
x= df.iloc[:, 1:]
y= df['class']
# Split features and target: x is every column after the first one (class is
# the first column of df), y is the class label column.

In [31]:
y

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

In [32]:
from sklearn.preprocessing import OneHotEncoder

In [33]:
# Keep the encoder on its default (sparse) output and densify explicitly.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so this spelling runs on both old and new releases.
# Note: this rebinds `one`, which previously held the get_dummies frame.
one = OneHotEncoder()
train_cat = one.fit_transform(x[['cap-shape']]).toarray()

In [34]:
train_cat

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [35]:
one.categories_

[array(['b', 'c', 'f', 'k', 's', 'x'], dtype=object)]

In [36]:
# Wrap the encoded array in a DataFrame with one "cap-shape_<category>" column
# per learned category. The index of x is reused so that a later column-wise
# concat with x lines up row by row even if x does not carry the default
# 0..n-1 index.
o = pd.DataFrame(
    train_cat,
    index=x.index,
    columns=[f'cap-shape_{col}' for col in one.categories_[0]],
)

In [37]:
o

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
8119,0.0,0.0,0.0,1.0,0.0,0.0
8120,0.0,0.0,0.0,0.0,0.0,1.0
8121,0.0,0.0,1.0,0.0,0.0,0.0
8122,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
one_x = pd.concat([x.drop(columns=['cap-shape']),o],axis=1)
# Swap the raw cap-shape column for its one-hot columns: drop it from x and
# append the encoded frame o side by side (axis=1). concat matches rows by
# index label, so x and o must share the same index.

In [39]:
one_x

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
0,s,n,t,p,f,c,n,k,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
1,s,y,t,a,f,c,b,k,e,c,...,p,n,n,g,0.0,0.0,0.0,0.0,0.0,1.0
2,s,w,t,l,f,c,b,n,e,c,...,p,n,n,m,1.0,0.0,0.0,0.0,0.0,0.0
3,y,w,t,p,f,c,n,n,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
4,s,g,f,n,f,w,b,k,t,e,...,e,n,a,g,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,s,n,f,n,a,c,b,y,e,?,...,p,b,c,l,0.0,0.0,0.0,1.0,0.0,0.0
8120,s,n,f,n,a,c,b,y,e,?,...,p,b,v,l,0.0,0.0,0.0,0.0,0.0,1.0
8121,s,n,f,n,a,c,b,n,e,?,...,p,b,c,l,0.0,0.0,1.0,0.0,0.0,0.0
8122,y,n,f,y,f,c,n,b,t,?,...,e,w,v,l,0.0,0.0,0.0,1.0,0.0,0.0


### 범주특성을 레이블 인코딩으로 변환
- 숫자의 크고 작음에 대한 특성이 작용
- 회귀와 같이 연속된 실수를 다루는 알고리즘에서 숫자 크기에 따른 순서나 중요도로 인식될 수 있어서 잘못된 결과가 나올 수 있음 

In [40]:
one_x['cap-surface'].unique()
# Distinct categories present in the cap-surface column.

array(['s', 'y', 'f', 'g'], dtype=object)

In [41]:
one_x['cap-surface'].value_counts()
# Number of rows per cap-surface category, most frequent first.

y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64

In [42]:
one_x['cap-surface']

0       s
1       s
2       s
3       y
4       s
       ..
8119    s
8120    s
8121    s
8122    y
8123    s
Name: cap-surface, Length: 8124, dtype: object

In [43]:
one_x['cap-surface'] = one_x['cap-surface'].map({'y': 0,'s':1,'f':2,'g':3})
# Manual label encoding with a dict: codes are assigned in descending
# frequency order (y, s, f, g). map() turns any value that is missing from
# the dict into NaN, so the dict must list every category.

In [44]:
one_x

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
0,1,n,t,p,f,c,n,k,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
1,1,y,t,a,f,c,b,k,e,c,...,p,n,n,g,0.0,0.0,0.0,0.0,0.0,1.0
2,1,w,t,l,f,c,b,n,e,c,...,p,n,n,m,1.0,0.0,0.0,0.0,0.0,0.0
3,0,w,t,p,f,c,n,n,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
4,1,g,f,n,f,w,b,k,t,e,...,e,n,a,g,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,n,f,n,a,c,b,y,e,?,...,p,b,c,l,0.0,0.0,0.0,1.0,0.0,0.0
8120,1,n,f,n,a,c,b,y,e,?,...,p,b,v,l,0.0,0.0,0.0,0.0,0.0,1.0
8121,1,n,f,n,a,c,b,n,e,?,...,p,b,c,l,0.0,0.0,1.0,0.0,0.0,0.0
8122,0,n,f,y,f,c,n,b,t,?,...,e,w,v,l,0.0,0.0,0.0,1.0,0.0,0.0


#### sklearn LabelEncoder

In [45]:
from sklearn.preprocessing import LabelEncoder

In [46]:
# Label-encode cap-color in place: learn the distinct labels and replace each
# one with its integer code in a single fit_transform step.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; for
# feature columns OrdinalEncoder is the intended tool - confirm the choice.
encoder = LabelEncoder()
one_x['cap-color'] = encoder.fit_transform(one_x['cap-color'])

In [47]:
one_x

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
0,1,4,t,p,f,c,n,k,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
1,1,9,t,a,f,c,b,k,e,c,...,p,n,n,g,0.0,0.0,0.0,0.0,0.0,1.0
2,1,8,t,l,f,c,b,n,e,c,...,p,n,n,m,1.0,0.0,0.0,0.0,0.0,0.0
3,0,8,t,p,f,c,n,n,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
4,1,3,f,n,f,w,b,k,t,e,...,e,n,a,g,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,4,f,n,a,c,b,y,e,?,...,p,b,c,l,0.0,0.0,0.0,1.0,0.0,0.0
8120,1,4,f,n,a,c,b,y,e,?,...,p,b,v,l,0.0,0.0,0.0,0.0,0.0,1.0
8121,1,4,f,n,a,c,b,n,e,?,...,p,b,c,l,0.0,0.0,1.0,0.0,0.0,0.0
8122,0,4,f,y,f,c,n,b,t,?,...,e,w,v,l,0.0,0.0,0.0,1.0,0.0,0.0


#### 데이터 불균형 해소

In [49]:
y.value_counts()
# Number of rows per class label in the target.

e    4208
p    3916
Name: class, dtype: int64

In [50]:
# Split the target into one Series per class label with boolean masks.
e = y[y == 'e']
p = y[y == 'p']

In [51]:
e

1       e
2       e
4       e
5       e
6       e
       ..
8115    e
8119    e
8120    e
8121    e
8123    e
Name: class, Length: 4208, dtype: object

In [52]:
p

0       p
3       p
8       p
13      p
17      p
       ..
8114    p
8116    p
8117    p
8118    p
8122    p
Name: class, Length: 3916, dtype: object

In [53]:
e = e[:1000]
p = p[:1000]
# Undersample to 1000 rows per class so both classes have the same size.
# NOTE(review): the slice takes the FIRST 1000 rows of each class in file
# order, not a random sample; e.sample(n=1000, random_state=...) would avoid
# any ordering bias - confirm which is intended.

In [54]:
y = pd.concat([e,p],axis=0)
y
# Stack the two 1000-row subsets into the new, balanced target (2000 rows).
# NOTE(review): only y is reduced here; the feature frame has to be cut to
# the same rows (e.g. one_x.loc[y.index]) before modelling - verify that this
# happens downstream.

1       e
2       e
4       e
5       e
6       e
       ..
4369    p
4370    p
4371    p
4372    p
4373    p
Name: class, Length: 2000, dtype: object