### 누락값 처리


*   대부분의 실제 데이터들은 정제되지 않고 누락값들이 존재
*   서로 다른 데이터들은 다른 형태의 결측을 가짐
*   결측 데이터는 null, NaN, NA로 표기


In [2]:
import pandas as pd
import numpy as np

#### None: 파이썬 누락 데이터

In [3]:
# A None among the integers makes NumPy store the values as generic Python
# objects (dtype=object in the output below).
a = np.array([1, 2, None, 4, 5])
a

array([1, 2, None, 4, 5], dtype=object)

In [4]:
# Intentionally left commented out: summing the object array that contains
# None raises TypeError (see the recorded output below).
# a.sum()

TypeError: ignored

#### NaN: 누락된 수치 데이터

In [6]:
# With np.nan as the missing marker the array keeps a native numeric dtype
# (float64 in the output below).
a = np.array([1, 2, np.nan, 4, 5])
a.dtype

dtype('float64')

In [10]:
# Adding NaN to a number gives NaN.
0 + np.nan

nan

In [8]:
# NaN + NaN is NaN as well.
np.nan + np.nan

nan

In [9]:
# Plain reductions propagate the NaN stored in `a`: all three results are nan.
a.sum(), a.min(), a.max()

(nan, nan, nan)

In [11]:
# The nan-prefixed reductions ignore the NaN entry and use the remaining values.
np.nansum(a), np.nanmin(a), np.nanmax(a)

(12.0, 1.0, 5.0)

In [13]:
# In a numeric Series both np.nan and None are shown as NaN and the dtype
# becomes float64 (see output).
pd.Series([1, 2, np.nan, 4, None])

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64

In [14]:
# Integer Series with no missing values (int64 in the output below).
s = pd.Series(range(5), dtype=int)
s

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [16]:
# Assigning None into the integer Series stores NaN and upcasts the whole
# Series to float64 (see output).
# NOTE(review): newer pandas releases may warn about this implicit upcast - confirm.
s[0] = None
s

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [17]:
# Mark another element as missing, this time with np.nan.
s[3] = np.nan

In [19]:
# Booleans mixed with None and NaN: both markers are kept as given and the
# Series falls back to dtype object (see output).
s = pd.Series([True, False, None, np.nan])
s

0     True
1    False
2     None
3      NaN
dtype: object

#### Null 값 처리

In [21]:
# Mixed numbers and a string with both NaN and None as missing markers;
# the resulting dtype is object (see output).
s = pd.Series([1, 2, np.nan, 'String', None])
s

0         1
1         2
2       NaN
3    String
4      None
dtype: object

In [22]:
# Boolean mask that is True where the entry is missing (both NaN and None).
s.isnull()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [23]:
# Keep only the non-missing entries by indexing with a boolean mask.
s[s.notnull()]

0         1
1         2
3    String
dtype: object

In [24]:
# dropna() removes the missing entries; the output matches the mask-based
# selection s[s.notnull()].
s.dropna()

0         1
1         2
3    String
dtype: object

In [26]:
# 100 rows of standard-normal samples in columns a, b and c (drawn in that order).
df = pd.DataFrame({label: np.random.randn(100) for label in ('a', 'b', 'c')})
df

Unnamed: 0,a,b,c
0,0.565959,0.938298,-0.488098
1,-0.904615,1.607846,-0.858925
2,0.000982,-0.927009,-0.280384
3,-1.900124,0.695701,0.345090
4,-0.494950,0.396584,0.546067
...,...,...,...
95,-1.626816,-1.660603,-0.170808
96,-1.097631,0.630780,-0.102535
97,0.340152,0.885809,0.395879
98,-0.066972,-0.047738,-0.778017


In [27]:
# Drop columns that contain missing values; df has none at this point, so the
# output still shows columns a, b and c.
df.dropna(axis='columns')

Unnamed: 0,a,b,c
0,0.565959,0.938298,-0.488098
1,-0.904615,1.607846,-0.858925
2,0.000982,-0.927009,-0.280384
3,-1.900124,0.695701,0.345090
4,-0.494950,0.396584,0.546067
...,...,...,...
95,-1.626816,-1.660603,-0.170808
96,-1.097631,0.630780,-0.102535
97,0.340152,0.885809,0.395879
98,-0.066972,-0.047738,-0.778017


In [28]:
# Add a column labelled 3 (an integer label) whose values are all NaN.
df[3] = np.nan
df

Unnamed: 0,a,b,c,3
0,0.565959,0.938298,-0.488098,
1,-0.904615,1.607846,-0.858925,
2,0.000982,-0.927009,-0.280384,
3,-1.900124,0.695701,0.345090,
4,-0.494950,0.396584,0.546067,
...,...,...,...,...
95,-1.626816,-1.660603,-0.170808,
96,-1.097631,0.630780,-0.102535,
97,0.340152,0.885809,0.395879,
98,-0.066972,-0.047738,-0.778017,


In [29]:
# how='all' drops only columns in which every value is missing - here the
# all-NaN column 3 is removed and a, b, c remain.
df.dropna(axis='columns', how='all')

Unnamed: 0,a,b,c
0,0.565959,0.938298,-0.488098
1,-0.904615,1.607846,-0.858925
2,0.000982,-0.927009,-0.280384
3,-1.900124,0.695701,0.345090
4,-0.494950,0.396584,0.546067
...,...,...,...
95,-1.626816,-1.660603,-0.170808
96,-1.097631,0.630780,-0.102535
97,0.340152,0.885809,0.395879
98,-0.066972,-0.047738,-0.778017


In [30]:
df.dropna(axis='rows', thresh=3)    # along the given axis, drop rows/columns that have fewer than 3 non-missing values

Unnamed: 0,a,b,c,3
0,0.565959,0.938298,-0.488098,
1,-0.904615,1.607846,-0.858925,
2,0.000982,-0.927009,-0.280384,
3,-1.900124,0.695701,0.345090,
4,-0.494950,0.396584,0.546067,
...,...,...,...,...
95,-1.626816,-1.660603,-0.170808,
96,-1.097631,0.630780,-0.102535,
97,0.340152,0.885809,0.395879,
98,-0.066972,-0.047738,-0.778017,


In [31]:
s

0         1
1         2
2       NaN
3    String
4      None
dtype: object

In [32]:
# Replace every missing entry (NaN and None alike) with 0.
s.fillna(0)

0         1
1         2
2         0
3    String
4         0
dtype: object

In [33]:
# Forward fill: each missing entry takes the last valid value before it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated since pandas 2.1.
s.ffill()

0         1
1         2
2         2
3    String
4    String
dtype: object

In [34]:
# Backward fill: each missing entry takes the next valid value after it; a
# trailing missing entry has nothing to copy from and stays missing.
# Series.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated since pandas 2.1.
s.bfill()

0         1
1         2
2    String
3    String
4      None
dtype: object

In [35]:
df

Unnamed: 0,a,b,c,3
0,0.565959,0.938298,-0.488098,
1,-0.904615,1.607846,-0.858925,
2,0.000982,-0.927009,-0.280384,
3,-1.900124,0.695701,0.345090,
4,-0.494950,0.396584,0.546067,
...,...,...,...,...
95,-1.626816,-1.660603,-0.170808,
96,-1.097631,0.630780,-0.102535,
97,0.340152,0.885809,0.395879,
98,-0.066972,-0.047738,-0.778017,


In [36]:
# Forward fill down each column (axis=0). Column 3 holds no valid value to
# propagate, so it stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated since pandas 2.1.
df.ffill(axis=0)

Unnamed: 0,a,b,c,3
0,0.565959,0.938298,-0.488098,
1,-0.904615,1.607846,-0.858925,
2,0.000982,-0.927009,-0.280384,
3,-1.900124,0.695701,0.345090,
4,-0.494950,0.396584,0.546067,
...,...,...,...,...
95,-1.626816,-1.660603,-0.170808,
96,-1.097631,0.630780,-0.102535,
97,0.340152,0.885809,0.395879,
98,-0.066972,-0.047738,-0.778017,


In [37]:
# Forward fill along each row (axis=1): the NaN in column 3 takes the value of
# the column to its left (c).
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated since pandas 2.1.
df.ffill(axis=1)

Unnamed: 0,a,b,c,3
0,0.565959,0.938298,-0.488098,-0.488098
1,-0.904615,1.607846,-0.858925,-0.858925
2,0.000982,-0.927009,-0.280384,-0.280384
3,-1.900124,0.695701,0.345090,0.345090
4,-0.494950,0.396584,0.546067,0.546067
...,...,...,...,...
95,-1.626816,-1.660603,-0.170808,-0.170808
96,-1.097631,0.630780,-0.102535,-0.102535
97,0.340152,0.885809,0.395879,0.395879
98,-0.066972,-0.047738,-0.778017,-0.778017


In [38]:
# Backward fill up each column (axis=0). Column 3 holds no valid value to
# propagate, so it stays NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated since pandas 2.1.
df.bfill(axis=0)

Unnamed: 0,a,b,c,3
0,0.565959,0.938298,-0.488098,
1,-0.904615,1.607846,-0.858925,
2,0.000982,-0.927009,-0.280384,
3,-1.900124,0.695701,0.345090,
4,-0.494950,0.396584,0.546067,
...,...,...,...,...
95,-1.626816,-1.660603,-0.170808,
96,-1.097631,0.630780,-0.102535,
97,0.340152,0.885809,0.395879,
98,-0.066972,-0.047738,-0.778017,


In [39]:
# Backward fill along each row (axis=1). Column 3 is the last column, so there
# is no value to its right to copy and it stays NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated since pandas 2.1.
df.bfill(axis=1)

Unnamed: 0,a,b,c,3
0,0.565959,0.938298,-0.488098,
1,-0.904615,1.607846,-0.858925,
2,0.000982,-0.927009,-0.280384,
3,-1.900124,0.695701,0.345090,
4,-0.494950,0.396584,0.546067,
...,...,...,...,...
95,-1.626816,-1.660603,-0.170808,
96,-1.097631,0.630780,-0.102535,
97,0.340152,0.885809,0.395879,
98,-0.066972,-0.047738,-0.778017,


### 중복 제거

In [40]:
# Eight rows: a/b/c repeated twice, then one extra b and one extra c.
# Rows 3 and 4 repeat the (c1, c2) pairs of rows 0 and 1.
df = pd.DataFrame(
    {
        'c1': ['a', 'b', 'c'] * 2 + ['b', 'c'],
        'c2': [1, 2, 1, 1, 2, 3, 3, 4],
    }
)
df

Unnamed: 0,c1,c2
0,a,1
1,b,2
2,c,1
3,a,1
4,b,2
5,c,3
6,b,3
7,c,4


In [41]:
# True for rows that repeat an earlier row across all columns; the first
# occurrence stays False.
df.duplicated()

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7    False
dtype: bool

In [42]:
# Remove the repeated rows, keeping the first occurrence of each (the original
# index labels are preserved in the output).
df.drop_duplicates()

Unnamed: 0,c1,c2
0,a,1
1,b,2
2,c,1
5,c,3
6,b,3
7,c,4


### 값 치환

In [43]:
# Float Series containing -999 and -1000, the values that the following
# cells swap out with replace().
s = pd.Series([1.0, 2.0, -999.0, 3.0, -1000.0, 4.0])
s

0       1.0
1       2.0
2    -999.0
3       3.0
4   -1000.0
5       4.0
dtype: float64

In [44]:
# Replace one specific value (-999) with NaN; -1000 is left untouched.
s.replace(-999, np.nan)

0       1.0
1       2.0
2       NaN
3       3.0
4   -1000.0
5       4.0
dtype: float64

In [45]:
# A list of values to replace combined with a single replacement: both -999
# and -1000 become NaN.
s.replace([-999, -1000], np.nan)

0    1.0
1    2.0
2    NaN
3    3.0
4    NaN
5    4.0
dtype: float64

In [46]:
# Per-value replacements expressed as a mapping: -999 becomes NaN, -1000 becomes 0.
s.replace({-999: np.nan, -1000: 0})

0    1.0
1    2.0
2    NaN
3    3.0
4    0.0
5    4.0
dtype: float64