## 누락값 처리하기

In [4]:
import pandas as pd
import numpy as np

#### 누락값 확인하기

In [8]:
# NaN is not equal to any ordinary value: not True, not False, not 0 and not
# the empty string.  `np.nan` is used because the `np.NaN` alias was removed
# in NumPy 2.0.
print(np.nan == True)
print(np.nan == False)
print(np.nan == 0)
print(np.nan == '')

False
False
False
False


In [10]:
# NaN never compares equal to anything -- not even to another NaN -- no matter
# how the NaN value was obtained.  Only `np.nan` is used here because the
# `np.NaN` alias was removed in NumPy 2.0.
print(np.nan == np.nan)
print(np.nan == float('nan'))
print(float('nan') == np.nan)
print(float('nan') == float('nan'))

False
False
False
False


In [11]:
# Because `==` cannot detect NaN, use pd.isnull() to test for missing values.
# It recognises every missing-value marker: NaN, None and NaT.
# (`np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.)
print(pd.isnull(np.nan))
print(pd.isnull(None))
print(pd.isnull(pd.NaT))

# pd.notnull() is the inverse test: False for NaN, True for real values.
print(pd.notnull(np.nan))
print(pd.notnull(42))
print(pd.notnull('missing'))

True
True
True
False
True
True


#### 누락값 생기는 이유
- 누락값 있는 데이터 집합 연결하기

In [19]:
# Load the two survey tables that are merged in the following cell.
# NOTE(review): these are absolute, machine-specific Windows paths -- adjust
# them to wherever the CSV files live on your machine.
visited = pd.read_csv('C:/STUDY/Pandas/survey_visited.csv')
survey = pd.read_csv('C:/STUDY/Pandas/survey_survey.csv')

In [16]:
# Join the two tables (default inner join), matching visited.ident against
# survey.taken.  Missing values already present in either table are carried
# into the merged result.
vs = pd.merge(visited, survey, left_on='ident', right_on='taken')
vs

Unnamed: 0,ident,site,dated,taken,person,quant,reading
0,619,DR-1,1927-02-08,619,dyer,rad,9.82
1,619,DR-1,1927-02-08,619,dyer,sal,0.13
2,622,DR-1,1927-02-10,622,dyer,rad,7.8
3,622,DR-1,1927-02-10,622,dyer,sal,0.09
4,734,DR-3,1939-01-07,734,pb,rad,8.41
5,734,DR-3,1939-01-07,734,lake,sal,0.05
6,734,DR-3,1939-01-07,734,pb,temp,-21.5
7,735,DR-3,1930-01-12,735,pb,rad,7.22
8,735,DR-3,1930-01-12,735,,sal,0.06
9,735,DR-3,1930-01-12,735,,temp,-26.0


- 데이터 입력할 때

In [33]:
# `nan` is NumPy's canonical missing-value constant.  The `NaN` alias was
# removed in NumPy 2.0 and can no longer be imported, so it is bound locally
# to keep any code that still spells it `NaN` working.
from numpy import nan

NaN = nan

In [30]:
# A missing value typed in by hand: the NaN entry forces the whole Series to
# float64, so the integer 4 is stored as 4.0.
num_legs = pd.Series([4, nan], index=['goat', 'amoeba'])
num_legs

goat      4.0
amoeba    NaN
dtype: float64

In [34]:
# Build a small frame whose 'missing' column contains nothing but missing
# values.  Both entries use `nan`; the `NaN` alias no longer exists in
# NumPy 2.0.
scientists = pd.DataFrame({
    'Name': ['Rosaline Franklin', 'William Gosset'],
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'missing': [nan, nan],
})
scientists

Unnamed: 0,Name,Occupation,Born,missing
0,Rosaline Franklin,Chemist,1920-07-25,
1,William Gosset,Statistician,1876-06-13,


In [36]:
# Print a summary of the frame: the non-null count and dtype of every column.
# The all-NaN 'missing' column shows up as "0 non-null".
scientists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        2 non-null      object 
 1   Occupation  2 non-null      object 
 2   Born        2 non-null      object 
 3   missing     0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 192.0+ bytes


#### 누락값 개수 확인

In [70]:
# Load the Ebola country time-series data and preview the first rows.
# NOTE(review): absolute, machine-specific Windows path -- adjust as needed.
ebola = pd.read_csv('C:/STUDY/Pandas/country_timeseries.csv')
ebola.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


In [40]:
# count() returns, for each column, the number of values that are NOT missing.
ebola.count()

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64

In [46]:
# Missing values per column = total number of rows - number of non-missing
# values.  The scalar row count is broadcast against the per-column Series
# returned by count().
num_rows = len(ebola)
num_missing = num_rows - ebola.count()
num_missing

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64

In [54]:
# np.count_nonzero() counts the entries that are not 0/False.  Applied to the
# boolean mask returned by isnull(), it therefore counts the missing values.
print(np.count_nonzero(ebola.isnull()))                  # whole frame
print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))  # a single column

1214
29


In [49]:
# value_counts() tallies how often each value occurs in the selected column;
# dropna=False makes the tally include NaN as well.
ebola['Cases_Guinea'].value_counts(dropna=False)

NaN       29
86.0       3
495.0      2
112.0      2
390.0      2
          ..
1199.0     1
1298.0     1
1350.0     1
1472.0     1
49.0       1
Name: Cases_Guinea, Length: 89, dtype: int64

#### 누락값 변경하기 - fillna()

In [55]:
# Preview the first five columns (before any missing values are replaced).
ebola.iloc[:,0:5].head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone
0,1/5/2015,289,2776.0,,10030.0
1,1/4/2015,288,2775.0,,9780.0
2,1/3/2015,287,2769.0,8166.0,9722.0
3,1/2/2015,286,,8157.0,
4,12/31/2014,284,2730.0,8115.0,9633.0


In [56]:
# Replace every missing value with 0 and show the first five columns.
# fillna() returns a new frame here; `ebola` itself is left unchanged.
ebola.fillna(0).iloc[:,0:5]

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone
0,1/5/2015,289,2776.0,0.0,10030.0
1,1/4/2015,288,2775.0,0.0,9780.0
2,1/3/2015,287,2769.0,8166.0,9722.0
3,1/2/2015,286,0.0,8157.0,0.0
4,12/31/2014,284,2730.0,8115.0,9633.0
...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0
118,3/26/2014,4,86.0,0.0,0.0
119,3/25/2014,3,86.0,0.0,0.0
120,3/24/2014,2,86.0,0.0,0.0


In [57]:
# Mean of the column; missing values are skipped by default.
ebola['Cases_Guinea'].mean()

911.0645161290323

In [58]:
# Forward fill: each NaN takes the last valid value above it in the same
# column.  Leading NaNs (nothing above them) stay NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
ebola.ffill().iloc[0:5,0:5]

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone
0,1/5/2015,289,2776.0,,10030.0
1,1/4/2015,288,2775.0,,9780.0
2,1/3/2015,287,2769.0,8166.0,9722.0
3,1/2/2015,286,2769.0,8157.0,9722.0
4,12/31/2014,284,2730.0,8115.0,9633.0


In [59]:
# Backward fill: each NaN takes the next valid value below it in the same
# column.  DataFrame.bfill() replaces the deprecated fillna(method='bfill').
ebola.bfill().iloc[0:5,0:5]

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone
0,1/5/2015,289,2776.0,8166.0,10030.0
1,1/4/2015,288,2775.0,8166.0,9780.0
2,1/3/2015,287,2769.0,8166.0,9722.0
3,1/2/2015,286,2730.0,8157.0,9633.0
4,12/31/2014,284,2730.0,8115.0,9633.0


In [60]:
# Mean of the column after backward-filling its gaps.  Series.bfill() replaces
# the deprecated fillna(method='bfill') spelling.
ebola['Cases_Guinea'].bfill().mean()

1005.139344262295

In [61]:
# interpolate() fills gaps from the neighbouring values (linear by default):
# a single missing entry becomes the midpoint of the values above and below.
# Leading NaNs have no earlier value to interpolate from and stay NaN.
ebola.interpolate().iloc[0:5,0:5]

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone
0,1/5/2015,289,2776.0,,10030.0
1,1/4/2015,288,2775.0,,9780.0
2,1/3/2015,287,2769.0,8166.0,9722.0
3,1/2/2015,286,2749.5,8157.0,9677.5
4,12/31/2014,284,2730.0,8115.0,9633.0


#### 누락값 삭제하기 - dropna()

In [67]:
# dropna() with default arguments removes every row that contains at least
# one NaN, which can discard almost all of the data.
ebola_dropna = ebola.dropna()
print(ebola_dropna.shape)
print(ebola_dropna.iloc[0:5,0:5])  # any row holding a NaN is dropped entirely

(1, 18)
          Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
19  11/18/2014  241        2047.0         7082.0             6190.0


In [68]:
# ebola.fillna(method='ffill', inplace=True)

In [71]:
# Re-read the CSV so the following cells work on the original data.
# NOTE(review): absolute, machine-specific Windows path -- adjust as needed.
ebola = pd.read_csv('C:/STUDY/Pandas/country_timeseries.csv')

In [72]:
# With skipna=False the missing values are not skipped, so the column's NaNs
# make the mean itself NaN.
ebola['Cases_Guinea'].mean(skipna=False)

nan

In [73]:
# Element-wise addition propagates missing values: the sum is NaN wherever
# either operand is NaN.
ebola['Cases_Guinea'] + ebola['Cases_Liberia']

0          NaN
1          NaN
2      10935.0
3          NaN
4      10845.0
        ...   
117      111.0
118        NaN
119        NaN
120        NaN
121        NaN
Length: 122, dtype: float64