# 06. Missing Data
***

# 06-1 누락값이란?

## 누락값 확인하기

* 누락값은 NaN, NAN, nan과 같은 방법으로 표기한다.
* 누락값을 파이썬에서 사용하려면 numpy 라이브러리가 필요하다. (단, NumPy 2.0부터는 `NaN`, `NAN` 별칭이 제거되어 `nan`만 직접 임포트할 수 있다.)

In [3]:
# NumPy 2.0 removed the `NaN` and `NAN` aliases, so only `nan` can still be
# imported. Rebind the legacy spellings to the same object so the cells below
# run unchanged on both old and new NumPy releases.
from numpy import nan

NaN = NAN = nan

In [None]:
# NaN does not compare equal to the boolean True; prints False.
print(NaN == True)

In [None]:
# NaN does not compare equal to False either; prints False.
print(NaN == False)

In [None]:
# NaN is not the same as zero; prints False.
print(NaN == 0)

In [None]:
# NaN is not the same as an empty string; prints False.
print(NaN == '')

---

In [None]:
# A missing value is not even equal to itself; prints False.
print(NaN == NaN)

In [None]:
# Comparing two spellings of the missing value with == also prints False.
print(NaN == nan)

In [None]:
# Same comparison with the upper-case spelling; prints False.
print(NaN == NAN)

In [None]:
# Equality between missing values is always False, whichever spelling is used.
print(nan == NAN)

---

In [4]:
import pandas as pd

# Because == cannot detect a missing value, use pd.isnull instead; prints True.
print(pd.isnull(NaN))

True


In [None]:
# pd.isnull also recognises the lower-case spelling; prints True.
print(pd.isnull(nan))

In [None]:
# pd.isnull also recognises the upper-case spelling; prints True.
print(pd.isnull(NAN))

In [None]:
# pd.notnull is the inverse check: a missing value is not "not null"; prints False.
print(pd.notnull(NaN))

In [None]:
# An ordinary number is a present value; prints True.
print(pd.notnull(42))

In [None]:
# The string 'missing' is a regular value, not a missing one; prints True.
print(pd.notnull('missing'))

## 누락값이 생기는 이유 알아보기

### 1. 누락값이 있는 데이터 집합을 연결할 때 누락값이 생기는 경우

In [5]:
# Load the two survey tables that are joined later in this section.
visited = pd.read_csv('../data/survey_visited.csv')
survey = pd.read_csv('../data/survey_survey.csv')

# The visited table already contains a missing value (NaN) in its dated column.
print(visited)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [6]:
# The survey table also contains missing values, here in the person column.
print(survey)

    taken person quant  reading
0     619   dyer   rad     9.82
1     619   dyer   sal     0.13
2     622   dyer   rad     7.80
3     622   dyer   sal     0.09
4     734     pb   rad     8.41
5     734   lake   sal     0.05
6     734     pb  temp   -21.50
7     735     pb   rad     7.22
8     735    NaN   sal     0.06
9     735    NaN  temp   -26.00
10    751     pb   rad     4.35
11    751     pb  temp   -18.50
12    751   lake   sal     0.10
13    752   lake   rad     2.19
14    752   lake   sal     0.09
15    752   lake  temp   -16.00
16    752    roe   sal    41.60
17    837   lake   rad     1.46
18    837   lake   sal     0.21
19    837    roe   sal    22.50
20    844    roe   rad    11.25


In [None]:
# Join every visit to its survey readings by matching visited.ident against
# survey.taken. NaN values already present in either table show up in the
# joined result as well.
vs = pd.merge(visited, survey, left_on='ident', right_on='taken')
print(vs)

### 2. 데이터를 입력할 때 누락값이 생기는 경우

In [None]:
# A missing value can be typed in directly when building a Series:
# the amoeba entry has no leg count, so it is recorded as nan.
num_legs = pd.Series([4, nan], index=['goat', 'amoeba'])
print(num_legs)
print(type(num_legs))

In [None]:
# Build a two-row DataFrame by hand; the 'missing' column is deliberately
# filled with missing values entered directly by the author.
scientists = pd.DataFrame(
    dict(
        Name=['Rosaline Franklin', 'William Gosset'],
        Occupation=['Chemist', 'Statistician'],
        Born=['1920-07-25', '1876-06-13'],
        Died=['1958-04-16', '1937-10-16'],
        missing=[NaN, nan],
    )
)

print(scientists)
print(type(scientists))

### 3. 범위를 지정하여 데이터를 추출할 때 누락값이 생기는 경우

In [10]:
# Load the tab-separated gapminder data set (hence sep='\t').
gapminder = pd.read_csv('../data/gapminder.tsv', sep='\t')
print(gapminder)

          country continent  year  lifeExp       pop   gdpPercap
0     Afghanistan      Asia  1952   28.801   8425333  779.445314
1     Afghanistan      Asia  1957   30.332   9240934  820.853030
2     Afghanistan      Asia  1962   31.997  10267083  853.100710
3     Afghanistan      Asia  1967   34.020  11537966  836.197138
4     Afghanistan      Asia  1972   36.088  13079460  739.981106
...           ...       ...   ...      ...       ...         ...
1699     Zimbabwe    Africa  1987   62.351   9216418  706.157306
1700     Zimbabwe    Africa  1992   60.377  10704340  693.420786
1701     Zimbabwe    Africa  1997   46.809  11404948  792.449960
1702     Zimbabwe    Africa  2002   39.989  11926563  672.038623
1703     Zimbabwe    Africa  2007   43.487  12311143  469.709298

[1704 rows x 6 columns]


In [9]:
# Average life expectancy for each year; the result is a Series indexed by year.
life_exp = gapminder['lifeExp'].groupby(gapminder['year']).mean()
print(life_exp)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [None]:
# Ask for every year from 2000 through 2009. Only some of those years exist in
# the index, and label lookup through .loc raises KeyError for absent labels on
# pandas >= 1.0. reindex() is the supported way to request arbitrary labels:
# years that are not in the data come back as NaN, which is exactly the
# "missing values created by range selection" this section demonstrates.
print(life_exp.reindex(range(2000, 2010)))

In [None]:
# Selecting with a boolean mask keeps only rows that actually exist,
# so no missing values are introduced.
y2000 = life_exp.loc[life_exp.index > 2000]
print(y2000)

## 누락값의 개수 구하기

In [None]:
# Load the ebola time-series data used for the rest of this chapter.
ebola = pd.read_csv('../data/country_timeseries.csv')

In [None]:
print(ebola.count()) # count() returns the number of non-missing values in each column

In [None]:
# Missing values per column = total number of rows minus the non-missing count.
num_rows = len(ebola)
num_missing = num_rows - ebola.count()
print(num_missing)

In [None]:
import numpy as np

# isnull() marks every missing cell as True; count_nonzero counts those True
# cells, giving the total number of missing values in the whole DataFrame.
print(np.count_nonzero(ebola.isnull()))

In [None]:
# Same technique restricted to a single column: missing values in Cases_Guinea.
print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))

In [None]:
# Frequency table of Cases_Guinea; dropna=False makes NaN show up as its own
# entry so the number of missing values is visible too.
print(ebola['Cases_Guinea'].value_counts(dropna=False).head())

## 누락값 처리하기 ― 변경, 삭제

### 1. 누락값 변경하기

In [None]:
# Replace every missing value with 0, then show the first 10 rows and 5 columns.
print(ebola.fillna(0).iloc[:10, :5])

In [None]:
# Forward fill: each missing value takes the last valid value above it in the
# same column (leading NaNs stay NaN because nothing precedes them).
# fillna(method='ffill') is deprecated since pandas 2.1; ffill() is the
# dedicated method.
print(ebola.ffill().iloc[0:10, 0:5])

In [None]:
# Backward fill: each missing value takes the next valid value below it in the
# same column (trailing NaNs stay NaN because nothing follows them).
# fillna(method='bfill') is deprecated since pandas 2.1; bfill() is the
# dedicated method.
print(ebola.bfill().iloc[0:10, 0:5])

In [None]:
# Fill missing values by interpolating between the neighbouring values,
# then show the first 10 rows and 5 columns.
print(ebola.interpolate().iloc[:10, :5])

### 2. 누락값 삭제하기

In [None]:
# Size of the data (rows, columns) before dropping anything.
print(ebola.shape)

In [None]:
# dropna() with default arguments removes every row that contains at least one
# missing value; comparing the shapes shows how many rows are lost.
ebola_dropna = ebola.dropna()
print(ebola_dropna.shape)

In [None]:
# Show the rows that survived dropna().
print(ebola_dropna)

## 누락값이 포함된 데이터 계산하기

In [None]:
# Element-wise sum of the three case columns. With the + operator a row becomes
# NaN as soon as any one of the three inputs is NaN.
ebola['Cases_multiple'] = ebola['Cases_Guinea'] + ebola['Cases_Liberia'] + ebola['Cases_SierraLeone']

In [None]:
# Put the three input columns next to the computed sum to inspect how
# missing values propagated.
ebola_subset = ebola[['Cases_Guinea', 'Cases_Liberia', 'Cases_SierraLeone', 'Cases_multiple']]
print(ebola_subset.head(10))

In [None]:
# skipna=True ignores missing values, so the sum covers only the present ones.
print(ebola['Cases_Guinea'].sum(skipna=True))

In [None]:
# skipna=False lets missing values take part in the sum, so the result is NaN
# whenever the column contains at least one missing value.
print(ebola['Cases_Guinea'].sum(skipna=False))