## What is a NaN value?

In [None]:
from numpy import nan

# NumPy 2.0 removed the `NaN` and `NAN` aliases, so importing them fails on
# current releases. Define them locally so all three spellings used in this
# notebook still refer to the same not-a-number value.
NaN = NAN = nan

In [None]:
nan

In [None]:
NaN

In [None]:
NAN

In [None]:
# import numpy as np

In [None]:
nan == True

In [None]:
nan == False

In [None]:
# NaN compares unequal to everything, even to itself, so this is False
nan == nan

In [None]:
import pandas as pd

In [None]:
# because == never matches NaN, use pd.isnull() to test for missing values
pd.isnull(nan)

In [None]:
pd.isnull(42)

In [None]:
pd.isnull(NAN)

In [None]:
pd.isnull(NaN)

In [None]:
import numpy as np

In [None]:
np.nan

In [None]:
# missing values are automatically converted into NaN
pd.read_csv('survey_visited.csv')

In [None]:
# Other parameters for read_csv that control how missing values are parsed:
# na_values        - additional values to recognize as NaN
# keep_default_na  - whether to keep pandas' default set of NaN markers
# na_filter        - whether to detect missing-value markers at all

# with keep_default_na=False and no na_values, nothing is parsed as NaN
pd.read_csv('survey_visited.csv', keep_default_na=False)

In [None]:
# treat the values 619 and 622 as missing, in addition to the default markers
pd.read_csv('survey_visited.csv', na_values=[619, 622])

In [None]:
# NaN values can also be introduced by merging DataFrames
# or by user input

## Find and count missing data

In [None]:
ebola = pd.read_csv('country_timeseries.csv')
ebola.head()

In [None]:
# the info() method gives an overview of missing values:
# it reports the non-null count of each column
ebola.info()

In [None]:
# count() gives basically the same information:
# the number of non-missing values per column
print(ebola.count())

In [None]:
# vectorized count of missing cases per column:
# total number of rows minus the non-missing count of each column
num_rows = len(ebola.index)
num_missing = num_rows - ebola.count()
print(num_missing)

In [None]:
import numpy as np

In [None]:
# isnull() marks every missing cell with True;
# counting the non-zero entries gives the total number of missing values
np.count_nonzero(ebola.isnull())

In [None]:
ebola.isnull()

In [None]:
# the same count, restricted to a single column
print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))

In [None]:
ebola['Cases_Guinea']  #isnull()

In [None]:
# yet another way is getting the frequency of each value
# with value_counts()
# with the dropna=False argument the NaN frequency is also shown

# get the first 5 value counts from the Cases_Guinea column
print(ebola.Cases_Guinea.value_counts(dropna=False).head())

## Cleaning Missing Data

In [None]:
# use the fillna() method to recode/replace missing values with another value
# by default fillna() returns a new DataFrame; its inplace parameter
# modifies the DataFrame itself instead
# NOTE: the result of this call is neither assigned nor the last expression
# of the cell, so it is discarded
ebola.fillna(0)#.iloc[:10, :5]
# ebola.fillna(0, inplace=True)
# ebola is therefore shown unchanged, still containing its NaN values
ebola

In [None]:
# fill forward: propagate the last valid observation into the NaNs below it
# (DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
# since pandas 2.1)
ebola.ffill().iloc[:10, :5]

In [None]:
# fill backward: propagate the next valid observation into the NaNs above it
# (DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated
# since pandas 2.1)
ebola.bfill().iloc[:10, :5]

In [None]:
ebola.iloc[:10, :5]

In [None]:
# fill missing values by interpolating between the surrounding values
# (interpolate() uses the linear method by default)
ebola.interpolate().iloc[:10, :5]

In [None]:
ebola.interpolate?

In [None]:
# dropping missing values:
# with its default arguments dropna() removes every row
# that contains at least one NaN

ebola_dropna = ebola.dropna()
print(ebola_dropna.shape)

In [None]:
ebola_dropna

## Calculations with missing data

In [None]:
# add the three country columns element-wise; a row's result is NaN
# as soon as one of its operands is NaN
ebola['Cases_multiple'] = (
    ebola['Cases_Guinea']
    + ebola['Cases_Liberia']
    + ebola['Cases_SierraLeone']
)

In [None]:
# keep only the three case columns and their sum for a side-by-side look
ebola_subset = ebola.loc[
    :,
    ['Cases_Guinea', 'Cases_Liberia', 'Cases_SierraLeone', 'Cases_multiple'],
]
print(ebola_subset.head(10))

In [None]:
# calculations with missing data will typically return a missing value
# some functions can skip over missing values with the skipna parameter

# skipna=True is the default: NaN values are ignored when summing
print(ebola['Cases_Guinea'].sum(skipna=True))

In [None]:
# with skipna=False any NaN in the column makes the whole sum NaN
print(ebola['Cases_Guinea'].sum(skipna=False))