##### <b> Missing Data </b><br> - Represented as NaN, which is a float value, so it can be used in vectorized operations

In [1]:
import numpy as np
import pandas as pd

In [2]:
# depending on the analysis being performed, you may want to fill the NaN values so that operations produce a result other than NaN

In [3]:
# Build a numeric Series that contains one missing entry.
# NaN is itself a float, so its presence upcasts the integer values to float64.
sales = pd.Series(data=[0, 5, 155, np.nan, 518])
sales

0      0.0
1      5.0
2    155.0
3      NaN
4    518.0
dtype: float64

In [4]:
# Arithmetic on a Series propagates missing values: NaN + 2 is still NaN,
# while every other element is incremented as usual
sales + 2

0      2.0
1      7.0
2    157.0
3      NaN
4    520.0
dtype: float64

In [5]:
# To avoid a NaN result, use the method form of the operator (.add) and pass
# fill_value= so that missing entries are replaced before the addition happens
sales_nonull = sales.add(2, fill_value=0)
# the NaN is treated as 0, so 0 + 2 = 2; the result is stored in a new Series
# and the original `sales` still contains its NaN
sales_nonull

0      2.0
1      7.0
2    157.0
3      2.0
4    520.0
dtype: float64

##### <b> Identify Missing Data </b><br> The .isna() and .value_counts() methods can identify missing data <br> - value_counts by default does not include na/NaN (use dropna=False to include them)

In [6]:
# Build a text Series whose three middle entries are missing (NaN)
checklist = pd.Series(['COMPLETE'] + [np.nan] * 3 + ['COMPLETE'])
checklist

0    COMPLETE
1         NaN
2         NaN
3         NaN
4    COMPLETE
dtype: object

In [7]:
# .isna() returns a boolean Series: True where the value is na/NaN, False otherwise.
# Booleans behave as 1/0 in arithmetic, so .sum() gives the number of missing
# values and .mean() gives the fraction of values that are missing.
# (Only the last expression of a cell is displayed, so the count is the single
# expression here rather than following a bare .isna() call whose result would be discarded.)
checklist.isna().sum()

3

In [8]:
# The boolean Series produced by .isna() can also serve as a boolean mask:
# selecting with it keeps only the rows where the mask is True
mask = checklist.isna()
bool_checklist = checklist.loc[mask]
bool_checklist

1    NaN
2    NaN
3    NaN
dtype: object

In [9]:
# .value_counts() leaves na/NaN out of the tally; dropna=True is its default
# and is spelled out here only to make that explicit
checklist.value_counts(dropna=True)

COMPLETE    2
Name: count, dtype: int64

In [10]:
# pass dropna=False so that na/NaN is counted as its own category
checklist.value_counts(dropna=False)

NaN         3
COMPLETE    2
Name: count, dtype: int64

##### .dropna() removes NaN values from a Series or DataFrame <br> after dropping rows, use .reset_index(drop=True) when you need a clean, sequential index

In [11]:
# .dropna() returns a new Series without the na/NaN entries; `checklist` itself is
# not modified, so assign the result to a variable if you want to keep it.
# Note that the surviving rows keep their original index labels (0 and 4)
checklist.dropna()

0    COMPLETE
4    COMPLETE
dtype: object

In [12]:
# chain .reset_index(drop=True) to renumber the remaining rows starting from 0;
# drop=True discards the previous index instead of keeping it as extra data.
# The result is a new Series: assign it to a variable to keep it
checklist.dropna().reset_index(drop=True)

0    COMPLETE
1    COMPLETE
dtype: object

In [13]:
# .fillna() returns a new Series rather than changing `checklist`, so assign the result to keep it.
# It replaces every na/NaN with the given value. For categorical data a label such as
# "MISSING" or "INCOMPLETE" can be used; the right choice depends on the dataset
checklist.fillna("INCOMPLETE")

0      COMPLETE
1    INCOMPLETE
2    INCOMPLETE
3    INCOMPLETE
4      COMPLETE
dtype: object

##### be thoughtful and deliberate with how to handle missing data

In [16]:
# labels to use as the index (note 'coffee' appears twice: index labels need not be unique)
items = ['coffee', 'coffee', 'tea', 'coconut', 'sugar']
# replace the default 0..4 index with these labels; this modifies the existing
# `sales` Series in place, and the cells below rely on the new labels
sales.index = items
sales

coffee       0.0
coffee       5.0
tea        155.0
coconut      NaN
sugar      518.0
dtype: float64

##### Understanding why the data is missing can help decide what steps to take

In [17]:
# do you remove the missing entries? (the 'coconut' row is dropped from the result)
sales.dropna()

coffee      0.0
coffee      5.0
tea       155.0
sugar     518.0
dtype: float64

In [18]:
# do you fill them with zero? (the missing 'coconut' value is then treated as a real 0)
sales.fillna(0)

coffee       0.0
coffee       5.0
tea        155.0
coconut      0.0
sugar      518.0
dtype: float64

In [19]:
# do you impute them with the mean? -> Common in Machine Learning preprocessing because it
# keeps the overall mean unchanged; note that it does reduce the spread of the data
# (variance / standard deviation), so not every summary statistic is preserved
sales.fillna(sales.mean()) # .mean() skips NaN: (0 + 5 + 155 + 518) / 4 = 169.5, which replaces the coconut NaN

coffee       0.0
coffee       5.0
tea        155.0
coconut    169.5
sugar      518.0
dtype: float64