In [1]:
### This drill covers missing data, which pandas marks as `NaN`.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Sample AAPL values; np.nan marks the entries that are missing.
aapl_values = [
    72.27, np.nan, 74.39, np.nan, 75.94,
    76.93, np.nan, 78.74, np.nan, 79.81,
    np.nan, 79.53, np.nan, 79.56, 79.49,
]

# Single-column frame used by every example in this notebook.
apple_df = pd.DataFrame({"AAPL": aapl_values})

In [4]:
# Display the DataFrame; the missing entries appear as NaN.
apple_df

Unnamed: 0,AAPL
0,72.27
1,
2,74.39
3,
4,75.94
5,76.93
6,
7,78.74
8,
9,79.81


In [5]:
# Count the missing (NaN) values in each column of the DataFrame built above.
apple_df.isnull().sum()

AAPL    6
dtype: int64

In [6]:
### This section covers how to handle missing data using the `.dropna()` method.

In [7]:
# Drop every row that contains a NaN.
# The result is a new DataFrame; `apple_df` itself is left unchanged.
apple_df.dropna()

Unnamed: 0,AAPL
0,72.27
2,74.39
4,75.94
5,76.93
7,78.74
9,79.81
11,79.53
13,79.56
14,79.49


In [8]:
# """ See what happened? The NaN value is gone along with its whole row of data. 
# Now, only two rows of data remain. As you can see, 
# dropna offers a quick and powerful tool for handling missing data.

# Still, we can’t ignore the fact that using dropna just threw away one third of our data. 
# This is fine for an example, 
# but it’s hard to imagine a scenario where eliminating that much data wouldn’t skew some aspect of our analysis. 
# This especially applies in a professional setting, where every data point is thought to have value. 
# So, we have another technique for handling missing data—namely, filling, it. """
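# - Bootcamp Spot
#
# Note: the quote above describes a different example. In this notebook, `dropna()`
# removed 6 of the 15 rows (40% of the data), leaving 9 rows.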

In [9]:
# This section covers filling missing data using the `.fillna()` method.

In [10]:
# The most common replacement values are the string "Unknown", the number 0, and the column mean (from `.mean()`).

In [16]:
# Sum each column using only the rows that remain after dropping NaN rows.
apple_df.dropna().sum()

AAPL    696.66
dtype: float64

In [11]:
# Replace every NaN with the placeholder string "Unknown".
# The column then mixes numbers and strings, so it can no longer be treated as purely numeric.
apple_df.fillna("Unknown") 

Unnamed: 0,AAPL
0,72.27
1,Unknown
2,74.39
3,Unknown
4,75.94
5,76.93
6,Unknown
7,78.74
8,Unknown
9,79.81


In [12]:
# Replace every NaN with 0.
# The result is a new DataFrame; `apple_df` itself is left unchanged.
apple_df.fillna(0)

Unnamed: 0,AAPL
0,72.27
1,0.0
2,74.39
3,0.0
4,75.94
5,76.93
6,0.0
7,78.74
8,0.0
9,79.81


In [13]:
# `apple_df.mean()` computes the mean of each column from its non-missing values;
# `fillna` then replaces every NaN with the mean of its own column.
# This is a quick shortcut for filling in missing data.
apple_df.fillna(apple_df.mean())

Unnamed: 0,AAPL
0,72.27
1,77.406667
2,74.39
3,77.406667
4,75.94
5,76.93
6,77.406667
7,78.74
8,77.406667
9,79.81


In [14]:
# Apply `fillna` to the "AAPL" column only and write the result back into that column.
# Working on a single column gives more control over which values are replaced.
# NOTE: this overwrites the data in `apple_df`, so any cell run after this one
# no longer sees the NaN values.
apple_df.loc[:, "AAPL"] = apple_df.loc[:, "AAPL"].fillna(0)
# Confirm that no missing values remain (the count should be 0).
apple_df.isnull().sum()

AAPL    0
dtype: int64

In [15]:
# "The code uses .loc[:, "AAPL"] to isolate all cells in the AAPL column. 
# It then calls the fillna function with 0 as the argument. 
# The output has NaN replaced with a value of 0." - Bootcamp Spot.