In [1]:
# 7.1 Handling Missing Data

In [5]:
# for data with float64 dtype, pandas uses the floating-point value NaN (Not a Number) to represent misssing data
# this is called  asentienl value; when present, it indicates a missing (or null) value
import pandas as pd
import numpy as np

float_data = pd.Series([1.2,-3.5,np.nan,0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [6]:
# .isna() gives a Boolean Series of result; null = TRUE
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
# as alternative to 'NA', a built-in 'None' (case-sensitive) value is also used
string_data = pd.Series(["aardvark",np.nan,None,"avocado"])
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [9]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [11]:
float_data = pd.Series([1,2,None],dtype='float64')
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [12]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [14]:
# NA handling object methods
# .dropna() : filter axis labels based on whether values for eahc label have missing data
#          with varying thresholds for how much missing data to tolerate
# .fillna() : fill in missing data with some value or using an interpolation method such as "ffill" or "bfill"
# .isna() : return Boolean values indicating which values are missing/NA
# .notna() : negation of isna, returns True for non-NA values and False for NA values

In [15]:
# Filtering Out Missing Data

In [17]:
# using .dropna() in Series
data = pd.Series([1,np.nan,3.5,np.nan,7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [18]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [19]:
# another way to filter out missing data in Series
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [20]:
# removing missing data in DataFrames - either drop rows or columns that are all NA, or only dropping rows or columns containing any NA
# .dropna() by default drops any row containing a missing value
data = pd.DataFrame([[1,6.5,3.],[1.,np.nan,np.nan],
                     [np.nan,np.nan,np.nan],[np.nan,6.5,3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
data.dropna()
# drops all the rows containing any NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [22]:
# passing how = "all" will drop only rows that are all NA
data.dropna(how="all")

# these functions don't modify the original object, just creates a new object

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [24]:
# to drop columns containing NA, pass the argument axis = "columns"
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [27]:
data.dropna(axis="columns",how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [29]:
# only keeping rows contianing at most a certain number of missing observations
# indicate this with the thresh argument
df = pd.DataFrame(np.random.standard_normal((7,3)))
df

Unnamed: 0,0,1,2
0,-0.183961,0.584176,-0.009744
1,0.424663,0.639363,-1.631365
2,1.246075,-0.65688,0.14167
3,-1.564117,-0.934767,-0.958873
4,-0.900347,-0.880182,1.765004
5,-0.358274,-0.94902,0.039001
6,0.311669,0.920131,-0.176968


In [30]:
df.iloc[:4,1] = np.nan
df.iloc[:2,2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.183961,,
1,0.424663,,
2,1.246075,,0.14167
3,-1.564117,,-0.958873
4,-0.900347,-0.880182,1.765004
5,-0.358274,-0.94902,0.039001
6,0.311669,0.920131,-0.176968


In [31]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.900347,-0.880182,1.765004
5,-0.358274,-0.94902,0.039001
6,0.311669,0.920131,-0.176968


In [32]:
# using argument thresh
df.dropna(thresh=2)
# thresh = 2 means non-NaN threshold is 2 (at least 2 non-NaN values), if threshold not met, drop
# thresh = N means the row needs N "votes" to stay in the DataFrame

Unnamed: 0,0,1,2
2,1.246075,,0.14167
3,-1.564117,,-0.958873
4,-0.900347,-0.880182,1.765004
5,-0.358274,-0.94902,0.039001
6,0.311669,0.920131,-0.176968


In [33]:
# Filling In Missing Data

In [34]:
# .fillna() : filling NaN with a constant
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.183961,0.0,0.0
1,0.424663,0.0,0.0
2,1.246075,0.0,0.14167
3,-1.564117,0.0,-0.958873
4,-0.900347,-0.880182,1.765004
5,-0.358274,-0.94902,0.039001
6,0.311669,0.920131,-0.176968


In [35]:
# using fillna() with a dictioinary, can use a different fill value for each column
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,-0.183961,0.5,0.0
1,0.424663,0.5,0.0
2,1.246075,0.5,0.14167
3,-1.564117,0.5,-0.958873
4,-0.900347,-0.880182,1.765004
5,-0.358274,-0.94902,0.039001
6,0.311669,0.920131,-0.176968


In [36]:
# the same interpolation methods available for reindexing can be used with fillna()
df = pd.DataFrame(np.random.standard_normal((6,3)))
df

Unnamed: 0,0,1,2
0,-1.701416,-0.423199,1.567683
1,-0.145776,0.434876,-0.572423
2,-0.300954,0.419033,-0.021937
3,0.094289,0.647244,1.811238
4,-0.420911,0.92243,0.013012
5,-0.957278,2.170226,-1.507136


In [37]:
df.iloc[2:,1] = np.nan
df

Unnamed: 0,0,1,2
0,-1.701416,-0.423199,1.567683
1,-0.145776,0.434876,-0.572423
2,-0.300954,,-0.021937
3,0.094289,,1.811238
4,-0.420911,,0.013012
5,-0.957278,,-1.507136


In [38]:
df.iloc[4:,2] = np.nan
df

Unnamed: 0,0,1,2
0,-1.701416,-0.423199,1.567683
1,-0.145776,0.434876,-0.572423
2,-0.300954,,-0.021937
3,0.094289,,1.811238
4,-0.420911,,
5,-0.957278,,


In [39]:
df.fillna(method="ffill")
# "ffill" means forward fill; as in Excel "Duplicate Down"
# the NaN values are filled with the preceding non-NaN values, however, ffill stops when it hits a new non-NaN value
# opposite of "ffill" is "bfill" (back fill)

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,-1.701416,-0.423199,1.567683
1,-0.145776,0.434876,-0.572423
2,-0.300954,0.434876,-0.021937
3,0.094289,0.434876,1.811238
4,-0.420911,0.434876,1.811238
5,-0.957278,0.434876,1.811238


In [40]:
df.fillna(method="ffill",limit=2)

# "limit = 2" means limit the fill to 2 rows

  df.fillna(method="ffill",limit=2)


Unnamed: 0,0,1,2
0,-1.701416,-0.423199,1.567683
1,-0.145776,0.434876,-0.572423
2,-0.300954,0.434876,-0.021937
3,0.094289,0.434876,1.811238
4,-0.420911,,1.811238
5,-0.957278,,1.811238


In [41]:
# with .fillna() simple data imputation using median or mean statistics is available
data = pd.Series([1,np.nan,3.5,np.nan,7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [42]:
data.fillna(data.mean())
# fill the NaN values with .mean()

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [43]:
# .fillna() function arguments
# value : scalar value or dictionary-like object to use to fill missing values
# method : interpolation method - one of "bfill" or "ffill"; default is None
# axis : axis to fill on ("index" or "columns"); default is axis="index"
# limit : for ffill and bfill, maximum number of consecutive periods to fill

In [44]:
# Data Transformation

In [45]:
# Removing Duplicates

In [50]:
# removing duplicate rows
data = pd.DataFrame({"k1":["one","two"]*3 + ["two"],"k2":[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [51]:
# DataFrame .duplicated() method returns a Boolean Series indicating whether each row is a duplicate oe nor
# duplicate means its column values are exactly equal to those in an earlier row
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [53]:
# relatedly, drop_duplicates() returns a DataFrame with rows where the duplicated array is False filtered out
data.drop_duplicates()

# both methods, .duplicated() and .drop_duplicates() by default consider all of the columns;
# alternatively, can specify any subset of them to detect duplicates

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [55]:
# having an additional column of values, and want to filter duplicates based only on the "k1" column
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [56]:
# set column with subset argument
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [58]:
# .duplicated() and .drop_duplicates() by default keep the first observed value combination
# passing keep="last" argument will return the last one
data.drop_duplicates(["k1","k2"],keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [64]:
# Transformming Dta Using a Function or Mapping
# transformation based on the values in an array, Series, or column in a DataFrame
data = pd.DataFrame({"food":["bacon","pulled pork","bacon","pastrami","corned beef","bacon","pastrami","honey ham","nova lox"],
                     "ounces":[4,3,12,6,7.5,8,3,5,6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [65]:
# suppose we wanted to add a column indicating the type of animal for each food
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon"
}

meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [66]:
# .map() method on a Series accepts a function or dictionary-like object containing a mapping
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [67]:
# could also have passed a function that does all the work
def get_animal(x):
    return meat_to_animal[x]

data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [68]:
# using .map() is a convenient way to perform element-wise transformatioins and other data cleaning-related operations