In [1]:
import numpy as np
import pandas as pd

In [2]:
pd.NA
    # pd.NA is the scalar pandas exposes to mark a missing value; it displays as <NA>

<NA>

In [3]:
np.nan == np.nan
    # NaN never compares equal to anything, not even to itself, so == cannot be used to detect it

False

In [4]:
pd.isna(np.nan)
    # Detect NaN with pd.isna() (or np.isnan()), not with `is`
    # `is` compares object identity: np.nan is np.nan is True only because both sides are the same object
    # A NaN created any other way, e.g. float('nan'), fails an `is np.nan` check even though it is missing

True

In [6]:
variable_that_does_not_exist_because_i_have_not_created_it_yet = np.nan
pd.isna(variable_that_does_not_exist_because_i_have_not_created_it_yet)
    # np.nan works as a placeholder for a value that is not known yet
    # pd.isna() is used for the check because an `is np.nan` identity test only
    # recognises this exact np.nan object, not NaN values that come from elsewhere

True

In [13]:
# Load the movie scores table; the relative path is resolved against the working directory
df = pd.read_csv('movie_scores.csv')

# Show the whole frame: row 1 is entirely missing and row 2 is missing both scores
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [12]:
df.isnull()
    # .isnull() returns a frame of the same shape holding True where a value is missing (NaN) and False elsewhere

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,False,False,False,False,False,False
1,True,True,True,True,True,True
2,False,False,False,False,True,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [14]:
df.notnull()
    # .notnull() is the inverse of .isnull(): True where a value is present, False where it is missing

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,True,True,True,True,True,True
1,False,False,False,False,False,False
2,True,True,True,True,False,False
3,True,True,True,True,True,True
4,True,True,True,True,True,True


In [15]:
df['pre_movie_score'].isnull()
    # The same check works on a single column and returns one boolean per row

0    False
1     True
2     True
3    False
4    False
Name: pre_movie_score, dtype: bool

In [16]:
df.loc[df['pre_movie_score'].notnull()]
    # A boolean mask keeps only the rows where the chosen column holds a value

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [19]:
df.loc[
    df['pre_movie_score'].isnull()
    & df['first_name'].notnull()
]
    # Masks can be combined with & (element-wise AND): here, rows that have a name but no pre-movie score

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
2,Hugh,Jackman,51.0,m,,


In [20]:
## To keep the missing data, make no adjustments and leave the frame as it is

df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [21]:
## To drop missing data use .dropna(); with no arguments it removes every row that has at least one missing value

df.dropna()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [22]:
    ## thresh is the minimum number of non-null values a row needs in order to be kept
df.dropna(thresh=1)
    ## With thresh=1 pandas keeps any row holding at least 1 value, so only the completely empty row is dropped

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [23]:
    ## axis controls whether .dropna() removes rows or columns
        ## 0 / 'index'   = rows (the default)
        ## 1 / 'columns' = columns
df.dropna(axis='columns')
    ## Every column is dropped here because every column is missing at least 1 value

0
1
2
3
4


In [24]:
df.dropna(axis='columns', thresh=1)
    ## With thresh=1 a column is kept as long as it holds at least 1 value, so no column is dropped here

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [25]:
    ## subset limits the null check to the listed columns
df.dropna(
    subset=['pre_movie_score', 'post_movie_score'],
)
    ## A row is dropped only when one of the listed columns is missing; nulls in other columns are not considered

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [26]:
## To fill in missing data, use .fillna()
df.fillna(value='NEW VALUE!')
    ## A fill value should be reasoned out from the data, not picked for convenience
    ## Filling values just because you can is irresponsible
    ## Any filled value is an assumption and must be treated as one
    ## Be ready to defend the decision to fill

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63,m,8,10
1,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!
2,Hugh,Jackman,51,m,NEW VALUE!,NEW VALUE!
3,Oprah,Winfrey,66,f,6,8
4,Emma,Stone,31,f,7,9


In [27]:
    ## Filling every null at once is unwise because it risks changing data types
    ## Selecting one column at a time lets each set of nulls get its own, considered fill value
df['pre_movie_score'].fillna(value=0)

0    8.0
1    0.0
2    0.0
3    6.0
4    7.0
Name: pre_movie_score, dtype: float64

In [28]:
    ## .fillna() returns a new Series, so assign it back to the column for the change to stick
df['pre_movie_score'] = df['pre_movie_score'].fillna(value=0)

df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,0.0,
2,Hugh,Jackman,51.0,m,0.0,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [29]:
    ## Instead of an arbitrary number, a value derived from the data (here the column mean) can be used
df['post_movie_score'] = df['post_movie_score'].fillna(
    value=df['post_movie_score'].mean()
)

df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,0.0,9.0
2,Hugh,Jackman,51.0,m,0.0,9.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0
