# Missing Data in Pandas


<br>
In this notebook, I'm going to show how to detect and handle missing data in pandas.
<br>


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Series of names whose second entry is missing (np.nan).
s = pd.Series(
    [
        "Sam",
        np.nan,  # the missing value
        "Tim",
        "Kim",
    ]
)
s

0    Sam
1    NaN
2    Tim
3    Kim
dtype: object

In [3]:
s.isnull() #returns boolean as True where data is missing (NaN), rest False

0    False
1     True
2    False
3    False
dtype: bool

In [5]:
s.notnull() #opposite of isnull(): returns boolean as True where data is present, False where it is missing

0     True
1    False
2     True
3     True
dtype: bool

In [6]:
s[3]=None #None is treated as missing data as well, just like np.nan
s.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [7]:
s.dropna() #drops the null entries; the remaining rows keep their original index labels

0    Sam
2    Tim
dtype: object

In [8]:
from numpy import nan as NA

In [9]:
# 3x3 frame with an increasing amount of missing data per row.
df = pd.DataFrame(
    [
        [1, 2, 3],     # complete row
        [4, NA, 5],    # one missing value
        [NA, NA, NA],  # entirely missing
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [10]:
df.dropna() #with the default how="any", drops every row that contains at least one NaN

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [11]:
df.dropna(how="all") #drops only the rows in which every value is NaN

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0


In [12]:
df #the original df is unchanged; the dropna calls returned new DataFrames

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [13]:
df[1]=NA #overwrite column 1 with NaN so that one column is entirely missing
df

Unnamed: 0,0,1,2
0,1.0,,3.0
1,4.0,,5.0
2,,,


In [14]:
df.dropna(axis=1,how="all") #axis=1 works on columns: drop only the columns in which every value is NaN

Unnamed: 0,0,2
0,1.0,3.0
1,4.0,5.0
2,,


In [22]:
df #df still has column 1; dropna(axis=1,how="all") did not modify it

Unnamed: 0,0,1,2
0,1.0,,3.0
1,4.0,,5.0
2,,,


Parameters of `DataFrame.dropna`:

how : {‘any’, ‘all’}, default ‘any’
Determine if row or column is removed from DataFrame, when we have at least one NA or all NA.

‘any’ : If any NA values are present, drop that row or column.

‘all’ : If all values are NA, drop that row or column.

thresh : int, optional
Require that many non-NA values. Cannot be combined with how.

subset : column label or sequence of labels, optional
Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include.

inplace : bool, default False
Whether to modify the DataFrame rather than creating a new one.

ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, …, n - 1.

In [24]:
df.dropna(thresh=3) #Keep only the rows with at least 3 non-NA values (the result is not assigned, so it is discarded).
df #df itself is unchanged, because dropna does not modify the DataFrame unless inplace=True

Unnamed: 0,0,1,2
0,1.0,,3.0
1,4.0,,5.0
2,,,


In [27]:
df.fillna(0) #fill all nan values with 0 in the returned DataFrame (df itself is not modified)

Unnamed: 0,0,1,2
0,1.0,0.0,3.0
1,4.0,0.0,5.0
2,0.0,0.0,0.0


In [28]:
df.fillna({0:15,1:25,2:35}) #fill nan values per column: each dict key is a column label and its value is the fill value for that column

Unnamed: 0,0,1,2
0,1.0,25.0,3.0
1,4.0,25.0,5.0
2,15.0,25.0,35.0


In [29]:
df #df still contains NaN: the fillna calls above returned new DataFrames

Unnamed: 0,0,1,2
0,1.0,,3.0
1,4.0,,5.0
2,,,


In [31]:
df.fillna(0,inplace=True) #inplace=True modifies df itself instead of returning a new DataFrame, so the change is permanent
df

Unnamed: 0,0,1,2
0,1.0,0.0,3.0
1,4.0,0.0,5.0
2,0.0,0.0,0.0


In [32]:
# Build the 3x3 example frame with missing values again.
df = pd.DataFrame(
    [
        [1, 2, 3],     # complete row
        [4, NA, 5],    # one missing value
        [NA, NA, NA],  # entirely missing
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [33]:
df.ffill() #forward fill: each NaN takes the last valid value above it in the same column (replaces the deprecated fillna(method="ffill"))

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,2.0,5.0


In [34]:
df.ffill(limit=1) #forward fill, but fill at most 1 consecutive NaN per column (replaces the deprecated fillna(method="ffill",limit=1))

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,,5.0


In [35]:
# Numeric Series with a single missing value at position 2.
data = pd.Series(
    [
        1,
        0,
        NA,  # the missing value
        5,
    ]
)
data

0    1.0
1    0.0
2    NaN
3    5.0
dtype: float64

In [36]:
data.fillna(data.mean()) #replace NaN with the mean of the non-missing values: (1+0+5)/3 = 2.0

0    1.0
1    0.0
2    2.0
3    5.0
dtype: float64

In [37]:
df #df is unchanged by the forward-fill calls above

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [38]:
df.fillna(df.mean()) #fill each column's NaN with that column's mean (2.5, 2.0 and 4.0 here)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,2.5,2.0,4.0
