# Detect and correct errors in data #

- Missing values are represented as `NaN` (Not a Number).

How do you detect them?


In [5]:
import pandas as pd

# One dict per store. An item a store does not carry is simply left out of its
# dict, so pandas fills the corresponding cell with NaN when building the frame.
items = [
    dict(bikes=20, cars=10000, pants=200),
    dict(watches=35),
    dict(bikes=150, pants=150),
]

# Row labels 'store 1' .. 'store 3', in the same order as the dicts above.
store_items = pd.DataFrame(data=items, index=[f'store {n}' for n in range(1, 4)])
store_items

Unnamed: 0,bikes,cars,pants,watches
store 1,20.0,10000.0,200.0,
store 2,,,,35.0
store 3,150.0,,150.0,


## count the number of NaN values ##

In [8]:
# Boolean frame of the same shape as store_items: True marks a NaN cell.
# (isna() is the pandas alias of isnull().)
x = store_items.isna()
print(x)

         bikes   cars  pants  watches
store 1  False  False  False     True
store 2   True   True   True    False
store 3  False   True  False     True


In [9]:
# Summing the boolean mask down each column counts True as 1, which gives
# the number of NaN values per column.
x = store_items.isna().sum(axis='index')
print(x)

bikes      1
cars       2
pants      1
watches    2
dtype: int64


In [11]:
# The first .sum() collapses each column to its NaN count; the second .sum()
# adds those per-column counts into one total for the whole frame.
x = (
    store_items
    .isna()
    .sum()
    .sum()
)
print(x)

6


## count the number of non-NaN numbers ##

In [12]:
# count() skips NaN cells, so this is the number of non-NaN values per column.
store_items.count(axis='index')


bikes      2
cars       1
pants      2
watches    1
dtype: int64

## to remove NaN numbers ##

Drop all rows that contain at least one NaN. Every row in this DataFrame has a NaN, so the result is empty.

In [13]:
# Returns a new frame without the rows that hold any NaN; store_items itself
# is not modified because the result is not assigned back.
store_items.dropna(axis='index', how='any')

Unnamed: 0,bikes,cars,pants,watches


Drop all columns that contain at least one NaN. Every column in this DataFrame has a NaN, so only the index remains.

In [14]:
# Returns a new frame without the columns that hold any NaN; store_items
# itself is not modified because the result is not assigned back.
store_items.dropna(axis='columns', how='any')

store 1
store 2
store 3


In [15]:
# Redisplay the frame: the dropna() calls above returned new frames and were
# not assigned back, so store_items still contains its NaN values.
store_items

Unnamed: 0,bikes,cars,pants,watches
store 1,20.0,10000.0,200.0,
store 2,,,,35.0
store 3,150.0,,150.0,


Instead of dropping rows or columns, we can replace the NaN values.

In [16]:
# Replace every NaN with the constant 0; returns a new frame.
store_items.fillna(value=0)

Unnamed: 0,bikes,cars,pants,watches
store 1,20.0,10000.0,200.0,0.0
store 2,0.0,0.0,0.0,35.0
store 3,150.0,0.0,150.0,0.0


Fill each NaN with the previous non-NaN value along the specified axis (a "forward fill").

A NaN is left unchanged if there is no previous value to copy from.

In [17]:
# Forward fill down each column (axis=0): every NaN takes the value from the
# nearest non-NaN row above it; a NaN in the first row stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in pandas and emits a FutureWarning.
store_items.ffill(axis=0)

  store_items.fillna(method='ffill', axis=0)


Unnamed: 0,bikes,cars,pants,watches
store 1,20.0,10000.0,200.0,
store 2,20.0,10000.0,200.0,35.0
store 3,150.0,10000.0,150.0,35.0


In [18]:
# Forward fill across each row (axis=1): every NaN takes the value from the
# nearest non-NaN column to its left; a NaN with nothing to its left stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in pandas and emits a FutureWarning.
store_items.ffill(axis=1)

  store_items.fillna(method='ffill', axis=1)


Unnamed: 0,bikes,cars,pants,watches
store 1,20.0,10000.0,200.0,200.0
store 2,,,,35.0
store 3,150.0,150.0,150.0,150.0


We can also replace them using backward filling, which copies the next non-NaN value along the axis.

In [20]:
# Show the original frame first so it can be compared with the filled result.
print(store_items)

# Backward fill up each column (axis=0): every NaN takes the value from the
# nearest non-NaN row below it; a NaN with nothing below it stays NaN.
# DataFrame.bfill() replaces fillna(method='backfill'), whose `method` keyword
# is deprecated in pandas and emits a FutureWarning.
store_items.bfill(axis=0)

         bikes     cars  pants  watches
store 1   20.0  10000.0  200.0      NaN
store 2    NaN      NaN    NaN     35.0
store 3  150.0      NaN  150.0      NaN


  store_items.fillna(method='backfill', axis=0)


Unnamed: 0,bikes,cars,pants,watches
store 1,20.0,10000.0,200.0,35.0
store 2,150.0,,150.0,35.0
store 3,150.0,,150.0,


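We can also use linear interpolation along an axis to estimate the missing values. A leading NaN (one with no earlier value along the axis) is left unchanged.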

In [22]:
# Show the original frame first so it can be compared with the result.
print(store_items)
# Linearly interpolate down each column ('index' is the named form of axis 0).
store_items.interpolate(axis='index', method='linear')

         bikes     cars  pants  watches
store 1   20.0  10000.0  200.0      NaN
store 2    NaN      NaN    NaN     35.0
store 3  150.0      NaN  150.0      NaN


Unnamed: 0,bikes,cars,pants,watches
store 1,20.0,10000.0,200.0,
store 2,85.0,10000.0,175.0,35.0
store 3,150.0,10000.0,150.0,35.0
