# Treating missing values

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

In [2]:
# Sentinel used throughout the notebook for a missing entry.
missing = np.nan

# Approach 1: write the gaps directly into a literal list.
series = Series([2, 4, missing, 6, missing, 8, 10, missing])

# Approach 2: generate 0..10 and blank out every multiple of 3.
l1 = [missing if i % 3 == 0 else i for i in range(11)]
series2 = Series(l1)

In [3]:
# Display the hand-written series; NaN marks the missing slots.
series

0     2.0
1     4.0
2     NaN
3     6.0
4     NaN
5     8.0
6    10.0
7     NaN
dtype: float64

In [4]:
# Display the generated series: every multiple of 3 in 0..10 is missing.
series2

0      NaN
1      1.0
2      2.0
3      NaN
4      4.0
5      5.0
6      NaN
7      7.0
8      8.0
9      NaN
10    10.0
dtype: float64

In [5]:
# Boolean mask: True wherever `series` holds a missing value.
b = series.isnull()

# Show the data and then its mask, each followed by blank lines.
print(series, '\n\n')
print(b, '\n\n')

# Walk values and flags in lockstep, one tab-separated pair per line.
for value, is_missing in zip(series, b):
    print(value, is_missing, sep="\t")

0     2.0
1     4.0
2     NaN
3     6.0
4     NaN
5     8.0
6    10.0
7     NaN
dtype: float64 


0    False
1    False
2     True
3    False
4     True
5    False
6    False
7     True
dtype: bool 


2.0	False
4.0	False
nan	True
6.0	False
nan	True
8.0	False
10.0	False
nan	True


In [6]:
# Find the missing entries of `series` and replace them with the marker 'ok'.
# A string cannot be stored in a float64 Series, so the data is cast to object
# dtype explicitly first; relying on the implicit upcast performed by a plain
# item assignment is deprecated in recent pandas releases.
# `mask` swaps in 'ok' wherever the null mask `b` is True in a single
# vectorised step, which also avoids the label-based `b[i]` lookups that only
# work when the Series carries the default integer index.
series = series.astype(object).mask(b, 'ok')

In [7]:
# Inspect the result: the former NaN slots now hold 'ok' and the dtype is object.
series

0     2.0
1     4.0
2      ok
3     6.0
4      ok
5     8.0
6    10.0
7      ok
dtype: object

In [8]:
# series2 was not modified above; show it again before filling its NaNs.
series2

0      NaN
1      1.0
2      2.0
3      NaN
4      4.0
5      5.0
6      NaN
7      7.0
8      8.0
9      NaN
10    10.0
dtype: float64

In [9]:
# fillna returns a new Series with each NaN replaced by 0; series2 itself keeps its NaNs.
na_0 = series2.fillna(0)
na_0

0      0.0
1      1.0
2      2.0
3      0.0
4      4.0
5      5.0
6      0.0
7      7.0
8      8.0
9      0.0
10    10.0
dtype: float64

In [10]:
# mean() skips NaN by default, so the fill value is the average of the 7 observed entries.
mean = series2.mean()
na_mean = series2.fillna(mean)
na_mean

0      5.285714
1      1.000000
2      2.000000
3      5.285714
4      4.000000
5      5.000000
6      5.285714
7      7.000000
8      8.000000
9      5.285714
10    10.000000
dtype: float64

In [11]:
# median() also ignores NaN; the middle of the observed values is used as the fill value.
median = series2.median()
na_median = series2.fillna(median)
na_median

0      5.0
1      1.0
2      2.0
3      5.0
4      4.0
5      5.0
6      5.0
7      7.0
8      8.0
9      5.0
10    10.0
dtype: float64

In [12]:
# mode() returns a Series because several values can tie for most frequent.
# Every observed value in series2 occurs exactly once, so all of them come back.
mode = series2.mode()
mode

0     1.0
1     2.0
2     4.0
3     5.0
4     7.0
5     8.0
6    10.0
dtype: float64

In [13]:
# To experiment with mode
# Pitfall: mode() returns a Series, and fillna() given a Series aligns on index
# labels. The mode Series here only has label 0, so the NaNs at labels 5 and 7
# find no matching fill value and stay NaN.
series_mode = Series((1,2,1,2,3,missing,2,missing))
mode = series_mode.mode()
na_mode = series_mode.fillna(mode)
na_mode


0    1.0
1    2.0
2    1.0
3    2.0
4    3.0
5    NaN
6    2.0
7    NaN
dtype: float64

In [14]:
# To experiment with mode
# Passing the scalar mode[0] (the most frequent value, 2.0) instead of the whole
# mode Series lets fillna apply it to every NaN.
series_mode = Series((1,2,1,2,3,missing,2,missing))
mode = series_mode.mode()
na_mode = series_mode.fillna(mode[0])
na_mode

0    1.0
1    2.0
2    1.0
3    2.0
4    3.0
5    2.0
6    2.0
7    2.0
dtype: float64

In [15]:
# Seed the legacy global RNG so the sample frame is reproducible.
np.random.seed(25)
# 6x6 frame of random integers drawn from [1, 99), requested directly in its final shape.
df = DataFrame(np.random.randint(1, 99, size=(6, 6)))
df

Unnamed: 0,0,1,2,3,4,5
0,5,63,91,16,62,24
1,45,51,9,29,5,90
2,32,70,2,40,4,89
3,56,4,85,46,4,2
4,23,32,49,48,74,17
5,51,86,37,10,33,11


In [16]:
# Knock out part of the first two columns to simulate missing data.
# NaN is a float, so columns 0 and 1 are cast from int to float64 up front;
# assigning NaN into an integer column relies on an implicit upcast that
# recent pandas releases deprecate.
df = df.astype({0: "float64", 1: "float64"})
# .loc label slices include both endpoints.
df.loc[2:5, 0] = missing  # rows 2, 3, 4 and 5 of column 0
df.loc[1:4, 1] = missing  # rows 1, 2, 3 and 4 of column 1

In [17]:
# Columns 0 and 1 are now float and contain the NaN gaps.
df

Unnamed: 0,0,1,2,3,4,5
0,5.0,63.0,91,16,62,24
1,45.0,,9,29,5,90
2,,,2,40,4,89
3,,,85,46,4,2
4,,,49,48,74,17
5,,86.0,37,10,33,11


In [18]:
# Element-wise mask: True where a cell is NaN.
df.isnull()

Unnamed: 0,0,1,2,3,4,5
0,False,False,False,False,False,False
1,False,True,False,False,False,False
2,True,True,False,False,False,False
3,True,True,False,False,False,False
4,True,True,False,False,False,False
5,True,False,False,False,False,False


In [19]:
# Replace every NaN with 0; the result is a new frame and df is left untouched.
df.fillna(0)

Unnamed: 0,0,1,2,3,4,5
0,5.0,63.0,91,16,62,24
1,45.0,0.0,9,29,5,90
2,0.0,0.0,2,40,4,89
3,0.0,0.0,85,46,4,2
4,0.0,0.0,49,48,74,17
5,0.0,86.0,37,10,33,11


In [20]:
# df still contains its NaNs: the fillna call above returned a new frame.
df

Unnamed: 0,0,1,2,3,4,5
0,5.0,63.0,91,16,62,24
1,45.0,,9,29,5,90
2,,,2,40,4,89
3,,,85,46,4,2
4,,,49,48,74,17
5,,86.0,37,10,33,11


In [21]:
# A dict maps column label -> fill value: NaNs in column 0 become 0, those in column 1 become 1.
filled_df = df.fillna({0:0, 1:1})
filled_df

Unnamed: 0,0,1,2,3,4,5
0,5.0,63.0,91,16,62,24
1,45.0,1.0,9,29,5,90
2,0.0,1.0,2,40,4,89
3,0.0,1.0,85,46,4,2
4,0.0,1.0,49,48,74,17
5,0.0,86.0,37,10,33,11


In [22]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used because the `method=` argument of fillna() is
# deprecated in recent pandas releases; the backward-filling counterpart is
# df.bfill().
df.ffill()

Unnamed: 0,0,1,2,3,4,5
0,5.0,63.0,91,16,62,24
1,45.0,63.0,9,29,5,90
2,45.0,63.0,2,40,4,89
3,45.0,63.0,85,46,4,2
4,45.0,63.0,49,48,74,17
5,45.0,86.0,37,10,33,11


## Counting missing values

In [23]:
df.isnull().sum()
# sum() adds the True flags down each column, so the result is indexed by column label:
# column 0 has 4 nulls
# column 1 has 4 nulls

0    4
1    4
2    0
3    0
4    0
5    0
dtype: int64

In [24]:
df.dropna()  # drops every row that contains at least one NaN

Unnamed: 0,0,1,2,3,4,5
0,5.0,63.0,91,16,62,24


In [25]:
df.dropna(axis = 1)  # drops every column that contains at least one NaN

Unnamed: 0,2,3,4,5
0,91,16,62,24
1,9,29,5,90
2,2,40,4,89
3,85,46,4,2
4,49,48,74,17
5,37,10,33,11


# Previous version: the same walkthrough on string and float data

In [26]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

In [27]:
# Sentinel for a missing entry.
missing = np.nan

# String-valued series with two gaps; the resulting dtype is object.
series_obj = Series(['row1', 'row2', missing, 'row4','row5', 'row6', missing, 'row8'])
series_obj

0    row1
1    row2
2     NaN
3    row4
4    row5
5    row6
6     NaN
7    row8
dtype: object

In [28]:
# True at positions 2 and 6, where the NaN entries sit.
series_obj.isnull()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

In [29]:
# Seed the legacy global RNG so the sample frame is reproducible.
np.random.seed(25)
# 6x6 frame of uniform floats in [0, 1), requested directly in its final shape.
DF_obj = DataFrame(np.random.rand(6, 6))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [30]:
# .loc label slices include both endpoints: rows 3-5 of column 0 and
# rows 1-4 of column 5 are overwritten with NaN.
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [31]:
# Replace every NaN with 0 in a new frame; DF_obj keeps its gaps.
filled_df = DF_obj.fillna(0)
filled_df

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
3,0.0,0.836375,0.481343,0.516502,0.383048,0.0
4,0.0,0.559053,0.03445,0.71993,0.421004,0.0
5,0.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [32]:
# Per-column fill values: NaNs in column 0 become 5, those in column 5 become 10.
filled_df = DF_obj.fillna({0: 5, 5:10})
filled_df

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,10.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,10.0
3,5.0,0.836375,0.481343,0.516502,0.383048,10.0
4,5.0,0.559053,0.03445,0.71993,0.421004,10.0
5,5.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [33]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method=` argument
# is deprecated in recent pandas releases.
filled_df = DF_obj.ffill()
filled_df

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.117376
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


## Counting missing values

In [34]:
# Rebuild the float frame from the same seed and punch the same holes in it,
# so the counting examples start from a known state.
np.random.seed(25)
DF_obj = DataFrame(np.random.rand(36).reshape((6,6)))
# .loc label slices include both endpoints.
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [35]:
# Number of NaNs per column (sum of the True flags down each column).
DF_obj.isnull().sum()

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

In [36]:
# Default dropna() keeps only the rows that contain no NaN at all.
DF_no_nan = DF_obj.dropna()
DF_no_nan

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [37]:
# axis=1 drops the columns that contain any NaN (columns 0 and 5 here).
DF_no_nan = DF_obj.dropna(axis = 1)
DF_no_nan

Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804


# The End