In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Handle Missing Data

`None` is used as a placeholder for missing data or a missing object

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Object-dtype array: None can sit next to the integers as a marker for a missing entry
A = np.array([0, None, 2, 3], dtype=object)
A

array([0, None, 2, 3], dtype=object)

In [4]:
print(A[1])

None


In [5]:
A[1] # the value is missing

In [6]:
# Try to compute the sum of the object array that contains None.
# The addition int + None is not defined, so numpy raises a TypeError;
# catch it and show the message so the notebook still runs top to bottom.
try:
    A.sum()
except TypeError as err:
    print(f'TypeError: {err}')

`nan` is used to represent a missing numerical value <br>
the type of `nan` is float

In [7]:
# Float array: the missing entry is stored as nan, which is itself a float
A = np.array([0, np.nan, 2, 3], dtype=np.float64)
A

array([ 0., nan,  2.,  3.])

In [8]:
A.sum()

nan

In [9]:
1 + np.nan

nan

In [10]:
10 * np.nan

nan

In [11]:
np.nan == np.nan

False

In [12]:
np.isnan(10)

False

In [13]:
np.isnan(np.nan) #check if a number is nan

True

In [14]:
np.isnan(A[1])

True

In [15]:
A

array([ 0., nan,  2.,  3.])

In [16]:
A[~np.isnan(A)] #remove nan from the array,  ~ means 'not'

array([0., 2., 3.])

### Handle none and nan in Series

In [17]:
ser= pd.Series([1, np.nan, 'hello', None])
ser

0        1
1      NaN
2    hello
3     None
dtype: object

In [18]:
ser.isnull() # null refers to nan and None

0    False
1     True
2    False
3     True
dtype: bool

In [19]:
ser.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [20]:
ser.isnull().sum()

2

remove nan and None from the Series

In [21]:
ser[ser.notnull()]

0        1
2    hello
dtype: object

If a Series contains only numerical data, we can convert it to an array, <br> then handle the missing data using NumPy functions

In [22]:
# Build a numeric Series and take its NumPy array in a single step, so `arr`
# is only ever bound to the array. In a float Series, None is stored as nan.
arr = pd.Series([0, np.nan, None, 1, 100]).to_numpy()
arr

array([  0.,  nan,  nan,   1., 100.])

In [23]:
arr.dtype

dtype('float64')

In [24]:
np.isnan(arr)

array([False,  True,  True, False, False])

In [25]:
# np.is_not_nan
~np.isnan(arr) # the tilde symbol/operator performs logical not

array([ True, False, False,  True,  True])

remove nan from the array

In [26]:
arr[~np.isnan(arr)]

array([  0.,   1., 100.])

#### Find and Replace none and nan in Series

In [27]:
# Rebuild the mixed Series, then overwrite every missing entry (nan or None)
# with the string 'nothing' through label-based boolean indexing
ser = pd.Series([1, np.nan, 'hello', None])
ser.loc[ser.isna()] = 'nothing'
ser

0          1
1    nothing
2      hello
3    nothing
dtype: object

In [28]:
# If a Series contains only numerical data, we can convert it to an array
# and then handle the missing data using NumPy functions.
# Take an explicit copy: a plain `.values` view shares memory with `ser`, so
# writing into the array would also change `ser` (or fail outright when pandas
# hands back a read-only view under Copy-on-Write).
ser = pd.Series([0, np.nan, None, 1, 100])
arr = ser.to_numpy(copy=True)
arr

array([  0.,  nan,  nan,   1., 100.])

In [29]:
# replace nan with 0 (or another number): the boolean mask from np.isnan picks
# the nan positions and the assignment overwrites them in place
arr[np.isnan(arr)]= 0
arr

array([  0.,   0.,   0.,   1., 100.])

## Handle nan in DataFrame

In [30]:
# Load the patient records (comma is read_csv's default separator).
# Missing entries (e.g. 'null' in the raw text) show up as nan after loading;
# open the file in a text editor to see the raw values.
df = pd.read_csv('patient_record_missing_data.csv')
df

Unnamed: 0,Age,Gender,Tumor_size_mm
0,30.0,,1.0
1,40.0,F,2.0
2,85.0,F,0.1
3,75.0,M,1.0
4,,F,3.0


### check the dataframe to see if there is any missing values

In [31]:
df.isnull()

Unnamed: 0,Age,Gender,Tumor_size_mm
0,False,True,False
1,False,False,False
2,False,False,False
3,False,False,False
4,True,False,False


In [32]:
df.isnull().sum(axis=0)

Age              1
Gender           1
Tumor_size_mm    0
dtype: int64

In [33]:
df.isnull().sum(axis=1)

0    1
1    0
2    0
3    0
4    1
dtype: int64

Suppose that we are developing a machine learning algorithm that will predict the outcome of brain tumor surgery based on Age and Gender. <br>
<br>
### we need to find the rows that have missing data

In [34]:
#check if a number is nan
x = np.nan
print(x == np.nan)

False


nan is not equal to nan

In [35]:
#check if a number is nan
x = np.nan
np.isnan(x)

True

### write a program to find the rows with missing data (nan and None)

In [36]:
# Count the missing values (nan/None) in every row in one vectorized pass,
# then keep the index labels of the rows that have at least one.
# Labels (not positions) are collected because df.drop removes rows by label.
missing_per_row = df.isnull().sum(axis=1)
bad_row_index_list = missing_per_row[missing_per_row > 0].index.tolist()
bad_row_index_list

[0, 4]

In [44]:
# Spot check: how many entries of the first row (position 0) are missing
n_missings = df.iloc[0].isna().sum()
n_missings

1

### How should we handle the data of patient-0 (row-0) and patient-4 (row-4)?


option-1: we could remove all of the 'bad' rows

In [38]:
df

Unnamed: 0,Age,Gender,Tumor_size_mm
0,30.0,,1.0
1,40.0,F,2.0
2,85.0,F,0.1
3,75.0,M,1.0
4,,F,3.0


In [39]:
# Drop every row whose label is in bad_row_index_list; df itself is left unchanged
df_clean = df.drop(index=bad_row_index_list)
df_clean

Unnamed: 0,Age,Gender,Tumor_size_mm
1,40.0,F,2.0
2,85.0,F,0.1
3,75.0,M,1.0


option-2: we could modify some of the 'bad' rows

In [40]:
# Average age over the patients whose age is known.
# Series.mean skips nan by default, so there is no need to hand-pick the rows,
# and selecting the column by name does not depend on the column order.
age_avg = df['Age'].mean()
age_avg

57.5

In [41]:
# Fill every missing Age with the average age. Selecting by mask and column name
# avoids hard-coding the row/column position, and re-running the cell is harmless.
df.loc[df['Age'].isnull(), 'Age'] = age_avg

In [42]:
df

Unnamed: 0,Age,Gender,Tumor_size_mm
0,30.0,,1.0
1,40.0,F,2.0
2,85.0,F,0.1
3,75.0,M,1.0
4,57.5,F,3.0


In [43]:
# Remove the first row (index label 0); the result is a new frame
df_clean = df.drop(index=0)
df_clean

Unnamed: 0,Age,Gender,Tumor_size_mm
1,40.0,F,2.0
2,85.0,F,0.1
3,75.0,M,1.0
4,57.5,F,3.0


Now, the data is clean and ready for analysis (e.g. machine learning)

### What should we do if every row has a nan and every column has a nan? <br>
We cannot just remove these rows and columns - otherwise there will be no data left