In [1]:
"""
What? Mark and remove missing data.

Handling missing data is important as many machine learning algorithms do not 
support data with missing values. One example is LDA
(Linear Discriminant Analysis).
"""

In [2]:
# Import python modules
from IPython.display import Markdown, display
from numpy import nan
from pandas import read_csv

In [3]:
# Additional functions
def myPrint(string, c = "blue"):
    """Display a message as bold, coloured Markdown in the notebook output.

    Parameters
    ----------
    string : object
        Message to show. Values that are not strings (numbers, tuples, ...)
        are accepted and rendered through ``str.format``.
    c : str, optional
        CSS colour applied to the text. Defaults to ``"blue"``.
    """
    # Markdown bold markers (**...**) wrapped in an HTML span carrying the colour.
    # Using format() for the message avoids the TypeError that string
    # concatenation raises when the message is not a str.
    colorstr = "<span style='color:{}'>**{}**</span>".format(c, string)
    display(Markdown(colorstr))

def printPythonModuleVersion():
    """Print the installed version of the main scientific Python modules.

    A heading is shown through ``myPrint``; then one ``name: version`` line is
    printed per module, in a fixed order. A module that cannot be imported is
    reported as "not installed" instead of aborting the whole listing.
    """
    # Local import keeps this helper self-contained.
    from importlib import import_module

    myPrint("Checking main python modules version")
    module_names = ("scipy", "numpy", "matplotlib", "pandas",
                    "statsmodels", "sklearn", "xgboost")
    for name in module_names:
        try:
            module = import_module(name)
        except ImportError:
            # One missing package should not hide the versions of the others.
            print('%s: not installed' % name)
            continue
        # Fall back to "unknown" for a package that exposes no __version__.
        print('%s: %s' % (name, getattr(module, '__version__', 'unknown')))

# Record the library versions used for this run alongside the results.
printPythonModuleVersion()

<span style='color:blue'>**Checking main python modules version**</span>

scipy: 1.5.4
numpy: 1.19.4
matplotlib: 3.3.2
pandas: 1.1.4
statsmodels: 0.12.1
sklearn: 0.23.2
xgboostn: 1.2.1


In [4]:
myPrint("Load dataset")
# Read the Pima Indians diabetes CSV. The file has no header row, so
# header=None makes pandas label the columns with integers.
# (read_csv is imported in the notebook's import cell.)
dataset = read_csv('../DATASETS/pima-indians-diabetes.csv', header=None)

<span style='color:blue'>**Load dataset**</span>

In [5]:
myPrint("Looking for clues of missing data")

"""
LOOK AT MINIMUM VALUE
We can see that there are columns that have a minimum value of zero (0).
Missing values are frequently indicated by out-of-range entries; perhaps 
a negative number (e.g., -1) in a numeric field that is normally only 
positive, or a 0 in a numeric field that can never normally be 0.
"""

# Per-column summary statistics; the "min" row is the one to inspect for
# zeros that stand in for missing values.
dataset.describe()

<span style='color:blue'>**Looking for clues of missing data**</span>

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Confirm the suspicion from the summary statistics by inspecting the first
# five raw rows for literal zeros.
dataset.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# Tally, for each of columns 1 through 7, how many entries are exactly zero;
# these zeros are the candidate "missing" markers.
num_missing = dataset[list(range(1, 8))].eq(0).sum()
# Show the per-column counts.
num_missing

1      5
2     35
3    227
4    374
5     11
6      0
7      0
dtype: int64

In [8]:
"""
In Pandas, NumPy and Scikit-Learn, missing values are marked as NaN. 
Pandas ignores NaN values in operations like sum, count, etc.
"""

# Swap every 0 in columns 1-5 for NaN so those entries are treated as missing.
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace({0: nan})
# Per-column NaN tally, used to check that the zeros were marked correctly.
dataset.isna().sum()

0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64

In [9]:
# Inspect the first five rows again to check that the zeros in the marked
# columns now appear as NaN.
dataset.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [14]:
# Shape while rows containing NaN are still present.
print("Shape BEFORE dropping the NaN entries:", dataset.shape)
# Drop rows with missing values.
# NOTE: inplace=True mutates `dataset` itself, so re-running this cell would
# report the already-reduced shape as "BEFORE".
dataset.dropna(inplace = True)
# Shape of the data once the rows with missing values are removed.
print("Shape AFTER dropping the NaN entries:", dataset.shape)

Shape BEFORE dropping the NaN entries: (768, 9)
Shape AFTER dropping the NaN entries: (392, 9)
