In [2]:
#Tutorial Overview
##1.Diabetes Dataset
##2.Mark missing values
##3.Missing values cause problems
##4.Remove rows with missing values

### Mark Missing Values

In [1]:
## We can use plots and summary statistics to help identify the missing or corrupt data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset. header=None tells pandas not to treat the first row as
# column names, so the columns get integer labels instead.
dataset = pd.read_csv('pima-indians-diabetes.csv',header = None)

# Summarize the dataset: per-column count, mean, std, min, quartiles and max.
# A minimum of 0 in a column is the first hint that 0 may stand in for a missing value.
print(dataset.describe())

                0           1           2           3           4           5  \
count  768.000000  768.000000  768.000000  768.000000  768.000000  768.000000   
mean     3.845052  120.894531   69.105469   20.536458   79.799479   31.992578   
std      3.369578   31.972618   19.355807   15.952218  115.244002    7.884160   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000   
50%      3.000000  117.000000   72.000000   23.000000   30.500000   32.000000   
75%      6.000000  140.250000   80.000000   32.000000  127.250000   36.600000   
max     17.000000  199.000000  122.000000   99.000000  846.000000   67.100000   

                6           7           8  
count  768.000000  768.000000  768.000000  
mean     0.471876   33.240885    0.348958  
std      0.331329   11.760232    0.476951  
min      0.078000   21.000000    0.000000  
25%      0.243750   24.000000    0.000000  
50%   

In [6]:
# Print the first 20 rows so the raw values can be inspected directly
print(dataset.head(20))

     0    1   2   3    4     5      6   7  8
0    6  148  72  35    0  33.6  0.627  50  1
1    1   85  66  29    0  26.6  0.351  31  0
2    8  183  64   0    0  23.3  0.672  32  1
3    1   89  66  23   94  28.1  0.167  21  0
4    0  137  40  35  168  43.1  2.288  33  1
5    5  116  74   0    0  25.6  0.201  30  0
6    3   78  50  32   88  31.0  0.248  26  1
7   10  115   0   0    0  35.3  0.134  29  0
8    2  197  70  45  543  30.5  0.158  53  1
9    8  125  96   0    0   0.0  0.232  54  1
10   4  110  92   0    0  37.6  0.191  30  0
11  10  168  74   0    0  38.0  0.537  34  1
12  10  139  80   0    0  27.1  1.441  57  0
13   1  189  60  23  846  30.1  0.398  59  1
14   5  166  72  19  175  25.8  0.587  51  1
15   7  100   0   0    0  30.0  0.484  32  1
16   0  118  84  47  230  45.8  0.551  31  1
17   7  107  74   0    0  29.6  0.254  31  1
18   1  103  30  38   83  43.3  0.183  33  0
19   1  115  70  30   96  34.6  0.529  32  1


In [7]:
### Count the zero entries in columns 1-5, where a 0 is treated as a missing value
num_missing = dataset[[1, 2, 3, 4, 5]].eq(0).sum()

# Show the per-column counts
print(num_missing)

1      5
2     35
3    227
4    374
5     11
dtype: int64


In [12]:
## Mark the zeros in columns 1-5 as missing by turning them into NaN
dataset[[1, 2, 3, 4, 5]] = dataset[[1, 2, 3, 4, 5]].replace(to_replace=0, value=np.nan)

# Tally the NaN entries in every column

print(dataset.isna().sum())

0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64


In [13]:
# Print the first 20 rows again to see how the marked (NaN) entries are displayed
print(dataset.head(20))

     0      1     2     3      4     5      6   7  8
0    6  148.0  72.0  35.0    NaN  33.6  0.627  50  1
1    1   85.0  66.0  29.0    NaN  26.6  0.351  31  0
2    8  183.0  64.0   NaN    NaN  23.3  0.672  32  1
3    1   89.0  66.0  23.0   94.0  28.1  0.167  21  0
4    0  137.0  40.0  35.0  168.0  43.1  2.288  33  1
5    5  116.0  74.0   NaN    NaN  25.6  0.201  30  0
6    3   78.0  50.0  32.0   88.0  31.0  0.248  26  1
7   10  115.0   NaN   NaN    NaN  35.3  0.134  29  0
8    2  197.0  70.0  45.0  543.0  30.5  0.158  53  1
9    8  125.0  96.0   NaN    NaN   NaN  0.232  54  1
10   4  110.0  92.0   NaN    NaN  37.6  0.191  30  0
11  10  168.0  74.0   NaN    NaN  38.0  0.537  34  1
12  10  139.0  80.0   NaN    NaN  27.1  1.441  57  0
13   1  189.0  60.0  23.0  846.0  30.1  0.398  59  1
14   5  166.0  72.0  19.0  175.0  25.8  0.587  51  1
15   7  100.0   NaN   NaN    NaN  30.0  0.484  32  1
16   0  118.0  84.0  47.0  230.0  45.8  0.551  31  1
17   7  107.0  74.0   NaN    NaN  29.6  0.254 

### Missing Values Cause Problems

In [2]:
### Example where missing values cause errors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Read a fresh copy of the dataset (no header row in the file)
data = pd.read_csv('pima-indians-diabetes.csv', header=None)

# Turn the zeros in columns 1-5 into NaN
data[[1, 2, 3, 4, 5]] = data[[1, 2, 3, 4, 5]].replace(to_replace=0, value=np.nan)

# Split the underlying array: the first eight columns are the inputs,
# the ninth column is the target
values = data.values
X, y = values[:, :8], values[:, 8]

# Model under evaluation
model = LinearDiscriminantAnalysis()

# Evaluation procedure: 3-fold cross-validation, shuffled with a fixed seed

cv = KFold(n_splits=3, shuffle=True, random_state=1)

# Score the model on each fold; X still contains NaN here, which is the
# point of this example

results = cross_val_score(model, X, y, cv=cv, scoring="accuracy")

# Report the mean accuracy across the folds
print('Accuracy: %.3f' % results.mean())

Accuracy: nan


Traceback (most recent call last):
  File "D:\Anaconda_Distribution\envs\newEnv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda_Distribution\envs\newEnv\lib\site-packages\sklearn\discriminant_analysis.py", line 425, in fit
    dtype=[np.float64, np.float32])
  File "D:\Anaconda_Distribution\envs\newEnv\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "D:\Anaconda_Distribution\envs\newEnv\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "D:\Anaconda_Distribution\envs\newEnv\lib\site-packages\sklearn\utils\validation.py", line 802, in check_X_y
    estimator=estimator)
  File "D:\Anaconda_Distribution\envs\newEnv\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "D:\Anaconda_Distribution\envs\newEnv\lib\site-packages\skle

### Remove rows with Missing values

In [3]:
### We can use DataFrame.dropna() to remove every row that contains a missing value
data = pd.read_csv('pima-indians-diabetes.csv',header=None)

# Mark the zeros in columns 1-5 as NaN so dropna() can see them
data[[1,2,3,4,5]] = data[[1,2,3,4,5]].replace(0,np.nan)

# Shape before dropping rows
print(data.shape)

# Rebind the result instead of mutating in place (inplace=True): the cell stays
# a plain "new value from old value" step and dropna() can be chained if needed.
data = data.dropna()

# Shape after dropping rows
print(data.shape)



(768, 9)
(392, 9)


In [7]:
#### Evaluate the model on the data left after removing rows with missing values

# Inputs are every column but the last; the target is the last column
values = data.values
X, y = values[:, :-1], values[:, -1]

model = LinearDiscriminantAnalysis()

# 3-fold cross-validation, shuffled with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=1)

results = cross_val_score(model, X, y, cv=cv, scoring="accuracy")

# Mean accuracy across the folds, expressed as a percentage
mean = results.mean() * 100
print("Accuracy is :%.3f" % mean)

Accuracy is :78.058
