# Learning objectives
By the end of this lecture, you will know:
- How to identify missing data in a DataFrame
- How to inspect, filter, drop, or fill missing data
- How to inspect and fix data types in a DataFrame
- How to save your cleaned DataFrame to a CSV file

  

# Functions vs. Methods

In [1]:
# A function is called by its name, with the object passed in between the parentheses:
mylist = [2, 3, 4]
len(mylist)

3

In [3]:
# A method is a function that belongs to an object; it is called with a dot after the object:
import pandas as pd
mydf = pd.DataFrame(data={'A': mylist})
print(mydf)
# mean() is a method of the DataFrame: it returns the mean of each column
mydf.mean()

   A
0  2
1  3
2  4


Unnamed: 0,0
A,3.0


# Checking for missing data

In [5]:
# In pandas and NumPy, missing data are represented as NaN ("not a number").
# NumPy provides this special floating-point value as np.nan.
import numpy as np
np.nan

nan

In [9]:
# Small example dataset in which some values are deliberately left missing
missing = np.nan
patients = pd.DataFrame(
    {
        'id': [1, 2, 3, 4],
        'age': [34, missing, 45, 29],
        'smoker': ['yes', missing, missing, 'no'],
        'bmi': [25.3, 27.8, missing, 24.5],
    }
)
print(patients)

   id   age smoker   bmi
0   1  34.0    yes  25.3
1   2   NaN    NaN  27.8
2   3  45.0    NaN   NaN
3   4  29.0     no  24.5


In [7]:
# isna() returns a table of the same shape with True wherever a value is missing
print(patients.isna())

      id    age  smoker    bmi
0  False  False   False  False
1  False   True    True  False
2  False  False    True   True
3  False  False   False  False


In [10]:
# True counts as 1 and False as 0, so summing the True/False table
# gives the number of missing values in each column
print(patients.isna().sum())

id        0
age       1
smoker    2
bmi       1
dtype: int64


## Applying this to the Framingham dataset

In [11]:
# Make Google Drive available to the script.
# The google.colab package is only available when the notebook runs in Google Colab.
from google.colab import drive
drive.mount('/content/drive')  # Drive is mounted under /content/drive

Mounted at /content/drive


In [12]:
# Read the Framingham Heart Study CSV file from the mounted Drive into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
filename = 'drive/MyDrive/Colab Notebooks/Intro to Python for Epidemiologists/Data/frmgham2.csv'
frame = pd.read_csv(filepath_or_buffer=filename)
# Preview the first five rows
frame.head(n=5)

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,CVD,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP
0,2448,1,195.0,39,106.0,70.0,0,0.0,26.97,0,...,1,0,8766,6438,6438,6438,8766,6438,8766,8766
1,2448,1,209.0,52,121.0,66.0,0,0.0,,0,...,1,0,8766,6438,6438,6438,8766,6438,8766,8766
2,6238,2,250.0,46,121.0,81.0,0,0.0,28.73,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
3,6238,2,260.0,52,105.0,69.5,0,0.0,29.43,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.5,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766


In [13]:
# Check for missing values: True marks a missing entry.
# For a dataset this large the printed table is truncated ("..."),
# so it is hard to find the missing values by eye.
print(frame.isna())

       RANDID    SEX  TOTCHOL    AGE  SYSBP  DIABP  CURSMOKE  CIGPDAY    BMI  \
0       False  False    False  False  False  False     False    False  False   
1       False  False    False  False  False  False     False    False   True   
2       False  False    False  False  False  False     False    False  False   
3       False  False    False  False  False  False     False    False  False   
4       False  False    False  False  False  False     False    False  False   
...       ...    ...      ...    ...    ...    ...       ...      ...    ...   
11622   False  False    False  False  False  False     False    False  False   
11623   False  False    False  False  False  False     False    False  False   
11624   False  False    False  False  False  False     False    False  False   
11625   False  False    False  False  False  False     False    False  False   
11626   False  False     True  False  False  False     False    False  False   

       DIABETES  ...    CVD  HYPERTEN  

In [14]:
# Summarize the total number of missing values per column
# (sum() adds up the True values column by column)
print(frame.isna().sum())

RANDID         0
SEX            0
TOTCHOL      409
AGE            0
SYSBP          0
DIABP          0
CURSMOKE       0
CIGPDAY       79
BMI           52
DIABETES       0
BPMEDS       593
HEARTRTE       6
GLUCOSE     1440
educ         295
PREVCHD        0
PREVAP         0
PREVMI         0
PREVSTRK       0
PREVHYP        0
TIME           0
PERIOD         0
HDLC        8600
LDLC        8601
DEATH          0
ANGINA         0
HOSPMI         0
MI_FCHD        0
ANYCHD         0
STROKE         0
CVD            0
HYPERTEN       0
TIMEAP         0
TIMEMI         0
TIMEMIFC       0
TIMECHD        0
TIMESTRK       0
TIMECVD        0
TIMEDTH        0
TIMEHYP        0
dtype: int64


In [15]:
# HDLC and LDLC have by far the most missing values (see the counts above);
# display them next to TOTCHOL. Double brackets select several columns at once.
frame[['HDLC','LDLC','TOTCHOL']]

Unnamed: 0,HDLC,LDLC,TOTCHOL
0,,,195.0
1,31.0,178.0,209.0
2,,,250.0
3,,,260.0
4,54.0,141.0,237.0
...,...,...,...
11622,,,173.0
11623,30.0,123.0,153.0
11624,,,196.0
11625,,,240.0


In [16]:
# Count the missing values in each row (axis='columns' sums across the columns of a row).
# Rows with a count above 0 have at least one missing data point.
frame['missing_per_row'] = frame.isna().sum(axis='columns')

In [17]:
# The new missing_per_row column is added as the last column
frame.head()

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,missing_per_row
0,2448,1,195.0,39,106.0,70.0,0,0.0,26.97,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,2
1,2448,1,209.0,52,121.0,66.0,0,0.0,,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,1
2,6238,2,250.0,46,121.0,81.0,0,0.0,28.73,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
3,6238,2,260.0,52,105.0,69.5,0,0.0,29.43,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.5,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,0


In [20]:
# RANDID repeats across rows, so add up the missing values over all rows that share
# the same RANDID, then sort from fewest to most missing values
(
    frame
    .groupby('RANDID')[['missing_per_row']]
    .sum()
    .sort_values(by='missing_per_row')
)

Unnamed: 0_level_0,missing_per_row
RANDID,Unnamed: 1_level_1
2838795,2
9433633,2
9431604,2
2650858,2
8330956,2
...,...
2449215,12
4162270,12
6603907,12
6750830,12


# Dealing with missings: dropping rows/columns

In [21]:
# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame; `frame` itself is not changed.
frame.dropna(axis=0, how='any')

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,missing_per_row
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.50,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,0
14,11263,2,220.0,55,180.0,106.0,0,0.0,31.17,1,...,1,8766,8766,5719,5719,8766,5719,8766,0,0
19,12806,2,320.0,57,110.0,46.0,1,30.0,22.02,0,...,1,8766,8766,8766,8766,8766,8766,8766,8679,0
22,14367,1,280.0,64,168.0,100.0,0,0.0,25.72,0,...,1,8766,8766,8766,8766,8766,8766,8766,0,0
25,16365,1,211.0,55,173.0,123.0,0,0.0,29.11,0,...,1,8766,8766,8766,8766,8766,8766,8766,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11603,9978986,2,243.0,68,110.0,66.0,0,0.0,24.30,0,...,1,8766,8766,8766,8766,8766,8766,8766,0,0
11606,9982118,1,219.0,70,163.5,75.0,0,0.0,25.26,0,...,1,8457,8457,8457,8457,8346,8346,8457,0,0
11615,9990894,2,228.0,60,205.0,100.0,1,20.0,22.37,0,...,1,6433,6433,6433,6433,6433,6433,6433,2219,0
11618,9993179,2,251.0,56,145.0,92.0,1,35.0,21.97,0,...,1,6729,6729,6729,6729,6729,6729,6729,4396,0


In [22]:
# Drop a row only when every value in it is missing.
# Nothing is removed here: columns such as RANDID have no missing values,
# so no row is entirely empty.
frame.dropna(axis=0, how='all')

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,missing_per_row
0,2448,1,195.0,39,106.0,70.0,0,0.0,26.97,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,2
1,2448,1,209.0,52,121.0,66.0,0,0.0,,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,1
2,6238,2,250.0,46,121.0,81.0,0,0.0,28.73,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
3,6238,2,260.0,52,105.0,69.5,0,0.0,29.43,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.50,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11622,9998212,1,173.0,46,126.0,82.0,0,0.0,19.17,0,...,1,8766,8766,8766,8766,8766,8766,8766,0,3
11623,9998212,1,153.0,52,143.0,89.0,0,0.0,25.74,0,...,1,8766,8766,8766,8766,8766,8766,8766,0,0
11624,9999312,2,196.0,39,133.0,86.0,1,30.0,20.91,0,...,1,8766,8766,8766,8766,8766,8766,8766,4201,2
11625,9999312,2,240.0,46,138.0,79.0,1,20.0,26.39,0,...,1,8766,8766,8766,8766,8766,8766,8766,4201,2


In [23]:
# Drop every column that contains at least one missing value
# (axis='columns' makes dropna work on columns instead of rows)
frame.dropna(axis='columns', how='any')

Unnamed: 0,RANDID,SEX,AGE,SYSBP,DIABP,CURSMOKE,DIABETES,PREVCHD,PREVAP,PREVMI,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,missing_per_row
0,2448,1,39,106.0,70.0,0,0,0,0,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,2
1,2448,1,52,121.0,66.0,0,0,0,0,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,1
2,6238,2,46,121.0,81.0,0,0,0,0,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
3,6238,2,52,105.0,69.5,0,0,0,0,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
4,6238,2,58,108.0,66.0,0,0,0,0,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11622,9998212,1,46,126.0,82.0,0,0,0,0,0,...,1,8766,8766,8766,8766,8766,8766,8766,0,3
11623,9998212,1,52,143.0,89.0,0,0,0,0,0,...,1,8766,8766,8766,8766,8766,8766,8766,0,0
11624,9999312,2,39,133.0,86.0,1,0,0,0,0,...,1,8766,8766,8766,8766,8766,8766,8766,4201,2
11625,9999312,2,46,138.0,79.0,1,0,0,0,0,...,1,8766,8766,8766,8766,8766,8766,8766,4201,2


# Dealing with missings: filling missing data

In [24]:
# Replace every missing TOTCHOL value with one fixed number.
# NOTE: 99 is an arbitrary placeholder. After this step it cannot be told apart from a
# real measurement, and it will be included in calculations such as the mean.
frame['TOTCHOL'] = frame['TOTCHOL'].fillna(value=99)

In [25]:
# Look at the first 15 rows after filling TOTCHOL
frame.head(15)

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,missing_per_row
0,2448,1,195.0,39,106.0,70.0,0,0.0,26.97,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,2
1,2448,1,209.0,52,121.0,66.0,0,0.0,,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,1
2,6238,2,250.0,46,121.0,81.0,0,0.0,28.73,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
3,6238,2,260.0,52,105.0,69.5,0,0.0,29.43,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.5,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,0
5,9428,1,245.0,48,127.5,80.0,1,20.0,25.34,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
6,9428,1,283.0,54,141.0,89.0,1,30.0,25.34,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
7,10552,2,225.0,61,150.0,95.0,1,30.0,28.58,0,...,1,2956,2956,2956,2956,2089,2089,2956,0,2
8,10552,2,232.0,67,183.0,109.0,1,20.0,30.18,0,...,1,2956,2956,2956,2956,2089,2089,2956,0,2
9,11252,2,285.0,46,130.0,84.0,1,23.0,23.1,0,...,1,8766,8766,8766,8766,8766,8766,8766,4285,2


In [26]:
# Replace missing BMI values with the mean BMI.
# mean() skips missing values by default, so it averages the observed BMIs only.
bmi_mean = frame['BMI'].mean()
frame['BMI'] = frame['BMI'].fillna(bmi_mean)

In [27]:
# Row 1 had a missing BMI before; it should now show the mean BMI instead
frame.head()

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,missing_per_row
0,2448,1,195.0,39,106.0,70.0,0,0.0,26.97,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,2
1,2448,1,209.0,52,121.0,66.0,0,0.0,25.877349,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,1
2,6238,2,250.0,46,121.0,81.0,0,0.0,28.73,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
3,6238,2,260.0,52,105.0,69.5,0,0.0,29.43,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,2
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.5,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,0


# Detecting duplicates

In [30]:
# duplicated() marks a row as True when it is an exact copy of an earlier row
# (all columns equal); sum() counts how many such rows there are
frame.duplicated().sum()

np.int64(0)

In [31]:
# If you have any duplicates: remove them, keeping the first occurrence of each row.
# (This changes nothing here, because no duplicated rows were found.)
frame = frame.drop_duplicates(keep='first')

# Variable types revisited

In [32]:
# dtypes lists the data type that pandas uses to store each column
frame.dtypes

Unnamed: 0,0
RANDID,int64
SEX,int64
TOTCHOL,float64
AGE,int64
SYSBP,float64
DIABP,float64
CURSMOKE,int64
CIGPDAY,float64
BMI,float64
DIABETES,int64


In [33]:
# Convert AGE from integer to floating point.
# astype() returns a new column, so assign it back to keep the result.
frame['AGE'] = frame['AGE'].astype('float64')

In [34]:
# Check the conversion: AGE should now be listed as float64
frame.dtypes

Unnamed: 0,0
RANDID,int64
SEX,int64
TOTCHOL,float64
AGE,float64
SYSBP,float64
DIABP,float64
CURSMOKE,int64
CIGPDAY,float64
BMI,float64
DIABETES,int64


# Storing your cleaned dataframe

In [35]:
# Save the cleaned DataFrame to a new CSV file, under a different name than the
# file that was loaded, so the original data are not overwritten
filename = 'drive/MyDrive/Colab Notebooks/Intro to Python for Epidemiologists/Data/frmgham2_cleaned.csv'
# index=False: do not write the row index to the file. Without it, the index is
# saved as an extra unnamed first column that shows up as 'Unnamed: 0' on re-loading.
frame.to_csv(filename, index=False)