# Using Python to read data files and explore their contents

We will be working with the NHANES (National Health and Nutrition Examination Survey) data from the 2015-2016 wave, which has been discussed earlier in this course. The raw data for this study are available from the CDC NHANES website: https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=2015

In [3]:
import numpy as np
import pandas as pd

In [5]:
# NOTE: despite its name, `url` holds a relative path to a local CSV file,
# so the file has to be reachable from the notebook's working directory.
url = "nhanes_2015_2016.csv"
# Parse the CSV into a pandas DataFrame that the rest of the notebook uses.
data = pd.read_csv(url)

In [6]:
# Dimensions of the data frame as a (number of rows, number of columns) tuple.
data.shape

(5735, 28)

In [7]:
# Column labels of the data frame, in positional order (position 0 first).
data.columns

Index(['SEQN', 'ALQ101', 'ALQ110', 'ALQ130', 'SMQ020', 'RIAGENDR', 'RIDAGEYR',
       'RIDRETH1', 'DMDCITZN', 'DMDEDUC2', 'DMDMARTL', 'DMDHHSIZ', 'WTINT2YR',
       'SDMVPSU', 'SDMVSTRA', 'INDFMPIR', 'BPXSY1', 'BPXDI1', 'BPXSY2',
       'BPXDI2', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXLEG', 'BMXARML', 'BMXARMC',
       'BMXWAIST', 'HIQ210'],
      dtype='object')

In [8]:
# Data type that pandas inferred for each column when the file was read.
data.dtypes

SEQN          int64
ALQ101      float64
ALQ110      float64
ALQ130      float64
SMQ020        int64
RIAGENDR      int64
RIDAGEYR      int64
RIDRETH1      int64
DMDCITZN    float64
DMDEDUC2    float64
DMDMARTL    float64
DMDHHSIZ      int64
WTINT2YR    float64
SDMVPSU       int64
SDMVSTRA      int64
INDFMPIR    float64
BPXSY1      float64
BPXDI1      float64
BPXSY2      float64
BPXDI2      float64
BMXWT       float64
BMXHT       float64
BMXBMI      float64
BMXLEG      float64
BMXARML     float64
BMXARMC     float64
BMXWAIST    float64
HIQ210      float64
dtype: object

In [9]:
# Four equivalent ways of extracting the DMDEDUC2 column as a Series.
# Bracket indexing by column label:
w = data['DMDEDUC2']
# Label-based .loc indexing: all rows, one named column:
x = data.loc[:,'DMDEDUC2']
# Attribute access (only works when the label is a valid Python identifier):
y = data.DMDEDUC2
# Position-based .iloc indexing: DMDEDUC2 sits at 0-based position 9 in data.columns:
z = data.iloc[:, 9]  

In [10]:
# Show the column obtained with bracket indexing.
print(w)

0       5.0
1       3.0
2       3.0
3       5.0
4       4.0
       ... 
5730    3.0
5731    5.0
5732    4.0
5733    1.0
5734    5.0
Name: DMDEDUC2, Length: 5735, dtype: float64


In [11]:
# Show the column obtained with label-based .loc indexing.
print(x)

0       5.0
1       3.0
2       3.0
3       5.0
4       4.0
       ... 
5730    3.0
5731    5.0
5732    4.0
5733    1.0
5734    5.0
Name: DMDEDUC2, Length: 5735, dtype: float64


In [12]:
# Show the column obtained with attribute access.
print(y)

0       5.0
1       3.0
2       3.0
3       5.0
4       4.0
       ... 
5730    3.0
5731    5.0
5732    4.0
5733    1.0
5734    5.0
Name: DMDEDUC2, Length: 5735, dtype: float64


In [13]:
# Show the column obtained with position-based .iloc indexing.
print(z)

0       5.0
1       3.0
2       3.0
3       5.0
4       4.0
       ... 
5730    3.0
5731    5.0
5732    4.0
5733    1.0
5734    5.0
Name: DMDEDUC2, Length: 5735, dtype: float64


In [14]:
# Each selection syntax can be chained directly with a Series method;
# all four lines print the maximum of the same DMDEDUC2 column.
print(data["DMDEDUC2"].max())          # bracket indexing
print(data.loc[:, "DMDEDUC2"].max())   # label-based .loc
print(data.DMDEDUC2.max())             # attribute access
print(data.iloc[:, 9].max())           # position-based .iloc (0-based position 9)

9.0
9.0
9.0
9.0


In [16]:
# A single column and a single row pulled out of a DataFrame are both Series objects.
print(type(data)) # The type of the whole data frame (DataFrame)
print(type(data.DMDEDUC2)) # The type of one column of the data frame
print(type(data.iloc[2,:])) # The type of one row (integer position 2) of the data frame

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [19]:
# Extract the row at integer position 3 (the fourth row) as a Series.
# Leaving out the column indexer selects every column of that row.
x = data.iloc[3]
print(x)

SEQN         83735.0
ALQ101           2.0
ALQ110           1.0
ALQ130           1.0
SMQ020           2.0
RIAGENDR         2.0
RIDAGEYR        56.0
RIDRETH1         3.0
DMDCITZN         1.0
DMDEDUC2         5.0
DMDMARTL         6.0
DMDHHSIZ         1.0
WTINT2YR    102718.0
SDMVPSU          1.0
SDMVSTRA       131.0
INDFMPIR         5.0
BPXSY1         132.0
BPXDI1          72.0
BPXSY2         134.0
BPXDI2          68.0
BMXWT          109.8
BMXHT          160.9
BMXBMI          42.4
BMXLEG          38.5
BMXARML         37.7
BMXARMC         38.3
BMXWAIST       110.1
HIQ210           2.0
Name: 3, dtype: float64


In [21]:
# Row slice: integer positions 3 and 4 (the end of an .iloc slice is exclusive),
# keeping every column.
x = data.iloc[3:5]
# Column slice: every row, columns at integer positions 2, 3 and 4.
y = data.iloc[:, 2:5]

# Print the two frames one after the other, each on its own line block.
print(x, y, sep="\n")

    SEQN  ALQ101  ALQ110  ALQ130  SMQ020  RIAGENDR  RIDAGEYR  RIDRETH1  \
3  83735     2.0     1.0     1.0       2         2        56         3   
4  83736     2.0     1.0     1.0       2         2        42         4   

   DMDCITZN  DMDEDUC2  ...  BPXSY2  BPXDI2  BMXWT  BMXHT  BMXBMI  BMXLEG  \
3       1.0       5.0  ...   134.0    68.0  109.8  160.9    42.4    38.5   
4       1.0       4.0  ...   114.0    54.0   55.2  164.9    20.3    37.4   

   BMXARML  BMXARMC  BMXWAIST  HIQ210  
3     37.7     38.3     110.1     2.0  
4     36.0     27.2      80.4     2.0  

[2 rows x 28 columns]
      ALQ110  ALQ130  SMQ020
0        NaN     1.0       1
1        NaN     6.0       1
2        NaN     NaN       1
3        1.0     1.0       2
4        1.0     1.0       2
...      ...     ...     ...
5730     2.0     NaN       1
5731     2.0     NaN       2
5732     NaN     1.0       1
5733     NaN     NaN       1
5734     NaN     2.0       2

[5735 rows x 3 columns]


## Missing values

In [23]:
# Per-column count of missing entries: isna() flags each missing cell as True,
# and summing the booleans column-wise counts them.
print(data.isna().sum())

SEQN           0
ALQ101       527
ALQ110      4004
ALQ130      2356
SMQ020         0
RIAGENDR       0
RIDAGEYR       0
RIDRETH1       0
DMDCITZN       1
DMDEDUC2     261
DMDMARTL     261
DMDHHSIZ       0
WTINT2YR       0
SDMVPSU        0
SDMVSTRA       0
INDFMPIR     601
BPXSY1       334
BPXDI1       334
BPXSY2       200
BPXDI2       200
BMXWT         69
BMXHT         62
BMXBMI        73
BMXLEG       390
BMXARML      308
BMXARMC      308
BMXWAIST     367
HIQ210      1003
dtype: int64


In [26]:
# Per-column count of non-missing entries: notna() flags each present cell as True,
# and summing the booleans column-wise counts them.
print(data.notna().sum())

SEQN        5735
ALQ101      5208
ALQ110      1731
ALQ130      3379
SMQ020      5735
RIAGENDR    5735
RIDAGEYR    5735
RIDRETH1    5735
DMDCITZN    5734
DMDEDUC2    5474
DMDMARTL    5474
DMDHHSIZ    5735
WTINT2YR    5735
SDMVPSU     5735
SDMVSTRA    5735
INDFMPIR    5134
BPXSY1      5401
BPXDI1      5401
BPXSY2      5535
BPXDI2      5535
BMXWT       5666
BMXHT       5673
BMXBMI      5662
BMXLEG      5345
BMXARML     5427
BMXARMC     5427
BMXWAIST    5368
HIQ210      4732
dtype: int64


In [25]:
# Number of missing values in the DMDEDUC2 column alone, using the
# top-level pandas function instead of the DataFrame method.
print(pd.isna(data["DMDEDUC2"]).sum())

261


In [27]:
# Number of non-missing values in the DMDEDUC2 column alone, using the
# top-level pandas function instead of the DataFrame method.
print(pd.notna(data["DMDEDUC2"]).sum())

5474
