In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100) # Show all cols 

In [3]:
df = pd.read_csv('nhanes_2015_2016.csv')

In [4]:
df.head()

Unnamed: 0,SEQN,ALQ101,ALQ110,ALQ130,SMQ020,RIAGENDR,RIDAGEYR,RIDRETH1,DMDCITZN,DMDEDUC2,DMDMARTL,DMDHHSIZ,WTINT2YR,SDMVPSU,SDMVSTRA,INDFMPIR,BPXSY1,BPXDI1,BPXSY2,BPXDI2,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST,HIQ210
0,83732,1.0,,1.0,1,1,62,3,1.0,5.0,1.0,2,134671.37,1,125,4.39,128.0,70.0,124.0,64.0,94.8,184.5,27.8,43.3,43.6,35.9,101.1,2.0
1,83733,1.0,,6.0,1,1,53,3,2.0,3.0,3.0,1,24328.56,1,125,1.32,146.0,88.0,140.0,88.0,90.4,171.4,30.8,38.0,40.0,33.2,107.9,
2,83734,1.0,,,1,1,78,3,1.0,3.0,1.0,2,12400.01,1,131,1.51,138.0,46.0,132.0,44.0,83.4,170.1,28.8,35.6,37.0,31.0,116.5,2.0
3,83735,2.0,1.0,1.0,2,2,56,3,1.0,5.0,6.0,1,102718.0,1,131,5.0,132.0,72.0,134.0,68.0,109.8,160.9,42.4,38.5,37.7,38.3,110.1,2.0
4,83736,2.0,1.0,1.0,2,2,42,4,1.0,4.0,3.0,5,17627.67,2,126,1.23,100.0,70.0,114.0,54.0,55.2,164.9,20.3,37.4,36.0,27.2,80.4,2.0


In [5]:
col_names = df.columns
col_names

Index(['SEQN', 'ALQ101', 'ALQ110', 'ALQ130', 'SMQ020', 'RIAGENDR', 'RIDAGEYR',
       'RIDRETH1', 'DMDCITZN', 'DMDEDUC2', 'DMDMARTL', 'DMDHHSIZ', 'WTINT2YR',
       'SDMVPSU', 'SDMVSTRA', 'INDFMPIR', 'BPXSY1', 'BPXDI1', 'BPXSY2',
       'BPXDI2', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXLEG', 'BMXARML', 'BMXARMC',
       'BMXWAIST', 'HIQ210'],
      dtype='object')

In [8]:
# One way to get the column names we want to keep is simply by copying from the above output and storing in a list
keep = ['BMXWT', 'BMXHT', 'BMXBMI', 'BMXLEG', 'BMXARML', 'BMXARMC', 'BMXWAIST']
keep

['BMXWT', 'BMXHT', 'BMXBMI', 'BMXLEG', 'BMXARML', 'BMXARMC', 'BMXWAIST']

In [10]:
# Another way to get only column names that include 'BMX' is with list comprehension
keep = [column for column in df.columns if 'BMX' in column]
keep

['BMXWT', 'BMXHT', 'BMXBMI', 'BMXLEG', 'BMXARML', 'BMXARMC', 'BMXWAIST']

In [11]:
df_bmx = df[keep]

In [12]:
df_bmx.head()

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
0,94.8,184.5,27.8,43.3,43.6,35.9,101.1
1,90.4,171.4,30.8,38.0,40.0,33.2,107.9
2,83.4,170.1,28.8,35.6,37.0,31.0,116.5
3,109.8,160.9,42.4,38.5,37.7,38.3,110.1
4,55.2,164.9,20.3,37.4,36.0,27.2,80.4


In [13]:
df.loc[:, keep].head()

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
0,94.8,184.5,27.8,43.3,43.6,35.9,101.1
1,90.4,171.4,30.8,38.0,40.0,33.2,107.9
2,83.4,170.1,28.8,35.6,37.0,31.0,116.5
3,109.8,160.9,42.4,38.5,37.7,38.3,110.1
4,55.2,164.9,20.3,37.4,36.0,27.2,80.4


In [14]:
# Build a boolean mask with one entry per column of df: True where the
# column name appears in `keep`, False otherwise.
index_bool = np.isin(df.columns, keep)
index_bool

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False])

In [15]:
# Reverse check: one entry per name in `keep`, True where that name is an
# existing column of df (all True means every requested column exists).
custom = np.isin(keep, df.columns)
custom

array([ True,  True,  True,  True,  True,  True,  True])

In [16]:
# Indexing columns with the boolean array `index_bool`
# (.iloc accepts a boolean array as a selector, one entry per column)
df.iloc[:, index_bool].head()

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
0,94.8,184.5,27.8,43.3,43.6,35.9,101.1
1,90.4,171.4,30.8,38.0,40.0,33.2,107.9
2,83.4,170.1,28.8,35.6,37.0,31.0,116.5
3,109.8,160.9,42.4,38.5,37.7,38.3,110.1
4,55.2,164.9,20.3,37.4,36.0,27.2,80.4


# Selection by conditions

In [18]:
# Let's only look at rows whose 'BMXWAIST' is larger than the median.
# Calling pd.Series.median on the class and passing the column in gives the
# same result as calling .median() on the column itself.
waist_median = pd.Series.median(df_bmx['BMXWAIST'])
waist_median

98.3

In [19]:
# Same median, computed the more idiomatic way: call .median() on the column itself
df_bmx['BMXWAIST'].median()

98.3

In [20]:
df_bmx[df_bmx['BMXWAIST'] > waist_median].head()

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
0,94.8,184.5,27.8,43.3,43.6,35.9,101.1
1,90.4,171.4,30.8,38.0,40.0,33.2,107.9
2,83.4,170.1,28.8,35.6,37.0,31.0,116.5
3,109.8,160.9,42.4,38.5,37.7,38.3,110.1
9,108.3,179.4,33.6,46.0,44.1,38.5,116.0


In [21]:
# Let's add another condition: 'BMXLEG' must be less than 32
condition1 = df_bmx['BMXWAIST'] > waist_median
condition2 = df_bmx['BMXLEG'] < 32
df_bmx[condition1 & condition2].head()
# Note: can't use 'and' instead of '&' -- '&' combines the two boolean Series
# element by element, whereas 'and' needs a single True/False for each operand.

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
15,80.5,150.8,35.4,31.6,32.7,33.7,113.5
27,75.6,145.2,35.9,31.0,33.1,36.0,108.0
39,63.7,147.9,29.1,26.0,34.0,31.5,110.0
52,105.9,157.7,42.6,29.2,35.0,40.7,129.1
55,77.5,148.3,35.2,30.5,34.0,34.4,107.6


In [23]:
df_bmx.loc[condition1 & condition2, :].head() # Using df.loc[] method
# note that the conditions are describing the rows to keep

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
15,80.5,150.8,35.4,31.6,32.7,33.7,113.5
27,75.6,145.2,35.9,31.0,33.1,36.0,108.0
39,63.7,147.9,29.1,26.0,34.0,31.5,110.0
52,105.9,157.7,42.6,29.2,35.0,40.7,129.1
55,77.5,148.3,35.2,30.5,34.0,34.4,107.6


Let's make a small dataframe and give it a new index so we can more clearly see the differences between .loc (label-based) and .iloc (position-based)

In [28]:
# Keep only the first five rows that satisfy both conditions. The explicit
# .copy() makes `tmp` an independent DataFrame, so the in-place edits made to
# it in later cells act on its own data rather than on a slice of another frame.
tmp = df_bmx.loc[condition1 & condition2, :].head().copy()
tmp

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
15,80.5,150.8,35.4,31.6,32.7,33.7,113.5
27,75.6,145.2,35.9,31.0,33.1,36.0,108.0
39,63.7,147.9,29.1,26.0,34.0,31.5,110.0
52,105.9,157.7,42.6,29.2,35.0,40.7,129.1
55,77.5,148.3,35.2,30.5,34.0,34.4,107.6


In [30]:
tmp.index = ['a', 'b', 'c', 'd', 'e']
tmp

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
a,80.5,150.8,35.4,31.6,32.7,33.7,113.5
b,75.6,145.2,35.9,31.0,33.1,36.0,108.0
c,63.7,147.9,29.1,26.0,34.0,31.5,110.0
d,105.9,157.7,42.6,29.2,35.0,40.7,129.1
e,77.5,148.3,35.2,30.5,34.0,34.4,107.6


In [33]:
# .loc selects by label: rows labelled 'a' and 'b', column labelled 'BMXLEG'
tmp.loc[['a','b'], 'BMXLEG']

a    31.6
b    31.0
Name: BMXLEG, dtype: float64

In [34]:
# .iloc selects by integer position: rows 0 and 1, column 3
# (these positions correspond to labels 'a', 'b' and 'BMXLEG')
tmp.iloc[[0,1], 3]

a    31.6
b    31.0
Name: BMXLEG, dtype: float64

## Common errors and how to read them

In [35]:
tmp[:, 'BMXBMI']

InvalidIndexError: (slice(None, None, None), 'BMXBMI')

### Problem
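The above gives: InvalidIndexError: (slice(None, None, None), 'BMXBMI') (older pandas versions report this as TypeError: unhashable type: 'slice')

The [ ] method treats whatever is inside the brackets as a single key and uses hashes to identify the columns to keep; each column label has an associated hash. Here the key is the pair (slice, 'BMXBMI'), and a 'slice' (a subset of rows and columns) does not have an associated hash, so it cannot be looked up as a column label and the error is raised. To select rows and columns at the same time, use .loc or .iloc instead.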

In [36]:
tmp.loc[:, 'BMXBMI']

a    35.4
b    35.9
c    29.1
d    42.6
e    35.2
Name: BMXBMI, dtype: float64

In [37]:
tmp.loc[:, 'BMXBMI'].values

array([35.4, 35.9, 29.1, 42.6, 35.2])

In [38]:
tmp.iloc[:, 'BMXBMI']

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

### Problem
The above gives: ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

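'BMXBMI' is a string (a column label), not an integer position between 0 and the number of columns - 1 or a list of boolean values, so it is the wrong value type for .iloc. Use .loc to select by label, or pass the column's integer position to .iloc.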

In [39]:
tmp.iloc[:, 2]

a    35.4
b    35.9
c    29.1
d    42.6
e    35.2
Name: BMXBMI, dtype: float64

In [40]:
tmp.loc[:, 2]

KeyError: 2

### Problem
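The above code gives: ```KeyError: 2``` (older pandas versions instead give ```TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [2] of <class 'int'>```)

.loc looks columns up by label, and 2 is not one of the labels (i.e. column names) in the dataframe. To select the column at position 2, use .iloc instead.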

In [41]:
# Here is another example of using a boolean list for indexing columns
tmp.loc[:, [False, False, True] +[False]*4]

Unnamed: 0,BMXBMI
a,35.4
b,35.9
c,29.1
d,42.6
e,35.2


In [43]:
[False]*4

[False, False, False, False]

In [44]:
[False, False, True] +[False]*4

[False, False, True, False, False, False, False]

In [45]:
tmp.iloc[:, 2]

a    35.4
b    35.9
c    29.1
d    42.6
e    35.2
Name: BMXBMI, dtype: float64

In [46]:
# We can use the .loc and .iloc methods to change values within the dataframe
# With .iloc the slice 0:3 covers positions 0, 1 and 2 (the end point is excluded)
tmp.iloc[0:3,2] = [0]*3
tmp.iloc[:,2]

a     0.0
b     0.0
c     0.0
d    42.6
e    35.2
Name: BMXBMI, dtype: float64

In [47]:
[0]*3

[0, 0, 0]

In [48]:
# With .loc the label slice 'a':'c' covers rows 'a', 'b' and 'c' (the end label is included)
tmp.loc['a':'c','BMXBMI'] = [1]*3
tmp.loc[:, 'BMXBMI']

a     1.0
b     1.0
c     1.0
d    42.6
e    35.2
Name: BMXBMI, dtype: float64

In [50]:
# We can use the [] method when changing all the values of a column
tmp['BMXBMI'] = range(0, 5)
tmp

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
a,80.5,150.8,0,31.6,32.7,33.7,113.5
b,75.6,145.2,1,31.0,33.1,36.0,108.0
c,63.7,147.9,2,26.0,34.0,31.5,110.0
d,105.9,157.7,3,29.2,35.0,40.7,129.1
e,77.5,148.3,4,30.5,34.0,34.4,107.6


We will get a warning when using the [] method twice in a row (chained indexing) with conditions to set new values in our dataframe, and the dataframe will not actually be changed

In [53]:
# Chained indexing: tmp[tmp.BMXBMI > 2] is evaluated first and gives back a copy,
# and the ['BMXBMI'] assignment is then applied to that copy.
tmp[tmp.BMXBMI > 2]['BMXBMI'] = [10]*2
tmp
# You can see that the above code did not change our dataframe 'tmp'. This is because the
# new values were written to a temporary copy (see the warning) rather than to 'tmp' itself.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp[tmp.BMXBMI > 2]['BMXBMI'] = [10]*2


Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
a,80.5,150.8,0,31.6,32.7,33.7,113.5
b,75.6,145.2,1,31.0,33.1,36.0,108.0
c,63.7,147.9,2,26.0,34.0,31.5,110.0
d,105.9,157.7,3,29.2,35.0,40.7,129.1
e,77.5,148.3,4,30.5,34.0,34.4,107.6


In [54]:
# The correct way to do the above is with .loc or .iloc: the row condition and
# the column label go into a single indexing call, so the assignment is applied
# to 'tmp' itself. Assigning the scalar 10 broadcasts it to every selected row,
# so the cell does not depend on exactly two rows matching the condition.
tmp.loc[tmp.BMXBMI > 2, 'BMXBMI'] = 10
tmp # Now contains the changes

Unnamed: 0,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST
a,80.5,150.8,0,31.6,32.7,33.7,113.5
b,75.6,145.2,1,31.0,33.1,36.0,108.0
c,63.7,147.9,2,26.0,34.0,31.5,110.0
d,105.9,157.7,10,29.2,35.0,40.7,129.1
e,77.5,148.3,10,30.5,34.0,34.4,107.6
