# Learning goals
- Filter a dataset to access a specific subgroup of data
- Use different methods for slicing a Pandas DataFrame
- Apply functions to a subgroup or slice of a DataFrame

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Make Google Drive available to the script.
# drive.mount() attaches the Drive at the mount point given below, so files
# stored on it can be read through paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the Framingham Heart Study dataset from the mounted Google Drive.
# The path is anchored at the '/content/drive' mount point (see drive.mount)
# so that reading the file does not depend on the notebook's current working
# directory.
filename = '/content/drive/MyDrive/Colab Notebooks/Intro to Python for Epidemiologists/Data/frmgham2.csv'
frame = pd.read_csv(filename)
# Preview the first five rows to check that the file was parsed as expected.
frame.head()

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,CVD,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP
0,2448,1,195.0,39,106.0,70.0,0,0.0,26.97,0,...,1,0,8766,6438,6438,6438,8766,6438,8766,8766
1,2448,1,209.0,52,121.0,66.0,0,0.0,,0,...,1,0,8766,6438,6438,6438,8766,6438,8766,8766
2,6238,2,250.0,46,121.0,81.0,0,0.0,28.73,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
3,6238,2,260.0,52,105.0,69.5,0,0.0,29.43,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.5,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766


# Demographics (continued)

In [4]:
# Number of observations per age, listed from youngest to oldest.
# value_counts() puts the ages in the index, so sort_index() orders the table
# by age no matter how value_counts() names its result (the naming differs
# between pandas versions). to_frame() turns the Series into a one-column table.
frame['AGE'].value_counts().sort_index().to_frame()

Unnamed: 0_level_0,count
AGE,Unnamed: 1_level_1
32,1
33,5
34,18
35,42
36,84
37,93
38,145
39,178
40,212
41,213


In [6]:
# Bin AGE into 10-year intervals that are open on the left and closed on the
# right: (25, 35], (35, 45], ..., (75, 85]. The bin edges run from 25 to 85.
frame['age_group'] = pd.cut(x=frame['AGE'], bins=list(range(25, 86, 10)))
# Preview the result: the new age_group column is appended as the last column.
frame.head()

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,age_group
0,2448,1,195.0,39,106.0,70.0,0,0.0,26.97,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,"(35, 45]"
1,2448,1,209.0,52,121.0,66.0,0,0.0,,0,...,0,8766,6438,6438,6438,8766,6438,8766,8766,"(45, 55]"
2,6238,2,250.0,46,121.0,81.0,0,0.0,28.73,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,"(45, 55]"
3,6238,2,260.0,52,105.0,69.5,0,0.0,29.43,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,"(45, 55]"
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.5,0,...,0,8766,8766,8766,8766,8766,8766,8766,8766,"(55, 65]"


In [8]:
# Number of observations per age group, listed from the youngest to the
# oldest interval. The intervals live in the index of the value_counts()
# result, so sort_index() gives this order regardless of how value_counts()
# names its result (the naming differs between pandas versions).
frame['age_group'].value_counts().sort_index().to_frame()

Unnamed: 0_level_0,count
age_group,Unnamed: 1_level_1
"(25, 35]",66
"(35, 45]",2082
"(45, 55]",4155
"(55, 65]",3592
"(65, 75]",1600
"(75, 85]",132


In [9]:
# Number of observations per value of educ (presumably attained education
# level - confirm against the data dictionary), most frequent value first.
# value_counts() leaves out missing values by default, so these counts can add
# up to fewer rows than the dataset has; value_counts(dropna=False) would list
# the missing values as well.
frame['educ'].value_counts()

Unnamed: 0_level_0,count
educ,Unnamed: 1_level_1
1.0,4690
2.0,3410
3.0,1885
4.0,1347


# Outcomes

In [10]:
# Number of observations per value of the 0/1 flag PREVCHD (presumably
# prevalent coronary heart disease - confirm against the data dictionary).
frame['PREVCHD'].value_counts()

Unnamed: 0_level_0,count
PREVCHD,Unnamed: 1_level_1
0,10785
1,842


In [16]:
# Count the observations for every (age_group, PREVCHD) combination and list
# the PREVCHD == 1 rows first. Only PREVCHD is used as a sort key, so the order
# of the age groups within each PREVCHD value is not meaningful.
pd.DataFrame(frame[['age_group','PREVCHD']].value_counts()).sort_values('PREVCHD', ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
age_group,PREVCHD,Unnamed: 2_level_1
"(45, 55]",1,158
"(65, 75]",1,273
"(55, 65]",1,365
"(35, 45]",1,23
"(75, 85]",1,23
"(45, 55]",0,3997
"(55, 65]",0,3227
"(35, 45]",0,2059
"(65, 75]",0,1327
"(25, 35]",0,66


# Longitudinal observations per person

In [19]:
# Number of distinct participants: RANDID identifies a participant, and the
# same RANDID can appear on several rows. nunique() counts the distinct
# (non-missing) identifiers directly.
frame['RANDID'].nunique()

4434

In [20]:
# Observations versus number of participants: one participant can contribute
# several rows, so the row count is larger than the number of unique RANDIDs.
n_observations = len(frame)
n_participants = frame['RANDID'].nunique()
print(
    f'This dataset has {n_observations} rows (observations) '
    f'and {n_participants} unique individuals.'
)

This dataset has 11627 rows (observations) and 4434 unique individuals.


# Slicing and indexing DataFrames

In [21]:
# .loc selects by label: the rows labelled 1 through 5 (the end label is
# included) and the columns named SEX and AGE.
frame.loc[1:5, ['SEX','AGE']]

Unnamed: 0,SEX,AGE
1,1,52
2,2,46
3,2,52
4,2,58
5,1,48


In [22]:
# Extract rows 6 through 10 for columns AGE and TOTCHOL.
# With .loc the end of the slice is a label and is included, so row 10 is
# returned as well.
frame.loc[6:10, ['AGE','TOTCHOL']]

Unnamed: 0,AGE,TOTCHOL
6,54,283.0
7,61,225.0
8,67,232.0
9,46,285.0
10,51,343.0


In [23]:
# .iloc selects by integer position and excludes the end of the slice: the
# rows at positions 6-9 and the columns at positions 1-3 (SEX, TOTCHOL and AGE
# in this dataset). Label-based .loc slices, by contrast, include their end.
frame.iloc[6:10,1:4]

Unnamed: 0,SEX,TOTCHOL,AGE
6,1,283.0,54
7,2,225.0,61
8,2,232.0,67
9,2,285.0,46


### Slicing with booleans

In [24]:
# How can we get ONLY the observations of participants who were 80 years old?
# Build a boolean mask (True where AGE equals 80) and let .loc keep the rows
# where the mask is True, together with all columns.
aged_80 = frame['AGE'] == 80
frame.loc[aged_80, :]

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,age_group
3144,2693641,1,175.0,80,106.0,56.0,1,2.0,18.29,0,...,0,4470,4470,4470,4470,4470,4470,4470,4470,"(75, 85]"
7333,6343336,2,179.0,80,159.0,65.0,0,0.0,20.78,0,...,1,8016,8016,8016,8016,8016,8016,8016,735,"(75, 85]"
7489,6494685,2,213.0,80,120.0,49.0,0,0.0,21.8,0,...,1,1784,5589,5589,1784,5589,5589,5589,0,"(75, 85]"
10070,8723664,2,215.0,80,188.0,88.0,0,0.0,19.01,1,...,1,7359,7359,7359,7359,7359,7359,7359,0,"(75, 85]"
10554,9093148,1,217.0,80,162.0,74.0,0,0.0,31.78,0,...,1,5206,5206,5206,5206,5206,5206,5206,719,"(75, 85]"
10683,9199095,1,264.0,80,140.0,90.0,0,0.0,27.93,0,...,0,6568,6568,6568,6568,6568,6568,6568,6568,"(75, 85]"


In [26]:
# Follow a single participant: keep every observation (row) whose RANDID
# matches the chosen identifier, with all columns.
person_of_interest = 8723664
belongs_to_person = frame['RANDID'].eq(person_of_interest)
person_data = frame.loc[belongs_to_person, :]
# Show (up to) the first five observations of this participant.
person_data.head()

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP,age_group
10068,8723664,2,203.0,69,166.0,90.0,0,0.0,25.4,0,...,1,7359,7359,7359,7359,7359,7359,7359,0,"(65, 75]"
10069,8723664,2,224.0,75,195.0,98.0,0,0.0,22.51,0,...,1,7359,7359,7359,7359,7359,7359,7359,0,"(65, 75]"
10070,8723664,2,215.0,80,188.0,88.0,0,0.0,19.01,1,...,1,7359,7359,7359,7359,7359,7359,7359,0,"(75, 85]"
