# Learning objectives
- Locate files stored on Google Drive from within a Colab notebook
- Load epidemiological datasets into a Pandas DataFrame
- Inspect the structure and content of real-world data
- Explore variable types and distributions of data
- Practice basic data exploration techniques

In [2]:
import numpy as np
import pandas as pd

# Load the dataset

In [4]:
import os

In [6]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Show the current working directory: relative paths such as 'drive/MyDrive/...'
# are resolved against this location.
os.getcwd()

'/content'

In [15]:
# List the files in the course data folder on the mounted Drive.
# The path is relative, so it is resolved against the current working directory.
os.listdir('drive/MyDrive/Colab Notebooks/Intro to Python for Epidemiologists/Data')

['frmgham2.csv']

In [16]:
# Load the Framingham Heart Study dataset from its CSV file on the mounted Drive
# into a pandas DataFrame. The path is relative to the current working directory.
filename = 'drive/MyDrive/Colab Notebooks/Intro to Python for Epidemiologists/Data/frmgham2.csv'
frame = pd.read_csv(filename)
# Preview the first rows (head() shows 5 by default) to confirm the load worked
frame.head()

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,CVD,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP
0,2448,1,195.0,39,106.0,70.0,0,0.0,26.97,0,...,1,0,8766,6438,6438,6438,8766,6438,8766,8766
1,2448,1,209.0,52,121.0,66.0,0,0.0,,0,...,1,0,8766,6438,6438,6438,8766,6438,8766,8766
2,6238,2,250.0,46,121.0,81.0,0,0.0,28.73,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
3,6238,2,260.0,52,105.0,69.5,0,0.0,29.43,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
4,6238,2,237.0,58,108.0,66.0,0,0.0,28.5,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766


# Inspect the data structure

In [17]:
# Check the dimensions of the dataframe as a (number of rows, number of columns) tuple
frame.shape

(11627, 39)

In [18]:
# List the column names, i.e. the variables available in the dataset
frame.columns

Index(['RANDID', 'SEX', 'TOTCHOL', 'AGE', 'SYSBP', 'DIABP', 'CURSMOKE',
       'CIGPDAY', 'BMI', 'DIABETES', 'BPMEDS', 'HEARTRTE', 'GLUCOSE', 'educ',
       'PREVCHD', 'PREVAP', 'PREVMI', 'PREVSTRK', 'PREVHYP', 'TIME', 'PERIOD',
       'HDLC', 'LDLC', 'DEATH', 'ANGINA', 'HOSPMI', 'MI_FCHD', 'ANYCHD',
       'STROKE', 'CVD', 'HYPERTEN', 'TIMEAP', 'TIMEMI', 'TIMEMIFC', 'TIMECHD',
       'TIMESTRK', 'TIMECVD', 'TIMEDTH', 'TIMEHYP'],
      dtype='object')

In [19]:
# List the data type (dtype) pandas assigned to each column when reading the CSV
frame.dtypes

Unnamed: 0,0
RANDID,int64
SEX,int64
TOTCHOL,float64
AGE,int64
SYSBP,float64
DIABP,float64
CURSMOKE,int64
CIGPDAY,float64
BMI,float64
DIABETES,int64


In [20]:
# Print a concise summary of the dataframe: the index range and, for every
# column, its number of non-null values and its dtype.
# Columns whose non-null count is below the number of entries contain missing values.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11627 entries, 0 to 11626
Data columns (total 39 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   RANDID    11627 non-null  int64  
 1   SEX       11627 non-null  int64  
 2   TOTCHOL   11218 non-null  float64
 3   AGE       11627 non-null  int64  
 4   SYSBP     11627 non-null  float64
 5   DIABP     11627 non-null  float64
 6   CURSMOKE  11627 non-null  int64  
 7   CIGPDAY   11548 non-null  float64
 8   BMI       11575 non-null  float64
 9   DIABETES  11627 non-null  int64  
 10  BPMEDS    11034 non-null  float64
 11  HEARTRTE  11621 non-null  float64
 12  GLUCOSE   10187 non-null  float64
 13  educ      11332 non-null  float64
 14  PREVCHD   11627 non-null  int64  
 15  PREVAP    11627 non-null  int64  
 16  PREVMI    11627 non-null  int64  
 17  PREVSTRK  11627 non-null  int64  
 18  PREVHYP   11627 non-null  int64  
 19  TIME      11627 non-null  int64  
 20  PERIOD    11627 non-null  in

# Explore the content of a dataframe

In [21]:
# Summary statistics for each numeric variable:
# count of non-missing values, mean, standard deviation, min, quartiles and max
frame.describe()

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,CVD,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP
count,11627.0,11627.0,11218.0,11627.0,11627.0,11627.0,11627.0,11548.0,11575.0,11627.0,...,11627.0,11627.0,11627.0,11627.0,11627.0,11627.0,11627.0,11627.0,11627.0,11627.0
mean,5004741.0,1.568074,241.162418,54.79281,136.324116,83.037757,0.432528,8.250346,25.877349,0.045584,...,0.249333,0.74327,7241.556893,7593.846736,7543.036725,7008.153608,7660.880021,7166.082996,7854.10295,3598.956395
std,2900877.0,0.495366,45.36803,9.564299,22.798625,11.660144,0.495448,12.186888,4.10264,0.208589,...,0.432646,0.436848,2477.78001,2136.730285,2192.120311,2641.344513,2011.077091,2541.668477,1788.369623,3464.164659
min,2448.0,1.0,107.0,32.0,83.5,30.0,0.0,0.0,14.43,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0
25%,2474378.0,1.0,210.0,48.0,120.0,75.0,0.0,0.0,23.095,0.0,...,0.0,0.0,6224.0,7212.0,7049.5,5598.5,7295.0,6004.0,7797.5,0.0
50%,5006008.0,2.0,238.0,54.0,132.0,82.0,0.0,0.0,25.48,0.0,...,0.0,1.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,2429.0
75%,7472730.0,2.0,268.0,62.0,149.0,90.0,1.0,20.0,28.07,0.0,...,0.0,1.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,7329.0
max,9999312.0,2.0,696.0,81.0,295.0,150.0,1.0,90.0,56.8,1.0,...,1.0,1.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0


In [22]:
# Count how many rows fall into each level of the categorical variable SEX
# (value_counts() lists the most frequent level first)
frame['SEX'].value_counts()

Unnamed: 0_level_0,count
SEX,Unnamed: 1_level_1
2,6605
1,5022


In [25]:
# Value counts on the levels of AGE, ordered by age rather than by frequency.
# value_counts() puts the AGE values in the index, so sort_index() orders the
# result by age on every pandas version; to_frame() keeps the tabular display.
frame['AGE'].value_counts().sort_index().to_frame()

Unnamed: 0_level_0,count
AGE,Unnamed: 1_level_1
32,1
33,5
34,18
35,42
36,84
37,93
38,145
39,178
40,212
41,213
