In [9]:
import json
import requests

In [18]:
# Establish accounts. Fill out variables here as needed. i.e. project/awardee/pmi_account/service_account
# Once you save this file, you don't have to retype these.

project = "all-of-us-rdr-stable" # type in environment if not stable
awardee = "TEST" # fill in your awardee
pmi_account = "henry.walker@pmi-ops.org" # update your pmi-ops account.
service_account = "awardee-test@all-of-us-ops-data-api-stable.iam.gserviceaccount.com" #update to your service account

In [19]:
# Authentication. This cell creates a key for your service account, you don't need to do anything.
# If you are running this notebook on your local system (the legacy method) you can un-comment the following line to create
# a service account key that is called 'gcloud_key.json'. Otherwise you should have this key in the file structure already.
results = !gcloud -q iam service-accounts keys create --account $pmi_account --project $project --iam-account $service_account gcloud_key.json
!gcloud -q auth activate-service-account --key-file=gcloud_key.json

Activated service account credentials for: [awardee-test@all-of-us-ops-data-api-stable.iam.gserviceaccount.com]


In [20]:
# Get Oauth Token, refresh as needed.
token = !gcloud -q auth print-access-token
token = token[0]
headers = {'content-type': 'application/json', 'Authorization': 'Bearer {0}'.format(token)}
print('Authentication Token Ready!') if token.startswith('ya') else 'Authentication Token Error!'

Authentication Token Ready!


In [21]:
# Make request to get API version. This is the current RDR version for reference
import requests
resp = requests.get('http://all-of-us-rdr-prod.appspot.com/rdr/v1/', headers=headers)
print(resp.json())

{'version_id': '1-148-3'}


In [22]:
import sys

def pull_api_data(headers, url):
    resp = requests.get(url, headers=headers)
 #   print(f"Response2 : {resp} from {url}")
    ps_data = None
    if not resp or resp.status_code != 200:
        print('Error: api request failed.\n\n{0}.'.format(resp.text if resp else 'Unknown error.'))
        print("Did you run  gcloud -q auth print-access-token ? ")
        sys.exit(1)
    else:
        ps_data = resp.json()
    return ps_data

In [23]:
from datetime import datetime
import pandas

# Set the # of participants that will be pulled in each batch.

count = 1000

# Set the URL variable. You can change parameters of the url as needed.
# See https://github.com/all-of-us/raw-data-repository/blob/master/opsdataAPI.md for documentation of this api.

url = 'https://{0}.appspot.com/rdr/v1/ParticipantSummary?_sort=lastModified&organization=SOUTHERN_UAB&_includeTotal=true&_count={1}&awardee={2}'.format(project, count, awardee)

# Call pull_api_data function and store the result in variable "ps_data"

ps_data = pull_api_data(headers, url)

# The variable "link_data" is set to the value stored in the "link" array in the first index of ps_data (which is a json file)

link_data = ps_data["link"][0]



# The variable "total" stores the total number of records that match your query based on the parameters you set in the URL.
# Just to note, the parameter _includeTotal=true will need to be included in the URL
total = ps_data["total"]
num_of_batches = total//count


# The variable "next_url" is set to the value of "url" in the link_data array
next_url = link_data['url']

print("There are {0} records for your query.....retrieving data in {1} batches\n".format(total, num_of_batches))
    # print("LINK")
#print(link_data)
#print(f"Next relation : {link_data['relation']} url : {next_url} ")

batch = 1

data = []
while next_url is not None:
    print(f"Retrieving next set of records ... batch  = {batch}")
    
    good_cols = ['ageRange', 'dateOfBirth', 'participantId', 'race', 'sex', 'email', 'site']


# loop over participant summary records, insert participant data in same order as good_cols.
    for entry in ps_data['entry']:
        item = []
        for col in good_cols:
            for key, val in entry['resource'].items():
                if col == key:
                    if key == 'dateOfBirth':                    
                        item.append(datetime.strptime(val, '%Y-%m-%d'))
                    else:
                        item.append(val)    
        data.append(item)
           # insert_postgres(cur, ps_data)
           # conn.commit()
            # get next set of patient data based on the next url found within the first result set
    frame = pandas.DataFrame(data, columns=good_cols)
    if next_url:
            ps_data = pull_api_data(headers, next_url)
    try:
        link_data = ps_data["link"][0]
        next_url = link_data['url']
    except KeyError:
        break
    batch += 1
resp = requests.get(next_url, headers=headers)
if not resp or resp.status_code != 200:
    print('Error: api request failed.\n\n{0}.'.format(resp.text if resp else 'Unknown error.'))
else:
    ps_data = resp.json()
    print('Retrieving last batch....')
    print('\nSuccess: retrieved {0} records.'.format(count*batch+len(ps_data['entry'])))
for entry in ps_data['entry']:
        item = []
        for col in good_cols:
            for key, val in entry['resource'].items():
                if col == key:
                    if key == 'dateOfBirth':                    
                        item.append(datetime.strptime(val, '%Y-%m-%d'))
                    else:
                        item.append(val)    
        data.append(item)
           # insert_postgres(cur, ps_data)
           # conn.commit()
            # get next set of patient data based on the next url found within the first result set
frame = pandas.DataFrame(data, columns=good_cols)

There are 3204 records for your query.....retrieving data in 3 batches

Retrieving next set of records ... batch  = 1
Retrieving next set of records ... batch  = 2
Retrieving next set of records ... batch  = 3
Retrieving last batch....

Success: retrieved 3204 records.


In [24]:
display(frame)
frame.to_csv(r'C:\Users\walkerhp\Documents\Ops Data API\Output.csv', index=False)

Unnamed: 0,ageRange,dateOfBirth,participantId,race,sex,email,site
0,85-,1902-10-10 00:00:00,P145474538,WHITE,SexAtBirth_Male,VQA3+DEV+vv9lxfc3d48g7cithv@vibrenthealthtest.com,UNSET
1,18-24,2000-09-15 00:00:00,P754497553,UNSET,UNSET,VibQA3+DEV+5psbphwd7ygx5iox@gmail.com,UNSET
2,85-,1933-03-03 00:00:00,P838000487,PMI_Skip,SexAtBirth_Female,HudsonKing@fakeexample.com,hpo-site-generalgenomicstesting
3,18-24,2000-09-15 00:00:00,P212638360,UNSET,UNSET,VibQA3+DEV+sib0gfp95ejat9my@gmail.com,UNSET
4,85-,1933-03-03 00:00:00,P726051846,PMI_Skip,SexAtBirth_Female,AriaMorgan@fakeexample.com,hpo-site-generalgenomicstesting
...,...,...,...,...,...,...,...
3199,85-,1933-01-01 00:00:00,P946449025,PMI_Skip,PMI_Skip,participant9_testvirtualsite@example.com,testvirtualsite
3200,85-,1933-01-01 00:00:00,P789078173,PMI_Skip,PMI_Skip,participant0_testvirtualsite@example.com,testvirtualsite
3201,35-44,1988-02-08 00:00:00,P553118358,MORE_THAN_ONE_RACE,SexAtBirth_Female,ptsc.test.persona+49@gmail.com,hpo-site-a
3202,35-44,1983-09-24 00:00:00,P494083999,PMI_Skip,PMI_Skip,vibrenttester27+293483@gmail.com,hpo-site-a


In [44]:
# Deletes the google cloud key created in first step so that you don't hit the Google enforced limit of 10 keys.
# We don't typically need this because we no longer create new keys each time it's run.
#import os
#os.remove('gcloud_key.json')
#!gcloud -q iam service-accounts keys delete $key_id --account $pmi_account --iam-account $service_account

### NOTES:
* Cells will be added/modified by the dev team while working with data stewards to determine specific needs.
* You are welcome to add cells to view info in different ways if you're comfortable with Python
    * if you ever want to return this notebook to it's original state type `git checkout -- ops_data_api.ipynb` from the ops_data_api directory.