# Creating a core measure data product

To run this notebook you will need a file manifest downloaded from the JDC explorer UI (`file-manifest.json`) and a `credentials.json` file.

## Downloading selected files from JDC

In [1]:
from pathlib import Path
import json
from gen3 import auth,file
import requests

In [17]:
# Path to the file manifest listing the files to download.
# manifest_path = "../../promis-test-file-manifest.json"
manifest_path = "file-manifest.json"

In [18]:
# Download every file listed in the manifest into ./tmp
# (get file-manifest.json from the explorer UI).
gen3auth = auth.Gen3Auth(refresh_file="../../credentials.json")
gen3file = file.Gen3File(gen3auth)
manifest = json.loads(Path(manifest_path).read_text())
no_guids = []  # manifest entries without an object_id (cannot be downloaded)
download_dir = Path("tmp")
download_dir.mkdir(exist_ok=True)
for params in manifest:
    fileguid = params.get("object_id")
    if not fileguid:
        # record skipped entries instead of dropping them silently
        no_guids.append(params)
        continue
    # get_presigned_url returns the parsed JSON response (a dict with a
    # "url" key); it is not callable.
    presigned = gen3file.get_presigned_url(fileguid)
    response = requests.get(presigned["url"])
    # fail loudly rather than saving an HTTP error body as if it were the package
    response.raise_for_status()
    (download_dir / params["file_name"]).write_bytes(response.content)

if no_guids:
    print(f"Skipped {len(no_guids)} manifest entries without an object_id")

[2024-01-22 22:42:07,509][  ERROR] backoff: gave up call gen3.auth._write_to_file(<gen3.auth.Gen3Auth object at 0x0000018ED3F8A380>, C:\Users\kranz-michael/.cache/gen3/token_cache_59a84d7ef796aef44d95e557d46cc075, eyJhbGciOiJSUzI1NiIsImtpZCI6ImZlbmNlX2tleV8yMDIwLTAxLTAyVDA2OjM1OjA4WiIsInR5cCI6IkpXVCJ9.eyJwdXIiOiJhY2Nlc3MiLCJpc3MiOiJodHRwczovL2pjb2luLmRhdGFjb21tb25zLmlvL3VzZXIiLCJhdWQiOlsiaHR0cHM6Ly9qY29pbi5kYXRhY29tbW9ucy5pby91c2VyIiwib3BlbmlkIiwiZ29vZ2xlX2NyZWRlbnRpYWxzIiwidXNlciIsImFkbWluIiwiZmVuY2UiLCJnb29nbGVfbGluayIsImRhdGEiLCJnb29nbGVfc2VydmljZV9hY2NvdW50Il0sImlhdCI6MTcwNTk4NDkxOSwiZXhwIjoxNzA1OTg4NTE5LCJqdGkiOiJhZDIyNDE5NS0zODZhLTRhMWYtOGE4Zi1hOTZiMjZmYjhmYTciLCJzY29wZSI6WyJvcGVuaWQiLCJnb29nbGVfY3JlZGVudGlhbHMiLCJ1c2VyIiwiYWRtaW4iLCJmZW5jZSIsImdvb2dsZV9saW5rIiwiZGF0YSIsImdvb2dsZV9zZXJ2aWNlX2FjY291bnQiXSwiY29udGV4dCI6eyJ1c2VyIjp7Im5hbWUiOiJtYmtyYW56QGdtYWlsLmNvbSIsImlzX2FkbWluIjpmYWxzZSwiZ29vZ2xlIjp7InByb3h5X2dyb3VwIjpudWxsfX19LCJhenAiOiIiLCJzdWIiOiI3OSJ9.uwzqmxKt9vyOz2urUOl8GH5e

## Creating local data product with specified variables

In [3]:
import pandas as pd
from jdc_utils import CoreMeasures
import jdc_utils
from pathlib import Path
import json
import zipfile
from frictionless import Resource,steps
import random

In [4]:
# Re-read the manifest and list the core-measures packages present in tmp/.
manifest_path = "file-manifest.json"
with Path(manifest_path).open() as manifest_file:
    manifest = json.load(manifest_file)
pkgs = [*Path("tmp/").glob("core-measures*.zip")]


# One integer id per manifest entry, shuffled with a fixed seed so the
# ordering is reproducible.
# NOTE: next iteration will have these in individual files
hub_id_list = [*range(len(manifest))]
random.Random(4).shuffle(hub_id_list)

In [5]:
# Display the shuffled ids; entry i is used as the hub_id of the i-th manifest entry.
hub_id_list

[3, 5, 4, 0, 2, 1]

In [6]:
# Read the baseline and timepoints tables out of every downloaded package
# and combine them into a single CoreMeasures object.
baselines = []
timepoints = []
for i, params in enumerate(manifest):
    # NOTE: with frictionless Package with zip got error:
    ## FrictionlessException: [package-error] The data package has an error: 
    ## cannot extract metadata "C:\Users\KRANZ-~1\AppData\Local\Temp\
    # tmp3oia90uh\datapackage.json" because "[Errno 2] No such file or directory: 
    # 'C:\\Users\\KRANZ-~1\\AppData\\Local\\Temp\\tmp3oia90uh\\datapackage.json'"
    pkg = "tmp/" + params["file_name"]
    # Open the archive once and make sure it is closed again; the members
    # are read fully into memory before the archive is released.
    with zipfile.ZipFile(pkg) as archive:
        report = json.loads(archive.read("report.json"))
        baseline_file = archive.read("data/baseline.csv")
        timepoints_file = archive.read("data/timepoints.csv")

    # Surface the validation status stored in the package instead of ignoring it.
    is_valid = report["valid"]
    if not is_valid:
        print(f"NOTE: {pkg} is marked as not valid in its report.json")

    baseline_df = Resource(baseline_file, format="csv").to_petl().todf()
    timepoints_df = Resource(timepoints_file, format="csv").to_petl().todf()
    # the shuffled id is attached to the baseline table only
    baseline_df["hub_id"] = hub_id_list[i]

    baselines.append(baseline_df)
    timepoints.append(timepoints_df)

core_measures = CoreMeasures(transform_steps=[])  # get config params
core_measures.add_baseline(pd.concat(baselines))
core_measures.add_timepoints(pd.concat(timepoints))

In [10]:
# Version and boilerplate README text for the filtered data package.
# The README text is appended to the package description and written to
# README.md further down, so it must be valid Markdown.

version = "0.1.0"
README = """
This data package was structured in accordance with the [data package standards (i.e., frictionless)](https://specs.frictionlessdata.io/).

A Data Package is a simple container format used to describe and package a collection of data (a dataset). 

For tabular data, the data package standard consists of:
1. resource, or file-level metadata (e.g., the `path` to the files in this package)
2. table, or schema-level metadata (e.g, the `fields` or variables in the file)

See the metadata describing this in `data-package.json`.

- Note, for python and R users, software toolkits make the data and metadata easy to use (but feel free to read in using any tool):

    1. [`frictionless-R`](https://github.com/frictionlessdata/frictionless-r): an R implementation of the data package standard
    2. [`frictionless-py`](https://github.com/frictionlessdata/frictionless-py): a python implementation of the data package standard

- For SPSS, Stata, and SAS users, see the `sav` and `dta` versions of each data file.

For values that do not conform to the schema (e.g., missing values), see `report-summary.txt` (and the `report.json`). For more
information on reports, [click on each of the `Data Error` sections here](https://framework.frictionlessdata.io/docs/errors/header.html)


"""

In [11]:
# Sections to keep for each resource, and variables to drop even when their
# section is kept.
baseline_sections = ["Record and linkage", "Demographics"]
timepoints_sections = ["Record and linkage", "PROMIS 29+2/ PROPr"]
included_sections = {
    "baseline": baseline_sections,
    "timepoints": timepoints_sections,
}
excluded_names = ["quarter_enrolled", "current_study_status"]

# Build a filtered copy of every resource that keeps only the wanted fields.
targetresources = []
for name in core_measures.package.resource_names:
    source = core_measures.package.get_resource(name)
    fieldnames = []
    for field in source.schema["fields"]:
        if field["section"] not in included_sections[name]:
            continue
        if field["name"] in excluded_names:
            continue
        fieldnames.append(field["name"])
    keep_fields = steps.field_filter(names=fieldnames)
    targetresources.append(source.transform(steps=[keep_fields]))

# Markdown summary of the included sections: a bold resource name followed
# by a bullet list of its sections.
sections_str = "".join(
    f"**{resourcename}**\n\n"
    + "".join("- " + sectionname + "\n" for sectionname in sectionnames)
    + "\n"
    for resourcename, sectionnames in included_sections.items()
)

excluded_names_str = "\n".join(excluded_names)

# swap the package's resources for the filtered ones
core_measures.package.resources = targetresources

In [12]:
# Name of the subsetted package; used as the output directory when writing
# and as the base name of the zip file that is uploaded.
package_name = "core-measures-promis"

In [13]:
# write the subsetted package into the `package_name` directory
core_measures.write(outdir=package_name)

<jdc_utils.core_measures.core_measures.CoreMeasures at 0x25374426f80>

In [16]:
# Add currently missing elements (title, version, description) to the
# written data package and generate a README.md next to it.
path = Path(package_name + "/data-package.json")
datapackage = json.loads(path.read_text())
new_datapackage = {}
readme = ""
new_datapackage["title"] = "JCOIN Hub Core Measures: PROMIS subset"
readme += "# " + new_datapackage["title"]
readme += "\n\n"
new_datapackage["version"] = version
# space between the label and the number so it reads "version 0.1.0"
readme += f"__version {version}__"
readme += "\n\n"
new_datapackage["description"] = (
    "This data package is a subset of JCOIN Hub Core Measures consisting of:\n\n"
    # no trailing space here: it would indent the first section heading
    "## Sections:\n\n"
    f"{sections_str}"
    f"{README}"
)
readme += new_datapackage["description"]
# The new keys are inserted first so they lead the file; values already
# present in the written data package take precedence over them.
new_datapackage.update(datapackage)


path.write_text(json.dumps(new_datapackage, indent=2))
path.with_name("README.md").write_text(readme)

1538

In [18]:
# Zip the written package.
# NOTE(review): presumably produces f"{package_name}.zip", which the upload cell expects - confirm.
core_measures.zip()

<jdc_utils.core_measures.core_measures.CoreMeasures at 0x25374426f80>

## Upload to JDC

In [19]:
# upload to JDC: settings for updating the existing file record
fileupdate_params = {
    "commons_program": "JCOIN",
    "commons_project": "CollaborativeProjects",
    "commons_bucket": "s3://jcoinprod-default-258867494168-upload",
    "new_file_path": f"{package_name}.zip",
    "file_guid": "dg.6VTS/16ddc96d-ce81-4e97-84bf-200a2d60a284",
    "sheepdog_file_submitter_id": package_name,
    "credentials_path": "../../credentials.json",
}

gen3file = jdc_utils.utils.gen3.Gen3FileUpdate(**fileupdate_params)
# TODO: add indexd params to Gen3File create and update?
# TODO: add authz to indexd params?
# authz is set on the object after construction (see TODOs above)
gen3file.authz = ["/restricted/collaborative_project/promis"]

[2024-01-24 14:51:53,449][  ERROR] backoff: gave up call gen3.auth._write_to_file(<gen3.auth.Gen3Auth object at 0x000002530400BB80>, C:\Users\kranz-michael/.cache/gen3/token_cache_59a84d7ef796aef44d95e557d46cc075, eyJhbGciOiJSUzI1NiIsImtpZCI6ImZlbmNlX2tleV8yMDIwLTAxLTAyVDA2OjM1OjA4WiIsInR5cCI6IkpXVCJ9.eyJwdXIiOiJhY2Nlc3MiLCJpc3MiOiJodHRwczovL2pjb2luLmRhdGFjb21tb25zLmlvL3VzZXIiLCJhdWQiOlsiaHR0cHM6Ly9qY29pbi5kYXRhY29tbW9ucy5pby91c2VyIiwib3BlbmlkIiwiZ29vZ2xlX2NyZWRlbnRpYWxzIiwidXNlciIsImFkbWluIiwiZmVuY2UiLCJnb29nbGVfbGluayIsImRhdGEiLCJnb29nbGVfc2VydmljZV9hY2NvdW50Il0sImlhdCI6MTcwNjEyOTUxMiwiZXhwIjoxNzA2MTMzMTEyLCJqdGkiOiJjYjc3ZjYxOC00M2M0LTRiMGYtYjNiZS1lMTM0NTk4N2E1NzYiLCJzY29wZSI6WyJvcGVuaWQiLCJnb29nbGVfY3JlZGVudGlhbHMiLCJ1c2VyIiwiYWRtaW4iLCJmZW5jZSIsImdvb2dsZV9saW5rIiwiZGF0YSIsImdvb2dsZV9zZXJ2aWNlX2FjY291bnQiXSwiY29udGV4dCI6eyJ1c2VyIjp7Im5hbWUiOiJtYmtyYW56QGdtYWlsLmNvbSIsImlzX2FkbWluIjpmYWxzZSwiZ29vZ2xlIjp7InByb3h5X2dyb3VwIjpudWxsfX19LCJhenAiOiIiLCJzdWIiOiI3OSJ9.oHcpZPZVsqh8J4-GnM4BjYAL

In [20]:
# inspect the latest index record before updating
gen3file.latest_index

{'acl': [],
 'authz': ['/restricted/collaborative_project/promis'],
 'baseid': '89f88d5b-aa85-4234-9d6d-432b2fbb2436',
 'content_created_date': None,
 'content_updated_date': None,
 'created_date': '2024-01-23T15:14:13.597750',
 'description': None,
 'did': 'dg.6VTS/7591072a-ae60-4d26-9cf0-054f78376e65',
 'file_name': 'core-measures-promis.zip',
 'form': 'object',
 'hashes': {'md5': 'c9fe98227ceaf3b436e8b167e05a7d2e'},
 'metadata': {},
 'rev': 'd19f1321',
 'size': 16238484,
 'updated_date': '2024-01-23T15:14:29.304575',
 'uploader': None,
 'urls': ['s3://jcoinprod-default-258867494168-upload/dg.6VTS/7591072a-ae60-4d26-9cf0-054f78376e65/core-measures-promis.zip'],
 'urls_metadata': {'s3://jcoinprod-default-258867494168-upload/dg.6VTS/7591072a-ae60-4d26-9cf0-054f78376e65/core-measures-promis.zip': {}},
 'version': None}

In [21]:
# Call that creates a brand-new record, kept commented out for reference:
# gen3file.create(
#      file_node_submitter_id=package_name,
#         cmc_node_submitter_id=package_name,
#         data_category="Core Measures",
#         data_format="ZIP",
#         data_type="Interview",
#     other_cmc_node_metadata={"title":"Core Measures - Only Demographics and PROMIS"}

#     )
# Update the record configured above with the new zip file.
gen3file.update()

  Submitting 12 records in batches of 100
Submission progress: 1/1


<jdc_utils.utils.gen3.files.Gen3FileUpdate at 0x2537ca50070>

## Deleting test files and records

First grab the most recent records from indexd and sheepdog. Confirm this is indeed the file you just created and uploaded:

Note: if you want to test the update function after creating a new record, you need to update the file GUID first.

In [None]:
# from gen3.auth import Gen3Auth
# from gen3.submission import Gen3Submission
# from gen3.index import Gen3Index
# from gen3.file import Gen3File
# credentials_path = "../../credentials.json"
# index = Gen3Index(Gen3Auth(refresh_file=credentials_path))
# sub = Gen3Submission(Gen3Auth(refresh_file=credentials_path))
# files = Gen3File(Gen3Auth(refresh_file=credentials_path))
# sheepdog_rec = sub.export_node("JCOIN","TEST","reference_file",fileformat="json")

# for record in sheepdog_rec["data"]:
#     if "promis" in record["file_name"]:
#         promis_object_id = record["object_id"]
# index_rec = index.get_record(promis_object_id)

[2023-08-03 11:24:58,738][  ERROR] backoff: gave up call gen3.auth._write_to_file(<gen3.auth.Gen3Auth object at 0x0000022C543DEE00>, C:\Users\kranz-michael/.cache/gen3/token_cache_aa93538a89566456c2ac2f587fc875c1, eyJhbGciOiJSUzI1NiIsImtpZCI6ImZlbmNlX2tleV8yMDIwLTAxLTAyVDA2OjM1OjA4WiIsInR5cCI6IkpXVCJ9.eyJwdXIiOiJhY2Nlc3MiLCJpc3MiOiJodHRwczovL2pjb2luLmRhdGFjb21tb25zLmlvL3VzZXIiLCJhdWQiOlsiaHR0cHM6Ly9qY29pbi5kYXRhY29tbW9ucy5pby91c2VyIiwib3BlbmlkIiwiZ29vZ2xlX3NlcnZpY2VfYWNjb3VudCIsImFkbWluIiwiZmVuY2UiLCJnb29nbGVfY3JlZGVudGlhbHMiLCJkYXRhIiwidXNlciIsImdvb2dsZV9saW5rIl0sImlhdCI6MTY5MTA3OTg5NiwiZXhwIjoxNjkxMDgzNDk2LCJqdGkiOiIzYjc5ZjUyYy0zYmExLTQ1YzItOTU0YS03Mzg1YzcyOWVlOTUiLCJzY29wZSI6WyJvcGVuaWQiLCJnb29nbGVfc2VydmljZV9hY2NvdW50IiwiYWRtaW4iLCJmZW5jZSIsImdvb2dsZV9jcmVkZW50aWFscyIsImRhdGEiLCJ1c2VyIiwiZ29vZ2xlX2xpbmsiXSwiY29udGV4dCI6eyJ1c2VyIjp7Im5hbWUiOiJtYmtyYW56QGdtYWlsLmNvbSIsImlzX2FkbWluIjpmYWxzZSwiZ29vZ2xlIjp7InByb3h5X2dyb3VwIjpudWxsfX19LCJhenAiOiIiLCJzdWIiOiI3OSJ9.M5GpEzf4UhkvJtgzlWSOxPmH

Then delete all file locations and the sheepdog record:

In [None]:
# files.delete_file_locations(index_rec["did"])
# sub.delete_record("JCOIN","TEST",promis_object_id)

Error: status code 404; details:
{"message":"The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again."}


{"message":"The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again."}

Failed to delete uuids: ['dg.6VTS/1e75246d-26ed-4a5d-8f7b-b1790850809c']


HTTPError: 404 Client Error: NOT FOUND for url: https://jcoin.datacommons.io/api/v0/submission/JCOIN/TEST/entities/dg.6VTS/1e75246d-26ed-4a5d-8f7b-b1790850809c