# Preparing for JDC uploads

This notebook contains the code that will be refactored/packaged into data packaging/upload functions

1. Creating joint dataset
2. writing core measures to Stata and SPSS files
3. compress
4. map to sheepdog
5. upload both sheepdog and core measure files

In [4]:
import json
from pathlib import Path
from jdc_utils.utils import zip_package
from jdc_utils.encoding import core_measures as encodings
from dataforge.frictionless.categoricals import table_encode
from frictionless import Resource,Package,transform

from jdc_utils.schema import core_measures as schemas

import os

In [5]:
jdc_utils_pwd = Path().resolve().parent

In [6]:
os.chdir(jdc_utils_pwd)
datapackage_paths = [
    path.resolve() 
    for path in Path().glob("../*/*/core-measures-*/") 
    if path.is_dir() and not 'joint' in path.name
]

In [7]:
datapackage_paths

[WindowsPath('C:/Users/kranz-michael/projects/jcoin-chestnut/tmp/core-measures-chestnut'),
 WindowsPath('C:/Users/kranz-michael/projects/jcoin-nyu/tmp/core-measures-nyu'),
 WindowsPath('C:/Users/kranz-michael/projects/jcoin-uky/tmp/core-measures-uky'),
 WindowsPath('C:/Users/kranz-michael/projects/jcoin-yale-hiv/tmp/core-measures-yale_hiv')]

# Make combined data package

In [6]:
os.chdir(jdc_utils_pwd)
jointdatapath = jdc_utils_pwd/'tmp'/'core-measures-joint'
jointdatapath.mkdir(exist_ok=True,parents=True)

schemapath = jointdatapath.joinpath('schemas')
schemapath.mkdir(exist_ok=True,parents=True)

num_valid_packages = sum([json.loads(p.joinpath('report.json').read_text())['valid'] 
    for p in datapackage_paths])
assert num_valid_packages==len(datapackage_paths)

# write resources
for resource_name in ['baseline','time-points']:

    resource_schema = getattr(schemas,resource_name.replace('-','')) # timepoints v time-points
    resource_schema.to_json(schemapath/(resource_name+'.json'))

    source_paths = [(p/'data'/f"{resource_name}.csv").as_posix() for p in datapackage_paths]
    source = Resource(path=source_paths,schema=resource_schema)
    source.write(jointdatapath/'data'/f'{resource_name}.csv')

    target_spss = transform(source,steps=[table_encode(encodings.fields,encodings.reserve['spss'])])
    target_stata = transform(source,steps=[table_encode(encodings.fields,encodings.reserve['stata'])])
    target_spss.write(jointdatapath/'data'/(resource_name+".sav"))
    target_stata.write(jointdatapath/'data'/(resource_name+".dta"))

In [7]:
# init Package
os.chdir(jointdatapath.parent)
package = Package(
    title="Cross-hub dataset",
    description="A combined data package of all submitted hub data to-date",
    resources=[Resource(name=resource,path=f'data/{resource}')
         for resource in ['baseline','time-points']]
)
package.to_json('data-package.json')

'{\n  "resources": [\n    {\n      "name": "baseline",\n      "path": "data/baseline"\n    },\n    {\n      "name": "time-points",\n      "path": "data/time-points"\n    }\n  ],\n  "title": "Cross-hub dataset",\n  "description": "A combined data package of all submitted hub data to-date"\n}'

## Write Stata and SPSS

In [None]:
os.chdir(jdc_utils_pwd)



In [5]:


for path in datapackage_paths:
    os.chdir(path)
    report_path = Path('report.json')
    if report_path.exists():
        report = json.loads(report_path.read_text()) #this is a cherry picked version just to indicate whether datapackage is valid
        package = Package('data-package.json')
    else:
        report = {'valid':False}
    if report['valid']:
        print(package['name'])
        #write stata and spss
        for source in package['resources']:
            print(f"-->{source['name']}")
            source_path = Path(source['path'])
            target_spss = transform(source,steps=[table_encode(encodings.fields,encodings.reserve['spss'])])
            target_stata = transform(source,steps=[table_encode(encodings.fields,encodings.reserve['stata'])])
            target_spss_params = target_spss.write(source_path.with_suffix(".sav"))
            target_stata_params = target_stata.write(source_path.with_suffix(".dta"))
    else:
        print(f"{path.stem} is not valid or report.json not generated so skipping stata/spss generation")

core-measures-chestnut
-->baseline
-->time-points
core-measures-nyu
-->baseline
-->time-points
core-measures-uky
-->baseline
-->time-points
core-measures-yale_hiv
-->baseline
-->time-points


In [9]:
#compress to directory
for path in datapackage_paths:
    zip_path = jdc_utils_pwd/'tmp'
    zip_path.mkdir(exist_ok=True)
    zip_package(path,jdc_utils_pwd/'tmp')