reporting.py
from collections import defaultdict

from d3b_utils.aws_bucket_contents import fetch_bucket_obj_info
from kf_utils.dataservice.scrape import yield_entities


def merge_s3_and_kf_gfs(
    ds_url, study_kfid, study_bucket, exclude_s3_keypaths=None
):
    """Return file data from S3 and the Kids First dataservice merged together
    on external_id, to see which S3 files have been loaded into the data
    service and which loaded files no longer exist in S3.

    Note: You must be able to query both S3 and the dataservice
    (VPN + chopaws if running locally).

    :param ds_url: URL of the Kids First dataservice
    :type ds_url: string
    :param study_kfid: Dataservice KFID of the study
    :type study_kfid: string
    :param study_bucket: Amazon S3 bucket containing study files
    :type study_bucket: string
    :param exclude_s3_keypaths: S3 keys starting with any of these strings
        will be excluded, optional, defaults to None
    :type exclude_s3_keypaths: string or iterable of strings
    :return: merged file records, one dict per file
    :rtype: list of dicts
    """
    # Files from the dataservice, keyed by external_id.
    # We use the API because direct DB queries won't give us the gen3 fields.
    kf = {
        e["external_id"]: {
            f"kf_{k.lower()}": v
            for k, v in e.items()
            if k not in ["_links", "access_urls", "urls"]
        }
        for e in yield_entities(
            ds_url,
            "genomic-files",
            {"study_id": study_kfid},
            show_progress=True,
        )
    }

    # Files from S3, keyed by their s3:// URL (which matches the dataservice
    # external_id for loaded files).
    s3 = {
        "s3://"
        + o["Bucket"]
        + "/"
        + o["Key"]: {f"s3_{k.lower()}": v for k, v in o.items()}
        for o in fetch_bucket_obj_info(
            study_bucket,
            drop_folders=True,
        )
    }

    # Sadly it's muuuuch harder to exclude paths on the S3 request side,
    # because the S3 API doesn't support it. So we're stuck for now waiting
    # for potentially thousands of pagination requests that we don't care
    # about, and then we remove those objects here.
    if exclude_s3_keypaths:
        if isinstance(exclude_s3_keypaths, str):
            exclude_s3_keypaths = (exclude_s3_keypaths,)
        else:
            # str.startswith accepts a tuple of prefixes
            exclude_s3_keypaths = tuple(exclude_s3_keypaths)
        s3 = {
            k: v
            for k, v in s3.items()
            if not v["s3_key"].startswith(exclude_s3_keypaths)
        }

    # Merge the two sets of records on their shared keys. Records found only
    # in S3 keep just their s3_* fields, records found only in the dataservice
    # keep just their kf_* fields, and records found in both get both.
    s3kf = defaultdict(dict, s3)
    for k, v in kf.items():
        s3kf[k].update(v)
    return list(s3kf.values())
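

# A minimal usage sketch, not part of the original module: the dataservice
# URL, study KFID, bucket name, and excluded key prefix below are placeholder
# values you would substitute with your own. It partitions the merged records
# to show which S3 files have not been loaded into the dataservice and which
# loaded files are no longer found in S3, relying only on the kf_*/s3_* key
# prefixes the function assigns above.
if __name__ == "__main__":
    merged = merge_s3_and_kf_gfs(
        "http://localhost:5000",         # hypothetical dataservice URL
        "SD_00000000",                   # hypothetical study KFID
        "example-study-bucket",          # hypothetical S3 bucket
        exclude_s3_keypaths="scratch/",  # hypothetical excluded key prefix
    )
    in_s3_only = [
        r for r in merged if not any(k.startswith("kf_") for k in r)
    ]
    in_kf_only = [
        r for r in merged if not any(k.startswith("s3_") for k in r)
    ]
    in_both = len(merged) - len(in_s3_only) - len(in_kf_only)
    print(f"{len(in_s3_only)} S3 files not yet loaded into the dataservice")
    print(f"{len(in_kf_only)} loaded files no longer found in S3")
    print(f"{in_both} files present in both")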