Skip to content

Commit

Permalink
✨ Add access_urls to indexd models to point to gen3 locations
Browse files Browse the repository at this point in the history
  • Loading branch information
dankolbman committed Dec 12, 2018
1 parent e6ba013 commit 9be2d69
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 3 deletions.
2 changes: 2 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class Config:
# Credentials used to authenticate against the indexd service
INDEXD_USER = os.environ.get('INDEXD_USER', 'test')
INDEXD_PASS = os.environ.get('INDEXD_PASS', 'test')

# Base url of the gen3 data endpoint; used by the model's access_urls
# property to resolve internal s3:// urls to downloadable gen3 locations
GEN3_URL = os.environ.get('GEN3_URL', 'gen3')

# Optional bucket service integration; both default to disabled (None)
BUCKET_SERVICE_URL = os.environ.get('BUCKET_SERVICE_URL', None)
BUCKET_SERVICE_TOKEN = os.environ.get('BUCKET_SERVICE_TOKEN', None)
# SNS topic ARN for emitting events; None disables publishing
SNS_EVENT_ARN = os.environ.get('SNS_EVENT_ARN', None)
Expand Down
21 changes: 20 additions & 1 deletion dataservice/api/common/model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime
from flask import abort
from flask import abort, current_app
from requests.exceptions import HTTPError
import sqlalchemy.types as types
from sqlalchemy import event, inspect
Expand Down Expand Up @@ -105,6 +105,25 @@ def constructor(self):
# Update fields from indexd
self.merge_indexd()

@property
def access_urls(self):
    """
    Links out to gen3 data endpoints used to download the files
    themselves.

    Urls that are already ``https://`` are considered valid gen3
    locations and passed through unchanged. Urls using the ``s3://``
    protocol are assumed to be internal files and are resolved to our
    gen3 service using this file's latest indexd did.
    """
    # The resolved gen3 location is the same for every s3 url of this
    # file, so build it once up front
    gen3_location = (f'{current_app.config["GEN3_URL"]}'
                     f'/data/{self.latest_did}')
    return [gen3_location if location.startswith('s3://') else location
            for location in self.urls]

def merge_indexd(self):
"""
If the document matching this object's latest_did cannot be found in
Expand Down
1 change: 1 addition & 0 deletions dataservice/api/common/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def check_unknown_fields(self, data, original_data):

class IndexdFileSchema(Schema):
    """Marshmallow fields for file metadata stored in indexd."""
    # Raw storage locations (s3:// or https://) registered in indexd
    urls = ma.List(ma.Str(), required=True)
    # Read-only list derived from urls by the model's access_urls
    # property; dump_only so it is never accepted on input
    access_urls = ma.List(ma.Str(), dump_only=True)
    # Access control list for the file
    acl = ma.List(ma.Str(), required=False)
    file_name = ma.Str()
    # Checksums keyed by algorithm name, e.g. {'md5': '...'}
    hashes = ma.Dict(required=True)
Expand Down
1 change: 0 additions & 1 deletion tests/genomic_file/test_genomic_file_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,6 @@ def gf_se(bs, strategy='wgs'):
# file has been derived from, in this case, 'wgs' and 'wxs'
# assert set(gf3.experiment_strategy) == {'wxs', 'wgs'}


# TODO Check that file is not deleted if deletion on indexd fails

def _create_save_genomic_files(self):
Expand Down
14 changes: 13 additions & 1 deletion tests/genomic_file/test_genomic_file_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,18 @@ def test_filter_by_bs(client, indexd):
assert gf['external_id'] in _ids


def test_access_urls(client):
    """
    The access_urls field should be derived from the urls field by
    replacing s3:// locations with gen3 http locations and passing
    https:// locations through unchanged
    """
    rgs, gfs, studies = _create_all_entities()
    # Fetch the serialized form of the first genomic file of the first study
    gf = list(gfs.values())[0][0]
    gf = client.get(f'/genomic-files/{gf.kf_id}').json['results']
    # s3://mybucket/key resolves to <GEN3_URL>/data/<did>; the https url
    # is passed through as-is (no placeholders, so no f-string needed)
    assert gf['access_urls'] == [f'gen3/data/{gf["latest_did"]}',
                                 'https://gen3.something.com/did']


def _new_genomic_file(client, include_seq_exp=True):
""" Creates a genomic file """
body = {
Expand Down Expand Up @@ -494,7 +506,7 @@ def _create_all_entities():
participant=p)
gf = GenomicFile(
external_id='study{}-gf{}'.format(j, i),
urls=['s3://mybucket/key'],
urls=['s3://mybucket/key', 'https://gen3.something.com/did'],
hashes={'md5': 'd418219b883fce3a085b1b7f38b01e37'})
study_gfs.append(gf)
b.genomic_files.append(gf)
Expand Down

0 comments on commit 9be2d69

Please sign in to comment.