## Which projects

This notebook iterates over the Elasticsearch (ES) query results returned by the DSS and builds a list of the unique projects covered by the matching bundles.

Edit the Elasticsearch query below to change the search criteria.
Note that the list of blacklisted projects was taken from https://github.com/DataBiosphere/azul/blob/c0937a7dcbd6f70f6d28ce004f4005c8eac0d9dc/src/azul/project/hca/__init__.py#L90

In [10]:
# Project UUIDs that must be excluded from the search results. The list was
# copied from the Azul blacklist linked in the notebook introduction; the two
# UUIDs that were listed twice there appear only once here (first position kept).
BLACKLISTED_PROJECT_UUIDS = [
    "1630e3dc-5501-4faf-9726-2e2c0b4da6d7",
    "fd1d163d-d6a7-41cd-b3bc-9d77ba9a36fe",
    "2a0faf83-e342-4b1c-bb9b-cf1d1147f3bb",
    "cf8439db-fcc9-44a8-b66f-8ffbf729bffa",
    "6b9f514d-d738-403f-a9c2-62580bbe5c83",
    "311d013c-01e4-42c0-9c2d-25472afa9cbc",
    "d237ed6a-3a7f-4a91-b300-b070888a8542",
    "e6cc0b02-2125-4faa-9903-a9025a62efec",
    "e4dbcb98-0562-4071-8bea-5e8de5f3c147",
    "e79e9284-c337-4dfd-853d-66fa3facfbbd",
    "560cd061-9165-4699-bc6e-8253e164c079",
    "e83fda0e-6515-4f13-82cb-a5860ecfc2d4",
    "9a60e8c2-32ea-4586-bc1f-7ee58f462b07",
    "71a6e049-4846-4c2a-8823-cc193c573efc",
    "4b5a2268-507c-46e6-bab0-3efb30145e85",
    "364ebb73-652e-4d32-8938-1c922d0b2584",
    "11f5d59b-0e2c-4f01-85ac-8d8dd3db53be",
    "c1996526-6466-40ff-820f-dad4d63492ec",
    "c281dedc-e838-4464-bf51-1cc4efae3fb9",
    "40afcf6b-422a-47ba-ba7a-33678c949b5c",
    "0facfacd-5b0c-4228-8be5-37aa1f3a269d",
    "76c209df-42bf-41dc-a5f5-3d27193ca7a6",
    "bb409c34-bb87-4ed2-adaf-6d1ef10610b5",
    "1a6b5e5d-914f-4dd6-8817-a1f9b7f364d5",
    "dd401943-1059-4b2d-b187-7a9e11822f95",
]

# Elasticsearch query sent to the DSS search endpoint.
# Edit the "must" clauses to change the search criteria.
initial_q = {
    "query": {
        "bool": {
            # A bundle is returned only if it satisfies every clause in "must".
            "must": [
                {
                    # Library construction approach, given as an ontology term.
                    # NOTE(review): presumably EFO:0008931 is Smart-seq2 - verify against EFO.
                    "match": {
                        "files.library_preparation_protocol_json.library_construction_approach.ontology": "EFO:0008931"
                    }
                },
                {
                    # The value is deliberately the string 'true', not a Python bool.
                    "match": {
                        "files.sequencing_protocol_json.paired_end": 'true'
                    }
                },
                {
                    # NCBI taxon id of the donor organism (9606 = Homo sapiens).
                    "match": {
                        "files.donor_organism_json.biomaterial_core.ncbi_taxon_id": 9606
                    }
                },
            ],
            # Bundles belonging to any blacklisted project are dropped.
            "must_not": [
                {
                    "terms": {
                        # Copy the list so later edits to the query cannot alter the constant.
                        "files.project_json.provenance.document_id": list(BLACKLISTED_PROJECT_UUIDS)
                    }
                }
            ],
        }
    }
}

In [None]:
from tqdm import tqdm_notebook
import hca.dss
from hca.dss import DSSClient

# Distinct project UUIDs, kept in the order in which they are first seen.
projects = []
dss_client = DSSClient(swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")

# Iterator over every raw search hit for `initial_q` on the AWS replica.
search_hits = dss_client.post_search.iterate(replica="aws", es_query=initial_q, output_format="raw")

for hit in tqdm_notebook(search_hits):
    # The project UUID is read from the first project_json document of the hit.
    bundle_files = hit.get('metadata').get('files')
    first_project = bundle_files.get('project_json')[0]
    found_uuid = first_project.get('provenance').get('document_id')
    if found_uuid in projects:
        continue  # this project has already been recorded
    projects.append(found_uuid)

print('These projects match your query {}'.format(projects))


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

## Project UUID more info

This section looks up additional information (short name, title and supplementary links) for each project found by the first part of the notebook.

In [8]:
import json
import sys
import hca.dss
from hca.dss import DSSClient
import pandas as pd

# Single "terms" clause selecting bundles by project UUID. "PROJECT_UUID" is a
# placeholder that is replaced with a real project UUID for each lookup.
_project_id_clause = {
    "terms": {
        "files.project_json.provenance.document_id": ["PROJECT_UUID"]
    }
}

# Query template used to fetch the bundles of one specific project.
project_q = {"query": {"bool": {"must": [_project_id_clause]}}}


dss_client = DSSClient(swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")

# Maps project UUID -> dict holding the columns of the summary table.
project_info = {}
for uuid in projects:
    # Work on a deep copy of the template (a JSON round trip, which also
    # guarantees the query is JSON-serialisable) so that `project_q` keeps its
    # placeholder and this cell gives the same result every time it is run.
    q = json.loads(json.dumps(project_q))
    q['query']['bool']['must'][0]['terms']["files.project_json.provenance.document_id"] = [uuid]

    # Only the first hit is used.
    # NOTE(review): assumes every bundle of a project carries the same project_json - confirm.
    # The `None` default prevents an empty result from ending the cell with a bare StopIteration.
    rel = next(dss_client.post_search.iterate(replica="aws", es_query=q, output_format="raw"), None)
    if rel is None:
        print('No bundle found for project {}; skipping it'.format(uuid), file=sys.stderr)
        continue

    project_i = rel.get('metadata').get('files').get('project_json')[0]
    project_uuid = project_i.get('provenance').get('document_id')
    project_core = project_i.get('project_core')
    # Key order here fixes the column order of the DataFrame built below.
    project_info[project_uuid] = {
        'Project short name': project_core.get('project_short_name'),
        'Project title': project_core.get('project_title'),
        'Links': project_i.get('supplementary_links'),
    }

# One row per project, indexed by project UUID.
df = pd.DataFrame.from_dict(project_info, orient='index')
df




Unnamed: 0,Project short name,Project title,Links
0ec2b05f-ddbe-4e5a-b30f-e81f4b1e330c,CD4+ cytotoxic T lymphocytes,Precursors of human CD4+ cytotoxic T lymphocyt...,[https://www.ebi.ac.uk/gxa/sc/experiments/E-GE...
1a0f98b8-746a-489d-8af9-d5c657482aab,Healthy and type 2 diabetes pancreas,Single-cell RNA-seq analysis of human pancreas...,[https://www.ebi.ac.uk/gxa/sc/experiments/E-MT...
aabbec1a-1215-43e1-8e42-6489af25c12c,Fetal/Maternal Interface,Reconstructing the human first trimester fetal...,[https://www.ebi.ac.uk/arrayexpress/experiment...
e8642221-4c2c-4fd7-b926-a68bce363c88,Single cell transcriptome analysis of human pa...,Single cell transcriptome analysis of human pa...,[https://www.ebi.ac.uk/gxa/sc/experiments/E-GE...
f8880be0-210c-4aa3-9348-f5a423e07421,An in vitro model of human inhibitory interneu...,Single-cell RNA-seq analysis throughout a 125...,[https://www.ebi.ac.uk/gxa/sc/experiments/E-GE...


## Save

This stub saves the results and the original query. **Make sure you change the file name** if you are running multiple queries.

In [9]:
# Base name shared by both output files; change it for every new query so that
# earlier results are not overwritten.
output_name = 'paired_end_true_nshould'

# Results table -> <output_name>.csv
df.to_csv('{}.csv'.format(output_name))

# Query that produced the results -> <output_name>.json
with open('{}.json'.format(output_name), 'w') as query_file:
    json.dump(initial_q, query_file)





