My goal in this notebook is to normalize the attribute type names. I am going through all of the sample attribute types (`sample.attributes.name`) from the SRA and deciding whether each one is related to `sex`, `developmental_stage`, `tissue`, or `cell_type`. I am storing the mappings in `config/attribute_type.yaml`.

In [1]:
from time import sleep
import joblib
import yaml
import pyperclip
from more_itertools import flatten
from pymongo import MongoClient
from IPython.display import display, Markdown, clear_output

# Get a list of Attribute Types

In [2]:
# SRX accessions used to restrict every query below (presumably the RNA-Seq
# inliers from the library-strategy workflow, going by the file name).
# NOTE(review): joblib.load unpickles the file; only load pickles you trust.
rnaseq_srxs = joblib.load("../output/library_strategy-wf/rnaseq_inliers.pkl")

with MongoClient() as client:
    db = client["sramongo"]
    ncbi = db["ncbi"]

    # Collect every distinct sample attribute name among the selected SRXs.
    # The cursor is consumed inside the `with` block, before the client closes.
    unique_attribute_types = {
        x["attribute_type"]
        for x in ncbi.aggregate(
            [
                {"$match": {"srx": {"$in": rnaseq_srxs}}},
                {"$unwind": {"path": "$sample.attributes"}},
                {"$project": {"_id": 0, "attribute_type": "$sample.attributes.name"}},
            ]
        )
    }

# Sort case-insensitively, using the exact string as a tie-breaker. Without the
# tie-breaker, names that differ only by case would keep the set's iteration
# order, which can differ between kernel sessions.
attribute_types = sorted(unique_attribute_types, key=lambda x: (x.lower(), x))

print(f"There are {len(attribute_types):,} unique attribute types.")

There are 347 unique attribute types.


# Map Attribute Types to CV

In [3]:
def get_values(attribute_type):
    """Return the set of all values recorded for one attribute type.

    Only documents whose ``srx`` is in the module-level ``rnaseq_srxs`` are
    considered.

    Parameters
    ----------
    attribute_type : str
        Attribute name to look up, compared against ``sample.attributes.name``.

    Returns
    -------
    set
        The distinct ``sample.attributes.value`` entries found for that name.
    """
    with MongoClient() as client:
        db = client["sramongo"]
        ncbi = db["ncbi"]

        return {
            x["attribute_value"]
            for x in ncbi.aggregate(
                [
                    # Narrow to the relevant documents *before* unwinding, so the
                    # $unwind only expands samples that can contribute a value.
                    {"$match": {"srx": {"$in": rnaseq_srxs}, "sample.attributes.name": attribute_type}},
                    {"$unwind": {"path": "$sample.attributes"}},
                    # After unwinding, keep only the attribute entries with the requested name.
                    {"$match": {"sample.attributes.name": attribute_type}},
                    {"$project": {"_id": 0, "attribute_value": "$sample.attributes.value"}},
                ]
            )
        }

In [12]:
# Print out attributes not yet in the mapper.

def keep_running(config_path="config/attribute_type.yaml", poll_seconds=5):
    """Show attribute types missing from the YAML mapping, one at a time.

    On every pass the mapping file is re-read, the first entry of the
    module-level ``attribute_types`` that is not listed in it is displayed as a
    heading, a ready-to-paste YAML list item for it is copied to the clipboard,
    and its values are printed. The function then waits and checks again, so the
    same attribute stays on screen until it has been added to the file. It
    returns once every attribute type appears in the mapping.

    Parameters
    ----------
    config_path : str
        Path to the YAML file whose values are lists of attribute type names.
    poll_seconds : int or float
        Seconds to wait between passes.
    """
    # A loop rather than self-recursion: the stack no longer grows with every
    # poll, and no outer frame is left holding a stale copy of the mapping.
    while True:
        # Re-read the file each pass so edits saved while waiting are picked up.
        with open(config_path) as handle:
            mapping = yaml.safe_load(handle) or {}  # an empty file loads as None

        # A category with no entries yet loads as None; treat it as an empty list.
        mapped_attributes = set(flatten(names or [] for names in mapping.values()))

        pending = next(
            (name for name in attribute_types if name not in mapped_attributes),
            None,
        )
        if pending is None:
            return

        clear_output(wait=True)
        display(Markdown(f"## {pending}"))
        pyperclip.copy(f'  - "{pending}"\n')
        print(get_values(pending), end="\r")
        sleep(poll_seconds)

keep_running()

## zygotic genotype

{'l(3)mbt[GM76]/Df', 'l(3)mbt[GM76]/+'}