![](https://img.shields.io/badge/4/4-lightgrey)

# Link cellxgene-census files to registered metadata

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import cellxgene_census

In [None]:
# Register this notebook run with lamindb before touching any records.
ln.track()

In [None]:
# Open the Census SOMA collection, pinned to the 2023-07-25 release.
census = cellxgene_census.open_soma(census_version="2023-07-25")

In [None]:
# Load the census dataset table into pandas; the loop below relies on its
# `dataset_id` and `soma_joinid` columns.
datasets_df = census["census_info"]["datasets"].read().concat().to_pandas()

In [None]:
# Display the dataset table.
datasets_df

In [None]:
# Unfiltered query over lamindb File records; narrowed per dataset in the loop below.
files = ln.File.filter()

In [None]:
# Per-organism caches so the presence matrix and gene table are fetched
# only once per organism across all datasets.
presence_matrix_dict = {}
gene_metadata_dict = {}
# Dataset ids whose obs query returned no rows.
no_data = []

In [None]:
# For each census dataset: look up its registered file, derive an obs feature
# set (from the cell metadata columns) and a var feature set (from the genes
# present in the dataset), attach both to the file, and add categorical labels.
# NOTE(review): iteration starts at row 499 -- presumably resuming an
# interrupted run; confirm before re-running from scratch.
for _, row in datasets_df.iloc[499:].iterrows():
    print(f"dataset: {row.dataset_id}")

    # Exactly one file is expected per dataset id; `.one()` enforces this.
    file = files.filter(description__contains=row.dataset_id).one()

    # Get the organism of *this* file (not of the first file in the query),
    # so the matching census experiment and gene tables are used.
    organism_record = file.organism.all().one()
    lb.settings.organism = organism_record.name
    organism = organism_record.scientific_name
    census_data = census["census_data"][organism]

    feature_sets = {}

    # obs feature set
    obs = (
        census_data.obs.read(value_filter=f"dataset_id == '{row.dataset_id}'")
        .concat()
        .to_pandas()
    )
    if obs.shape[0] == 0:
        # Nothing to link for this dataset; remember it and move on.
        no_data.append(row.dataset_id)
        continue
    # Columns ending in "_id" are excluded from the obs feature set.
    feature_set_obs = ln.FeatureSet.from_df(
        obs.loc[:, ~obs.columns.str.endswith("_id")],
    )
    feature_sets["obs"] = feature_set_obs

    # var feature set
    # Fetch the presence matrix and gene table once per organism.
    if organism not in presence_matrix_dict:
        presence_matrix_dict[organism] = cellxgene_census.get_presence_matrix(
            census, organism=organism, measurement_name="RNA"
        )
    presence_matrix = presence_matrix_dict[organism]
    # Column indices of the non-zero entries in this dataset's row are the
    # soma_joinids of the genes present in the dataset.
    var_joinid = presence_matrix[row.soma_joinid, :].tocoo().col
    if organism not in gene_metadata_dict:
        gene_metadata_dict[organism] = (
            census_data.ms["RNA"].var.read().concat().to_pandas()
        )
    gene_metadata = gene_metadata_dict[organism]
    var = gene_metadata.loc[gene_metadata.soma_joinid.isin(var_joinid)]
    feature_set_var = ln.FeatureSet.from_values(
        var.feature_id,
        lb.Gene.ensembl_gene_id,
        type="number",
    )
    feature_sets["var"] = feature_set_var

    # link two feature sets to file
    file._feature_sets = feature_sets
    file.save()

    # add labels to file
    for feature in feature_set_obs.members:
        if feature.type == "category":
            file.labels.add(obs[feature.name], feature)

In [None]:
# Inspect the last file looked up by the loop above.
file.describe()

## no data datasets

Querying the census for the following datasets didn't return any data:

In [None]:
# Dataset ids skipped by the loop because their obs query came back empty.
no_data

In [None]:
# Number of skipped datasets.
len(no_data)