In [3]:
import anndata as ad
import cellxgene_census

import pronto
import warnings
warnings.filterwarnings("ignore", category=pronto.warnings.ProntoWarning)


import pandas as pd
import numpy as np
from scipy import sparse
import copy
import time
import sys
import os
import pickle
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler

from torcheval.metrics.functional import multilabel_accuracy

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



## Load Cell MetatData from the Census. 

Great-lakes does not have internet access, so we pull in the data outside first.

In [4]:
# gene and cell type info stored on Turbo
os.chdir('/nfs/turbo/umms-welchjd/mccell')

In [5]:
# load gene list
biomart = pd.read_csv('mart_export.txt')

coding_only = biomart[biomart['Gene type'] == 'protein_coding']

gene_list = coding_only['Gene stable ID'].to_list()

var_val_filter = '''feature_id in {}'''.format(gene_list)

# load the cell type list
cell_type_list_name = 'cell_type_list.txt'
with open(cell_type_list_name,'rb') as fp:
    cell_type_list = pickle.load(fp)

obs_val_filter = '''assay == "10x 3\' v3" and is_primary_data == True and cell_type_ontology_term_id in {}'''.format(cell_type_list)


In [6]:
census = cellxgene_census.open_soma(uri = "/scratch/welchjd_root/welchjd99/fujoshua/soma")


Now, get the metadata for our search. We can then use this to preprocess everything. 

In [23]:
target_column = ['cell_type_ontology_term_id']


cell_obs_metadata = (
    census["census_data"]["homo_sapiens"].obs.read(value_filter = obs_val_filter,
                                                   column_names=target_column).concat().to_pandas()
    )




In [25]:
cell_obs_metadata

Unnamed: 0,cell_type_ontology_term_id,assay,is_primary_data
0,CL:0000763,10x 3' v3,True
1,CL:0000763,10x 3' v3,True
2,CL:0000763,10x 3' v3,True
3,CL:0000763,10x 3' v3,True
4,CL:0000542,10x 3' v3,True
...,...,...,...
2726024,CL:0000860,10x 3' v3,True
2726025,CL:0000860,10x 3' v3,True
2726026,CL:0000860,10x 3' v3,True
2726027,CL:0000940,10x 3' v3,True


## Load the Cell Ontology

The Cell Ontology also needs to be loaded for access. 

You can visualize the ontology using https://www.ebi.ac.uk/ols4/ontologies/cl

And you can download the ontology file here: https://obofoundry.org/ontology/cl.html

In [17]:
os.chdir('/scratch/welchjd_root/welchjd99/fujoshua')


In [19]:
cl = pronto.Ontology.from_obo_library('cl.owl')


## Data and Ontology Preprocessing

To prepare the data for modeling, we need to perform some preprocessing on the data and the ontology. We'll use three functions to make this happen. Full descriptions of these functions can be found in the functions. 


- set_internal_node_values: build a dictionary to set which internal nodes are to be used in the loss calculation for internal nodes in the data

- build_parent_mask: builds a masking matrix to use for masking internal node loss values

- preprocess_data_ontology: this function encodes the AnnData object, splits apart the target values and primary data, and calculates some important variables from the Cell Ontology for later use 

- transform_data: transforms the data with log(1+x)

- split_format_data: splits the data into train and validation sets, and moves the variables to PyTorch tensors 


In [20]:
def set_internal_node_values(internal_values,all_parent_nodes):
    '''
    Creates a dictionary where each key is an internal cell type and the values are the cell types
    we want to include when calculating the loss. We do not want to consider direct descendents of the
    internal cell type, so those are removed. 
    
    In other words, when calculating the loss for an internal node, we want to include all internal 
    nodes in the ontology EXCEPT those that are direct descendants of the target internal node. 
    
    Parameters
    ----------
    internal_values : list
        list of internal values that are included in the dataset
            
    all_parent_nodes : list
        from the dataset, a list of parent nodes in the ontology. Used to remove portions of
        the Ontology where we do not have child data
    
    Returns
    ----------
    parent_dict : dictionary
        keys are internal_values and values are all internal cell ontology terms EXCEPT descendents 
        of the internal value. The internal value is always included
    '''
    
    parent_dict = {}

    # loop through each value to calculate the values to include in parent_dict for that
    # internal value
    for internal_node in internal_values:
        # 1) get the children of this internal_node
        child_nodes = []
        for term in cl[internal_node].subclasses(distance=None,with_self=False).to_set():
            child_nodes.append(term.id)
        
        # 2) remove those values from all_parent_nodes
        cell_types_to_include = [x for x in all_parent_nodes if x not in child_nodes]
        
        # 3) create dictionary
        parent_dict[internal_node] = cell_types_to_include
    
    return(parent_dict)


In [21]:
def build_parent_mask(leaf_values,internal_values,ontology_df,parent_dict):
    '''
    Function to build a masking matrix for use when calculating the internal loss
    
    Uses parent_dict to denote, for internal cell types, which parents to include in the loss
    calculation. 
    
    Parameters
    -------
    leaf_values : list
        list composed of all leaf values included in the dataset
        includes internal nodes that do not have sub-values in the dataset, and thus are
        treated an leaf nodes

    internal_values : list
        list composed of interanal nodes in the dataset

    ontology_df : pandas dataframe
        pandas dataframe where indices (rows) are all leaf and parent cell IDs from the portion of 
        the ontology being queried, and columns are all leafs in portion of ontology being queried. 
        
        Dataframe is binary. For each parent node, element = 1 if parent node is an ancestor
        of corresponding leaf node.
        
    parent_dict : dictionary
        keys are internal_values and values are all internal cell ontology terms EXCEPT descendents 
        of the internal value. The internal value is always included
    
    Returns
    -------
    cell_parents_mask : tensor
        tensor of shape ik, where i = parent IDs and k = each cell type in the dataset
        binary tensor where 1 means for that cell type, that parent ID will be included
        in the internal loss calculation
        and 0 means for that cell type, that parent ID is excluded in the internal loss
        calculation
    
    '''
    num_leafs = len(leaf_values)
    num_parents = ontology_df.shape[0]

    # internal_values are included as column values AND rows


    # for cell_parents_to_include, each column is a cell type included in the
    # dataset, so it is length = len(leaf_values) + len(internal_values)
    # the row values are the total number of parents included for the dataset 
    # for each internal value, we need to pick (1/0) if we include that parent
    # for the loss. For this, we reference parents_dict
    # WHAT is the order of the cell IDs for the rows???? This is important
    # This needs to match what we are already doing later, so let's go figure that out FIRST. 

    # for the leaf values, we want to include ALL parents in the 
    # loss calculation. So, we initialize the tensor as a ones tensor
    # based on the number of leaf values and the number of parents
    cell_parents_mask = torch.ones(num_parents,num_leafs)

    # now we can deal with the internal values. For these, we will not
    # include all parents. We will use parent_dict to select which to include


    # first, get a list of all the parents. The ordering of this list
    # is used later to propogate probabilities up the ontology.
    list_of_parents = ontology_df.index.tolist()

    # now, we need to loop through each internal value
    # internal_values is ordered as -9999 + n
    # this will be helpful later when we need to pull these values out. 
    # so the columns here are ordered at 0 to (number of leaf values), then -9999
    # to (number of internal values)

    for cell_id in internal_values:
        # get the list of parent cell IDs we want to include for this
        # particular internal_values
        parent_list_for_cell = parent_dict[cell_id]

        # loop through the parent_list_for cell, create a new binary list where
        # list is 1 if the parent is in the list_of_parents, otherwise 0
        parent_binary_list = [1 if parent in parent_list_for_cell else 0 for parent in list_of_parents]

        # convert the list to a tensor and reshape for concatenation
        parent_binary_tensor = torch.tensor(parent_binary_list).reshape(-1,1)

        # append to cell_parents_to_include. 
        # we append along columns
        cell_parents_mask = torch.cat((cell_parents_mask,parent_binary_tensor),1)

    return(cell_parents_mask)



In [26]:
def preprocess_data_ontology(obs_metadata, target_column,upper_limit = None, cl_only = False, include_leafs = False):
    '''
    This function perfroms preprocessing on ann AnnData object to prepare it for modelling. It will encode the 
    target column and returns x_data and y_data for modelling
    
    This function also preprocesses the ontology to build a pandas dataframe that can be used to 
    calculate predicted probabilities. This will enable simple matrix multiplication to calculate
    probabilities and loss.
    
    Can have an upper limit to the ontology if upper_limit is set
    
    
    Assumes there is an active census object already open as cl. 

    
    
    Parameters
    ----------
    cell_obs_metadata : Pandas DataFrame
        DataFrame from census.obs.read() 
        
    target_column : string
        string of target column (from cell metadata) to encode
     
    upper_limit : string
        if you want to specify an upper limit in the ontology, set this to 
        the upper limit (inclusive)
        Default: None (no limit to ontology)
        
    cl_only : boolean
        option to only include the Cell Ontology (CL) in the dataframe
        True means only those cell IDs that start with CL are included
        Default: False
        
    include_leafs : boolean
        option to include leafs in the list of parent cell IDs
        Default is False because we are calculating the leaf loss differently
        Default: False
        
    Returns
    -------        
    mapping_dict : Dictionary
        dictionary mapping the Cell Ontology IDs (keys) to the encoded values (values)
        Values >= 0 are leaf nodes
        Values < 0 are internal nodes

    leaf_values : list
        list composed of all leaf values included in the dataset
        includes internal nodes that do not have sub-values in the dataset, and thus are
        treated an leaf nodes

    internal_values : list
        list composed of interanal nodes in the dataset

    ontology_df : pandas dataframe
        pandas dataframe where indices (rows) are all leaf and parent cell IDs from the portion of 
        the ontology being queried, and columns are all leafs in portion of ontology being queried. 
        
        Dataframe is binary. For each parent node, element = 1 if parent node is an ancestor
        of corresponding leaf node.
        
    parent_dict : dictionary
        keys are internal_values and values are all cell ontology terms within the same distance
        from the top node. 
        
    cell_parent_mask : tensor
        tensor of shape ik, where i = parent IDs and k = each cell type in the dataset
        binary tensor where 1 means for that cell type, that parent ID will be included
        in the internal loss calculation
        and 0 means for that cell type, that parent ID is excluded in the internal loss
        calculation

    
    '''
    
    # select the labels. 
    labels = obs_metadata[target_column]
    
    # encode the target column
    #lb = LabelEncoder()
    #labels['encoded_labels'] = lb.fit_transform(labels[target_column])
    
    # we want to only encode the targets that are leafs. We will leave 
    # internal nodes as the CL number in order to assist with masking 
    # the appropriate parent nodes 
    # first, get list of all cell values
    all_cell_values = labels[target_column].unique().to_list()
    
    # identify which values are leafs
    # we use positive number for leaf values
    # and negative number for internal nodes
    mapping_dict = {}
    leaf_values = []
    internal_values = []
    encoded_leaf_val = 0
    encoded_internal_val = -9999
    for term in all_cell_values:
        if cl[term].is_leaf():
            mapping_dict[term] = encoded_leaf_val
            leaf_values.append(term)
            encoded_leaf_val += 1
        else:
            # check if internal values have associated sub-values in the dataset
            #    sub-values do not have to be leafs
            # if so, add value as internal values
            # if not, prune ontology so consider 
            term_subvalues = []
            # get leaf values of this term
            for sub_term in cl[term].subclasses(distance=None,with_self=False).to_set():
                    term_subvalues.append(sub_term.id)
            
            # get values in all_call_values in term_leafs
            intersection_list = list(set(all_cell_values).intersection(term_subvalues))
            if len(intersection_list) == 0:
                mapping_dict[term] = encoded_leaf_val
                leaf_values.append(term)
                encoded_leaf_val += 1
            else:
                mapping_dict[term] = encoded_internal_val
                internal_values.append(term)
                encoded_internal_val += 1            
            
            
    # use the leaf_mapping_dict to 
    labels['encoded_labels'] = labels[target_column].map(mapping_dict)
    
    x_data = adata.X.copy()
    y_data = labels['encoded_labels']
    
    #########
    # now get a list of all parent nodes for each value in the dataset
    # if we want to include leafs, set with_self= True
    # else, set with_self = False
    
    all_parent_nodes = []
    for target in all_cell_values:
        for term in cl[target].superclasses(distance=None,with_self=include_leafs).to_set():
            all_parent_nodes.append(term.id)
            #if target == 'CL:0000904':
            #    print(term)
            
    # ensure that we do not have duplicate values
    all_parent_nodes = list(set(all_parent_nodes))

    # select only the Cell Ontology IDs if cl_only = True
    if cl_only:
        all_parent_nodes = [x for x in all_parent_nodes if x.startswith('CL')]
    
    # if there is an upper limit, 
    if upper_limit is not None:
        # get upper limit nodes
        upper_limit_nodes = []
        for term in cl[upper_limit].superclasses(distance=None,with_self=False).to_set():
            upper_limit_nodes.append(term.id)

        # remove these nodes from the parent_nodes list
        all_parent_nodes = [x for x in all_parent_nodes if x not in upper_limit_nodes]
        
    # create a dictionary that maps parents to reduce the ontology_df when
    # dealing with internal nodes
    #parent_dict = set_internal_node_relationships_by_depth(internal_values,upper_limit,all_parent_nodes)
    parent_dict = set_internal_node_values(internal_values,all_parent_nodes)
    
    # create the dataframe
    # use all_cell_values for the columns, because we need both leafs and
    # internals nodes for mapping
    ontology_df = pd.DataFrame(data=0, index = all_parent_nodes,
                                              columns = all_cell_values)
    
    # populate the dataframe with 1 if column is a sub-node 
    # for that particular cell ID
    # with_self = True because we need to include the leafs here
    for cell_id in ontology_df.index:
        for term in cl[cell_id].subclasses(distance=None,with_self=True).to_set():
            if term.id in ontology_df.columns:
                ontology_df.loc[cell_id,[term.id]] = [1]

    # create a dictionary that maps parents to reduce the ontology_df when
    # dealing with internal nodes
    #parent_dict = {}
    #for parent in internal_values:
    #    super_parent_list = []
    #    for term in cl[parent].superclasses(distance=None,with_self=True).to_set():
    #         if term.id in all_parent_nodes:
    #            super_parent_list.append(term.id)
    #    parent_dict[parent] = super_parent_list

    # build a matrix used to mask parent values
    cell_parent_mask = build_parent_mask(leaf_values,internal_values,ontology_df,parent_dict)
    
    return(mapping_dict, leaf_values, internal_values, ontology_df, parent_dict, cell_parent_mask)




## Main Loop For Preprocessing Data


In [28]:
cell_obs_metadata[target_column]

0          CL:0000763
1          CL:0000763
2          CL:0000763
3          CL:0000763
4          CL:0000542
              ...    
2726024    CL:0000860
2726025    CL:0000860
2726026    CL:0000860
2726027    CL:0000940
2726028    CL:0000623
Name: cell_type_ontology_term_id, Length: 2726029, dtype: object

In [27]:
target_column = 'cell_type_ontology_term_id'

upper_limit = 'CL:0000988' # leukocyte = 738, hematopoietic = 988

print('start preprocess data and ontology')
mapping_dict, leaf_values,internal_values, \
    ontology_df, parent_dict, cell_parent_mask =  preprocess_data_ontology(cell_obs_metadata, target_column,
                                                                           upper_limit = upper_limit, 
                                                                 cl_only = True, include_leafs = False)

###del adata

# create dataframe that only includes leaf nodes
ontology_leaf_df = ontology_df[leaf_values]


print('Preprocessing complete. There are {0} leaf values and {1} internal values.'.format(len(leaf_values),len(internal_values)
                                                                                         ))
print('There are {0} cells in the training set and {1} cells in the validation set, both contain {2} genes.'.format(X_train.shape[0],X_val.shape[0],X_train.shape[1]))


start preprocess data and ontology


KeyError: 'cell_type_ontology_term_id'