# Building New Models to Benchmark Against Celltypist
In this notebook, I create and train several models that are based on, or built with, the CellTypist model. Each model is trained on several different data sets. The trained models are then evaluated, both on the data they were trained on and on other data sets, in the 'Benchmarking Models' notebook. 

### Order of Models: 
0. Train the basic CellTypist model on this data for easy comparison with other models later
1. Remove the feature selection from CellTypist (so it only trains the model once)
2. Train the model with L1 regularization instead of L2
3. Train the model only once with only Cytopus genes
4. At the feature selection step, make sure the Cytopus genes are included in the list of top genes
5. Combine models 2 & 4. Use L1 during first step, then make sure Cytopus genes are included, then switch back to L2 regularization

### Data Used: 
CT_45 
- from Conde et al. 2022 ('Cross-tissue immune cell analysis reveals tissue-specific features in humans')
- CountAdded_PIP_global_object_for_cellxgene.h5ad
- models trained on this data are saved as 'ct_model_#.pkl'

CT_98
- see CT_45
- CellTypist_Immune_Reference_v2_count.h5ad
- models trained on this data are saved as '98_model_#.pkl'

COV_PBMC
- haniffa21.processed.h5ad
- models trained on this are saved as 'COV_model_#.pkl'

Glasner
- from Glasner et al. 2023 ('Conserved transcriptional connectivity of regulatory T cells in the tumor microenvironment informs new combination cancer therapy strategies')
- glasner_etal_globalAnndata_20230112.vHTA.h5ad + annotations from 'ad_endo_LS_20211026.results.h5ad', 'ad_fib_scranLogNorm_filt_20220113.h5ad', 'glasner_ad_myeloid_celltypist_20230606.h5ad'
- models trained on this are saved as 'g_model_#.pkl'

HBCA 
- from Kumar et al. 2023 ('A spatially resolved single-cell genomic atlas of the adult human breast')
- local.h5ad (renamed Kumar2023_breast.h5ad)
- models produced are saved as 'HBCA_model_#.pkl'

In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import numpy as np
import itertools
from anndata import AnnData
from scipy.sparse import spmatrix
from datetime import datetime
from typing import Optional, Union
from sklearn import __version__ as skv

import cytopus as cp

from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

import celltypist as ct #if its throwing an error with sklearn, install scikit-learn version 1.1.0 & that should fix
from celltypist import logger 
from celltypist.models import Model

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


## Functions 
From celltypist/train.py with some edits/additions

In [2]:
def _to_vector(_vector_or_file):
    """
    For internal use. Turn a file into an array.
    """
    if isinstance(_vector_or_file, str):
        try:
            return pd.read_csv(_vector_or_file, header=None)[0].values
        except Exception as e:
            raise Exception(
                    f"🛑 {e}")
    else:
        return _vector_or_file

def _to_array(_array_like) -> np.ndarray:
    """
    For internal use. Turn an array-like object into an array.
    """
    if isinstance(_array_like, pd.DataFrame):
        return _array_like.values
    elif isinstance(_array_like, spmatrix):
        return _array_like.toarray()
    elif isinstance(_array_like, np.matrix):
        return np.array(_array_like)
    elif isinstance(_array_like, np.ndarray):
        return _array_like
    else:
        raise TypeError(
                f"🛑 Please provide a valid array-like object as input")

def _prepare_data(X, labels, genes, transpose) -> tuple:
    """
    For internal use. Prepare data for celltypist training.
    """
    if (X is None) or (labels is None):
        raise Exception(
                "🛑 Missing training data and/or training labels. Please provide both arguments")
    if isinstance(X, AnnData) or (isinstance(X, str) and X.endswith('.h5ad')):
        adata = sc.read(X) if isinstance(X, str) else X
        adata.var_names_make_unique()
        if adata.X.min() < 0:
            logger.info("👀 Detected scaled expression in the input data, will try the .raw attribute")
            try:
                indata = adata.raw.X
                genes = adata.raw.var_names
            except Exception as e:
                raise Exception(
                        f"🛑 Fail to use the .raw attribute in the input object. {e}")
        else:
            indata = adata.X
            genes = adata.var_names
        if isinstance(labels, str) and (labels in adata.obs):
            labels = adata.obs[labels]
        else:
            labels = _to_vector(labels)
    elif isinstance(X, str) and X.endswith(('.csv', '.txt', '.tsv', '.tab', '.mtx', '.mtx.gz')):
        adata = sc.read(X)
        if transpose:
            adata = adata.transpose()
        if X.endswith(('.mtx', '.mtx.gz')):
            if genes is None:
                raise Exception(
                        "🛑 Missing `genes`. Please provide this argument together with the input mtx file")
            genes = _to_vector(genes)
            if len(genes) != adata.n_vars:
                raise ValueError(
                        f"🛑 The number of genes provided does not match the number of genes in {X}")
            adata.var_names = np.array(genes)
        adata.var_names_make_unique()
        if not float(adata.X.max()).is_integer():
            logger.warn(f"⚠️ Warning: the input file seems not a raw count matrix. The trained model may be biased")
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        indata = adata.X
        genes = adata.var_names
        labels = _to_vector(labels)
    elif isinstance(X, str):
        raise ValueError(
                "🛑 Invalid input. Supported types: .csv, .txt, .tsv, .tab, .mtx, .mtx.gz and .h5ad")
    else:
        logger.info("👀 The input training data is processed as an array-like object")
        indata = X
        if transpose:
            indata = indata.transpose()
        if isinstance(indata, pd.DataFrame):
            genes = indata.columns
        else:
            if genes is None:
                raise Exception(
                        "🛑 Missing `genes`. Please provide this argument together with the input training data")
            genes = _to_vector(genes)
        labels = _to_vector(labels)
    return indata, labels, genes

def _SGDClassifier(indata, labels,
                   alpha, max_iter, n_jobs,
                   mini_batch, batch_number, batch_size, epochs, balance_cell_type, penalty , **kwargs) -> SGDClassifier:
    """
    For internal use. Fit an SGD logistic classifier, either in one go or in mini-batches.

    Parameters
    ----------
    indata
        Training matrix, indexed by cell along the first axis.
    labels
        Cell type label of each cell. Must support indexing with an integer array
        when `mini_batch` is `True`.
    alpha, max_iter, n_jobs
        Passed straight to :class:`~sklearn.linear_model.SGDClassifier`.
    mini_batch
        If `False`, a single `fit` call is made. If `True`, the classifier is trained
        with `partial_fit` on `batch_number` batches of `batch_size` cells per epoch.
    batch_number, batch_size, epochs
        Mini-batch settings (used only when `mini_batch` is `True`).
    balance_cell_type
        If `True`, cells are sampled with a probability inversely proportional to
        the frequency of their cell type.
    penalty
        ONE NEW ARG. Allows the user to specify what type of regularization to use.
    **kwargs
        Other keyword arguments passed to :class:`~sklearn.linear_model.SGDClassifier`.

    Raises
    ------
    ValueError
        In mini-batch mode, if the number of cells does not exceed `batch_size`.
    """
    #the logistic loss is named differently depending on the installed scikit-learn version
    loss_mode = 'log_loss' if float(skv[:3]) >= 1.1 else 'log'
    classifier = SGDClassifier(loss = loss_mode, penalty = penalty, alpha = alpha, max_iter = max_iter, n_jobs = n_jobs, **kwargs)
    if not mini_batch:
        logger.info(f"🏋️ Training data using SGD logistic regression")
        if (len(labels) > 100000) and (indata.shape[1] > 10000):
            logger.warn(f"⚠️ Warning: it may take a long time to train this dataset with {len(labels)} cells and {indata.shape[1]} genes, try to downsample cells and/or restrict genes to a subset (e.g., hvgs)")
        classifier.fit(indata, labels)
        return classifier

    logger.info(f"🏋️ Training data using mini-batch SGD logistic regression")
    no_cells = len(labels)
    if no_cells < 10000:
        logger.warn(f"⚠️ Warning: the number of cells ({no_cells}) is not big enough to conduct a proper mini-batch training. You may consider using traditional SGD classifier (mini_batch = False)")
    if no_cells <= batch_size:
        raise ValueError(
                f"🛑 Number of cells ({no_cells}) is fewer than the batch size ({batch_size}). Decrease `batch_size`, or use SGD directly (mini_batch = False)")
    no_cells_sample = min([batch_number*batch_size, no_cells])
    starts = np.arange(0, no_cells_sample, batch_size)
    #the set of classes never changes, so compute it once here rather than
    #once per mini-batch inside the training loop
    celltypes, celltype_counts = np.unique(labels, return_counts = True)
    #sampling weights: None means uniform sampling
    p = None
    if balance_cell_type:
        #every cell type receives the same total probability mass
        mapping = pd.Series(1 / (celltype_counts*len(celltypes)), index = celltypes)
        p = mapping[labels].values
    for epoch in range(1, (epochs+1)):
        logger.info(f"⏳ Epochs: [{epoch}/{epochs}]")
        sampled_cell_index = np.random.choice(no_cells, no_cells_sample, replace = False, p = p)
        for start in starts:
            batch_index = sampled_cell_index[start:start+batch_size]
            classifier.partial_fit(indata[batch_index], labels[batch_index], classes = celltypes)
    return classifier

def train_1(X = None,
          labels: Optional[Union[str, list, tuple, np.ndarray, pd.Series, pd.Index]] = None,
          genes: Optional[Union[str, list, tuple, np.ndarray, pd.Series, pd.Index]] = None,
          transpose_input: bool = False,
          with_mean: bool = True,
          check_expression: bool = True,
          #LR param
          C: float = 1.0, solver: Optional[str] = None, max_iter: Optional[int] = None, n_jobs: Optional[int] = None,
          #SGD param
          use_SGD: bool = False, alpha: float = 0.0001,
          #mini-batch
          mini_batch: bool = False, batch_number: int = 100, batch_size: int = 1000, epochs: int = 10, balance_cell_type: bool = False,
          #feature selection
          feature_selection: bool = False, top_genes: int = 300, use_cytopus: bool = False, cyto_genes: Optional[np.ndarray] = None,
          #description
          date: str = '', details: str = '', url: str = '', source: str = '', version: str = '',
          #penalty
          penalty: str = 'l2', switch_penalty: bool = False,
          #other param
          **kwargs) -> Model: 
    """
    Train a celltypist model using mini-batch (optional) logistic classifier with a global solver or stochastic gradient descent (SGD) learning.
    A version of the celltypist function `train` with four additional parameters
    (`penalty`, `switch_penalty`, `use_cytopus` and `cyto_genes`).

    Parameters
    ----------
    X
        Path to the input count matrix (supported types are csv, txt, tsv, tab and mtx) or AnnData (h5ad).
        Also accepts the input as an :class:`~anndata.AnnData` object, or any array-like objects already loaded in memory.
        See `check_expression` for detailed format requirements.
        A cell-by-gene format is desirable (see `transpose_input` for more information).
    labels
        Path to the file containing cell type label per line corresponding to the cells in `X`.
        Also accepts any list-like objects already loaded in memory (such as an array).
        If `X` is specified as an AnnData, this argument can also be set as a column name from cell metadata.
    genes
        Path to the file containing one gene per line corresponding to the genes in `X`.
        Also accepts any list-like objects already loaded in memory (such as an array).
        Note `genes` will be extracted from `X` where possible (e.g., `X` is an AnnData or data frame).
    transpose_input
        Whether to transpose the input matrix. Set to `True` if `X` is provided in a gene-by-cell format.
        (Default: `False`)
    with_mean
        Whether to subtract the mean values during data scaling. Setting to `False` can lower the memory usage when the input is a sparse matrix but may slightly reduce the model performance.
        (Default: `True`)
    check_expression
        Check whether the expression matrix in the input data is supplied as required.
        Except the case where a path to the raw count table file is specified, all other inputs for `X` should be in log1p normalized expression to 10000 counts per cell.
        Set to `False` if you want to train the data regardless of the expression formats.
        (Default: `True`)
    C
        Inverse of L2 regularization strength for traditional logistic classifier. A smaller value can possibly improve model generalization while at the cost of decreased accuracy.
        This argument is ignored if SGD learning is enabled (`use_SGD = True`).
        (Default: 1.0)
    solver
        Algorithm to use in the optimization problem for traditional logistic classifier.
        The default behavior is to choose the solver according to the size of the input data.
        This argument is ignored if SGD learning is enabled (`use_SGD = True`).
    max_iter
        Maximum number of iterations before reaching the minimum of the cost function.
        Try to decrease `max_iter` if the cost function does not converge for a long time.
        This argument is for both traditional and SGD logistic classifiers, and will be ignored if mini-batch SGD training is conducted (`use_SGD = True` and `mini_batch = True`).
        Default to 200, 500, and 1000 for large (>500k cells), medium (50-500k), and small (<50k) datasets, respectively.
    n_jobs
        Number of CPUs used. Default to one CPU. `-1` means all CPUs are used.
        This argument is for both traditional and SGD logistic classifiers.
    use_SGD
        Whether to implement SGD learning for the logistic classifier.
        (Default: `False`)
    alpha
        Regularization strength for SGD logistic classifier (the type of regularization is set by `penalty`). A larger value can possibly improve model generalization while at the cost of decreased accuracy.
        This argument is ignored if SGD learning is disabled (`use_SGD = False`).
        (Default: 0.0001)
    mini_batch
        Whether to implement mini-batch training for the SGD logistic classifier.
        Setting to `True` may improve the training efficiency for large datasets (for example, >100k cells).
        This argument is ignored if SGD learning is disabled (`use_SGD = False`).
        (Default: `False`)
    batch_number
        The number of batches used for training in each epoch. Each batch contains `batch_size` cells.
        For datasets which cannot be binned into `batch_number` batches, all batches will be used.
        This argument is relevant only if mini-batch SGD training is conducted (`use_SGD = True` and `mini_batch = True`).
        (Default: 100)
    batch_size
        The number of cells within each batch.
        This argument is relevant only if mini-batch SGD training is conducted (`use_SGD = True` and `mini_batch = True`).
        (Default: 1000)
    epochs
        The number of epochs for the mini-batch training procedure.
        The default values of `batch_number`, `batch_size`, and `epochs` together allow observing ~10^6 training cells.
        This argument is relevant only if mini-batch SGD training is conducted (`use_SGD = True` and `mini_batch = True`).
        (Default: 10)
    balance_cell_type
        Whether to balance the cell type frequencies in mini-batches during each epoch.
        Setting to `True` will sample rare cell types with a higher probability, ensuring close-to-even cell type distributions in mini-batches.
        This argument is relevant only if mini-batch SGD training is conducted (`use_SGD = True` and `mini_batch = True`).
        (Default: `False`)
    feature_selection
        Whether to perform two-pass data training where the first round is used for selecting important features/genes using SGD learning.
        If `True`, the training time will be longer.
        (Default: `False`)
    top_genes
        The number of top genes selected from each class/cell-type based on their absolute regression coefficients.
        The final feature set is combined across all classes (i.e., union).
        (Default: 300)
    use_cytopus
        NEW. Whether to make sure the cytopus genes are part of the selected features (they are added if not).
        This argument is relevant only if feature selection happens (`feature_selection = True`).
        (Default: `False`)
    cyto_genes
        NEW. List of gene names from the cytopus cell identities dictionary. Names not found among the training genes are ignored.
        This argument is relevant only if feature selection with cytopus genes happens (`feature_selection = True` and `use_cytopus = True`), in which case it is required.
    date
        Free text of the date of the model. Default to the time when the training is completed.
    details
        Free text of the description of the model.
    url
        Free text of the (possible) download url of the model.
    source
        Free text of the source (publication, database, etc.) of the model.
    version
        Free text of the version of the model.
    penalty
        NEW. Which regularization method the SGD classifier uses.
        (Default: "l2")
    switch_penalty
        NEW. Whether to switch the type of regularization for the second round of model training:
        "l2" becomes "l1", and any other value becomes "l2".
        (Default: `False`)
    **kwargs
        Other keyword arguments passed to :class:`~sklearn.linear_model.LogisticRegression` (`use_SGD = False`) or :class:`~sklearn.linear_model.SGDClassifier` (`use_SGD = True`).

    Returns
    ----------
    :class:`~celltypist.models.Model`
        An instance of the :class:`~celltypist.models.Model` trained by celltypist.

    Raises
    ----------
    ValueError
        If the expression check fails, if the labels/genes do not match the shape of the data,
        if there are not more genes than `top_genes` when feature selection is requested,
        or if `use_cytopus = True` is combined with feature selection but `cyto_genes` is missing.
    """
    #fail before any expensive work if the cytopus gene list is required but missing
    if feature_selection and use_cytopus and cyto_genes is None:
        raise ValueError(
                "🛑 Missing `cyto_genes`. Please provide this argument when `use_cytopus = True`")
    #prepare
    logger.info("🍳 Preparing data before training")
    indata, labels, genes = _prepare_data(X, labels, genes, transpose_input)
    if isinstance(indata, pd.DataFrame):
        indata = indata.values
    elif with_mean and isinstance(indata, spmatrix):
        #centering needs a dense matrix
        indata = indata.toarray()
    labels = np.array(labels)
    genes = np.array(genes)
    #check
    #TODO: the expected total is fixed at 10000; change it to the median library size
    if check_expression and (np.abs(np.expm1(indata[0]).sum()-10000) > 1):
        raise ValueError(
                "🛑 Invalid expression matrix, expect log1p normalized expression to 10000 counts per cell")
    if len(labels) != indata.shape[0]:
        raise ValueError(
                f"🛑 Length of training labels ({len(labels)}) does not match the number of input cells ({indata.shape[0]})")
    if len(genes) != indata.shape[1]:
        raise ValueError(
                f"🛑 The number of genes ({len(genes)}) provided does not match the number of genes in the training data ({indata.shape[1]})")
    #filter: only `genes` is subset here, the matrix is subset when it is handed to the scaler
    flag = indata.sum(axis = 0) == 0
    if isinstance(flag, np.matrix):
        flag = flag.A1
    if flag.sum() > 0:
        logger.info(f"✂️ {flag.sum()} non-expressed genes are filtered out")
        genes = genes[~flag]
    #feature selection needs more genes than `top_genes`; check before the first round of training
    if feature_selection and len(genes) <= top_genes:
        raise ValueError(
                f"🛑 The number of genes ({len(genes)}) is fewer than the `top_genes` ({top_genes}). Unable to perform feature selection")
    #report data stats
    logger.info(f"🔬 Input data has {indata.shape[0]} cells and {(~flag).sum()} genes")
    #scaler
    logger.info(f"⚖️ Scaling input data")
    scaler = StandardScaler(with_mean = with_mean)
    indata = scaler.fit_transform(indata[:, ~flag] if flag.sum() > 0 else indata)
    #cap scaled values at 10
    indata[indata > 10] = 10
    #sklearn (Cython) does not support very large sparse matrices for the time being
    if isinstance(indata, spmatrix) and ((indata.indices.dtype == 'int64') or (indata.indptr.dtype == 'int64')):
        indata = indata.toarray()
    #max_iter
    if max_iter is None:
        if indata.shape[0] < 50000:
            max_iter = 1000
        elif indata.shape[0] < 500000:
            max_iter = 500
        else:
            max_iter = 200
    #classifier
    #NOTE(review): `_LRClassifier` is not defined in the part of this file that is visible here;
    #confirm it is available before using `use_SGD = False`
    if use_SGD or feature_selection:
        classifier = _SGDClassifier(indata = indata, labels = labels, alpha = alpha, max_iter = max_iter, n_jobs = n_jobs, mini_batch = mini_batch, batch_number = batch_number, batch_size = batch_size, epochs = epochs, balance_cell_type = balance_cell_type, penalty = penalty, **kwargs)
    else:
        classifier = _LRClassifier(indata = indata, labels = labels, C = C, solver = solver, max_iter = max_iter, n_jobs = n_jobs, **kwargs)
    #feature selection -> new classifier and scaler
    if feature_selection:
        logger.info(f"🔎 Selecting features")
        #per class, the `top_genes` genes with the largest absolute coefficients; then the union over classes
        gene_index = np.argpartition(np.abs(classifier.coef_), -top_genes, axis = 1)[:, -top_genes:]
        gene_index = np.unique(gene_index)
        if use_cytopus: 
            logger.info(f"🧬 {len(gene_index)} features are selected pre cytopus")
            #make sure every cytopus gene present in the data is part of the feature set
            gene_index = _add_required_genes(gene_index, genes, cyto_genes)
            logger.info(f"🧬 {len(gene_index)} features are selected after cytopus")
        else:
            logger.info(f"🧬 {len(gene_index)} features are selected")
        genes = genes[gene_index]
        logger.info(f"🏋️ Starting the second round of training")
        if switch_penalty: 
            penalty = "l1" if penalty == "l2" else "l2"
        if use_SGD:
            classifier = _SGDClassifier(indata = indata[:, gene_index], labels = labels, alpha = alpha, max_iter = max_iter, n_jobs = n_jobs, mini_batch = mini_batch, batch_number = batch_number, batch_size = batch_size, epochs = epochs, balance_cell_type = balance_cell_type, penalty = penalty, **kwargs)
        else:
            classifier = _LRClassifier(indata = indata[:, gene_index], labels = labels, C = C, solver = solver, max_iter = max_iter, n_jobs = n_jobs, **kwargs)
        #restrict the fitted scaler to the selected genes so it matches the final classifier
        scaler.mean_ = scaler.mean_[gene_index]
        scaler.var_ = scaler.var_[gene_index]
        scaler.scale_ = scaler.scale_[gene_index]
        scaler.n_features_in_ = len(gene_index)
    #model finalization
    classifier.features = genes
    classifier.n_features_in_ = len(genes)
    if not date:
        date = str(datetime.now())
    description = {'date': date, 'details': details, 'url': url, 'source': source, 'version': version, 'number_celltypes': len(classifier.classes_)}
    logger.info(f"✅ Model training done!")
    return Model(classifier, scaler, description)

def _add_required_genes(gene_index, genes, required_genes) -> np.ndarray:
    """
    For internal use. Return the sorted union of `gene_index` and the positions in `genes`
    of every name in `required_genes`; names that do not occur in `genes` are skipped.
    """
    #position of the first occurrence of each gene name, built once for constant-time lookups
    first_position = {}
    for position, gene in enumerate(genes):
        first_position.setdefault(gene, position)
    required_index = [first_position[gene] for gene in required_genes if gene in first_position]
    return np.union1d(gene_index, np.asarray(required_index, dtype = np.intp))

In [3]:
def train_test_split(adata, frac: float = 0.7, seed: Optional[int] = None):
    """
    USING OUTLINE OF CODE FROM trVAE https://doi.org/10.1093/bioinformatics/btaa800
    Randomly split AnnData into train and test datasets - maintains annotations

    Params: 
    adata
        Annotated data matrix (Anndata). Only `adata.shape[0]` and indexing with an
        integer array are used.
    frac
        Fraction of cells to be used in the training set, between 0 and 1
        (Default: 0.7)
    seed
        Optional seed for a reproducible split. When `None`, the global NumPy
        random state is used.
        (Default: None)

    Returns:
    train, test
        The two subsets of `adata`; `train` holds `int(n_cells * frac)` cells and
        `test` holds the rest.

    Raises:
    ValueError
        If `frac` is not between 0 and 1.
    """
    if not 0 <= frac <= 1:
        raise ValueError(
                f"🛑 `frac` must be between 0 and 1, got {frac}")
    no_cells = adata.shape[0]
    no_idx_train = int(no_cells * frac)
    #a dedicated generator is only created when a seed is given, so unseeded calls
    #keep drawing from the global NumPy random state
    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = np.arange(no_cells)
    rng.shuffle(indices)
    train = adata[indices[:no_idx_train]]
    test = adata[indices[no_idx_train:]]
    return train, test

In [4]:
#fxn to make individual models 
def make_model(model_ver: int = 0, 
              X = None,
              labels: Optional[Union[str, list, tuple, np.ndarray, pd.Series, pd.Index]] = None,
              genes: Optional[Union[str, list, tuple, np.ndarray, pd.Series, pd.Index]] = None,
              check_expression: bool = False,
              cyto_genes: Optional[np.ndarray] = None,
              write_loc: str = 'New Models') -> Model: 
    """
    Train one of the benchmark model versions with `train_1` and save it to `write_loc`.

    model_ver 
        Which type of model to make (0-5). All versions use mini-batch SGD training.
        0: vanilla celltypist, 1: no feature selection, 2: L1 regularization,
        3: cytopus genes only (same settings as 1; the caller passes data restricted to cytopus genes),
        4: feature selection that always includes the cytopus genes,
        5: merge of 2 & 4 (L1 for the first round, cytopus genes included, other penalty for the second round)
        (Default: 0)
    X
        Path to the input count matrix (supported types are csv, txt, tsv, tab and mtx) or AnnData (h5ad).
        Also accepts the input as an :class:`~anndata.AnnData` object, or any array-like objects already loaded in memory.
        See `check_expression` for detailed format requirements.
    labels
        Path to the file containing cell type label per line corresponding to the cells in `X`.
        Also accepts any list-like objects already loaded in memory (such as an array).
        If `X` is specified as an AnnData, this argument can also be set as a column name from cell metadata.
    genes
        Path to the file containing one gene per line corresponding to the genes in `X`.
        Also accepts any list-like objects already loaded in memory (such as an array).
        Note `genes` will be extracted from `X` where possible (e.g., `X` is an AnnData or data frame).
    check_expression
        Check whether the expression matrix in the input data is supplied as required by celltypist.
        `X` should be in log1p normalized expression to 10000 counts per cell.
        (Default: `False`)
    cyto_genes
        (For models 4 and 5) A list of genes to make sure are included in feature selection
    write_loc
        Where to save the newly made model 
        (Default: 'New Models' - directory in GitHub)

    Returns the trained model. Raises a ValueError if `model_ver` is not one of 0-5,
    or if `cyto_genes` is missing for model 4 or 5.
    """
    #settings that differ between model versions; everything else is shared by all of them
    version_options = {
        0: dict(balance_cell_type = True, feature_selection = True),
        1: dict(),
        2: dict(feature_selection = True, penalty = "l1"),
        3: dict(),
        4: dict(feature_selection = True, balance_cell_type = True, use_cytopus = True, cyto_genes = cyto_genes),
        5: dict(feature_selection = True, penalty = "l1", switch_penalty = True, balance_cell_type = True, use_cytopus = True, cyto_genes = cyto_genes),
    }
    if model_ver not in version_options:
        raise ValueError(
                f"🛑 Unknown `model_ver` ({model_ver}). Supported versions: {sorted(version_options)}")
    options = version_options[model_ver]
    #reject a missing gene list here rather than after the first round of training
    if options.get('use_cytopus') and cyto_genes is None:
        raise ValueError(
                f"🛑 Missing `cyto_genes`. Please provide this argument for model version {model_ver}")
    model = train_1(X = X, labels = labels, genes = genes, check_expression = check_expression, use_SGD = True, mini_batch = True, **options)
    model.write(write_loc)
    return model 

In [5]:
def make_all_models(adata, annot_col, abrev,  percent_train: float = 0.7, check_expression: bool = False, data_dir: str = '/data/peer/adamsj5/cell_typing/train_test_data/', write_loc: str = 'New Models/', train_data: Optional[str] = None, cyto_genes = None):
    """
    This function makes the models defined for this cell typing benchmark for one particular dataset. 
    Models 0, 2, 3, 4 and 5 are trained and saved (model 1 is not built by this function); nothing is returned.

    adata - AnnData dataset of interest. 
        Make sure the data has gone through desired preprocessing and is in the format required (only gene counts (no protein), normalized to median library size (preferred) or 10,000 counts per cell, log transformed with pseudocount of 1, etc)
    annot_col - name of the column in adata that holds the ground-truth cell type annotation
    abrev - an abbreviation for this dataset to identify later; used in the names of all files written 
    percent_train - what fraction of the dataset you want to section off for training (the rest will be set aside as test data)
        (Default: 0.7)
    check_expression - whether or not to use CellTypist's check expression function 
        It will throw an error if the data is not normalized to 10,000 counts per cell and log transformed with a pseudocount of 1 
        (Default: False)
    data_dir - the directory to save the split train & test data; joined to the file name as is, so it should end with a path separator 
        (Default: '/data/peer/adamsj5/cell_typing/train_test_data/' - Jo Adams' directory on lilac/calcifer)
    write_loc - where to save the newly made models; joined to the file name as is 
        (Default: 'New Models/')
    train_data - Location of AnnData of interest that has already been split into train & test data. This is optional; it is to allow for the same dataset to be used for training if one already exists. When given, no new split is made or saved.
    cyto_genes - optional list of cytopus gene names. When None, the notebook-level list `cp_genes` is used.
        (Default: None)
    """
    #split into train & test, or reuse an existing training set
    if train_data is None:
        train, test = train_test_split(adata, percent_train)
        
        #save train/test split data
        test.write(data_dir+abrev+'_Test.h5ad')
        train.write(data_dir+abrev+'_Train.h5ad')
    else: 
        train = ad.read_h5ad(train_data)
        
    indata = train.X
    labels = train.obs[annot_col]
    genes = train.var_names
    
    #cytopus genes that are present in the training data; the training set is the
    #reference because it is what the models actually see (it may come from `train_data`)
    gene_pool = cp_genes if cyto_genes is None else cyto_genes
    cp_and_ct_genes = np.unique([x for x in gene_pool if x in train.var_names])
    
    #training data restricted to the cytopus genes (for model 3)
    train_cp = train[:, cp_and_ct_genes]
    indata_cp = train_cp.X
    labels_cp = train_cp.obs[annot_col]
    genes_cp = train_cp.var_names
   
    #actually make all the models: (version, data, labels, genes, cytopus genes)
    model_inputs = [
        (0, indata, labels, genes, None),
        (2, indata, labels, genes, None),
        (3, indata_cp, labels_cp, genes_cp, None),
        (4, indata, labels, genes, cp_and_ct_genes),
        (5, indata, labels, genes, cp_and_ct_genes),
    ]
    for ver, model_X, model_labels, model_genes, model_cyto_genes in model_inputs:
        make_model(model_ver = ver, X = model_X, labels = model_labels, genes = model_genes, check_expression = check_expression, cyto_genes = model_cyto_genes, write_loc = write_loc+abrev+f'_model_{ver}')
        print(f"Model {ver} Done")
    print("All Models Done")
    
    

## Data
Loading cytopus cell type dictionary

In [6]:
G = cp.kb.KnowledgeBase()
cell_dict = G.identities

#flatten the per-cell-type gene lists of the cytopus dict into one list, dropping NaNs
#the celltype information doesn't need to be retained since we're applying this gene list to all celltypes
cp_genes = [
    gene
    for gene in itertools.chain.from_iterable(cell_dict.values())
    if str(gene) != 'nan'
]

KnowledgeBase object containing 75 cell types and 201 cellular processes



### CT_45
Loading in data from celltypist

In [None]:
#load the CT_45 dataset; `read_h5ad` is used because `ad.read` is only a deprecated alias of it
#adatact_45 = ad.read_h5ad('../../Data/CountAdded_PIP_global_object_for_cellxgene.h5ad') #local location
adatact_45 = ad.read_h5ad('/data/peer/adamsj5/cell_typing/CountAdded_PIP_global_object_for_cellxgene.h5ad') #lilac location
#sc.pp.subsample(adata, n_obs = 75000)

In [None]:
#display the category labels of the "Manually_curated_celltype" annotation column
adatact_45.obs["Manually_curated_celltype"].cat.categories

In [None]:
#reuse the saved training split instead of making a new one, so every run trains on the same cells
#trainct_45, testct_45 = train_test_split(adatact_45, 0.3)
#`read_h5ad` is used because `ad.read` is only a deprecated alias of it
trainct_45 = ad.read_h5ad('/data/peer/adamsj5/cell_typing/train_test_data/CT_45_Train.h5ad')
indatact_45 = trainct_45.X
labelsct_45 = trainct_45.obs["Manually_curated_celltype"]
genesct_45 = trainct_45.var_names

Making a data table that only includes genes from cytopus (for model 3)

In [None]:
#sorted, de-duplicated cytopus genes that are also present in the CT_45 training data
cp_and_ct_genes_45 = np.unique(
    [gene for gene in cp_genes if gene in trainct_45.var_names]
)

In [None]:
#training data restricted to the cytopus genes, with its matrix, labels and gene names
trainct_45_cp = trainct_45[:, cp_and_ct_genes_45]
indatact_45_cp, labelsct_45_cp, genesct_45_cp = (
    trainct_45_cp.X,
    trainct_45_cp.obs["Manually_curated_celltype"],
    trainct_45_cp.var_names,
)

In [None]:
#the training split is always written; the test split only exists in this session when
#the `train_test_split` line of the loading cell was run instead of reading the saved
#training file, so it is only written when it is defined (avoids a NameError otherwise)
trainct_45.write_h5ad('/data/peer/adamsj5/cell_typing/train_test_data/CT_45_Train.h5ad') #lilac location
if 'testct_45' in globals():
    testct_45.write_h5ad('/data/peer/adamsj5/cell_typing/train_test_data/CT_45_Test.h5ad')

### CT_98
v2 of CellTypist training data that I believe the Immune_All models were trained on. 

In [None]:
# Full CT_98 dataset (raw counts). `ad.read_h5ad` replaces the deprecated
# `ad.read` alias.
#adatact_98 = ad.read_h5ad('../../Data/CellTypist_Immune_Reference_v2_count.h5ad') #local location
adatact_98 = ad.read_h5ad('/data/peer/adamsj5/cell_typing/CellTypist_Immune_Reference_v2_count.h5ad') #lilac location

This data is not normalized or transformed. In order to get it to a place where we can train the models, we need to normalize the counts from each cell and transform. CellTypist wants data normalized to 10,000 counts per cell. However, it has been shown that this is not the best technique (Ahlmann-Eltze & Huber, 2023). For our models, we will recommend normalizing to the median library size. You can override CellTypist's expected expression by setting the argument 'check_expression' in the train function to False. After normalizing, the standard is to log transform the data with a pseudocount of 1. 

In [None]:
#find the library size (total counts) of every cell; its median is the normalization target
# A single row-wise sum replaces slicing the AnnData object once per cell, and
# the number of cells is no longer hard-coded. `np.asarray(...).ravel()` turns
# the (n_cells, 1) result a sparse matrix returns into a flat array.
lib_size = np.asarray(adatact_98.X.sum(axis=1)).ravel()

In [None]:
# Median library size over all cells; it is compared against in the sanity
# check that follows the log transform. The commented line hard-codes the
# previously observed value so the library-size computation can be skipped.
med_ls = np.median(lib_size) #4725
#med_ls = 4725

In [None]:
# Scale every cell to the same total count. No target_sum is passed; per the
# scanpy documentation the default target is the median of the cells' total
# counts before normalization.
# NOTE(review): confirm this default agrees with med_ls as computed in this notebook.
sc.pp.normalize_total(adatact_98)

In [None]:
# Sum of the first row of the normalized matrix (quick check of the new total).
adatact_98.X[0].sum()

In [None]:
#log transform with a pseudocount of 1, i.e. log(1 + x)
adatact_98.X= np.log1p(adatact_98.X)

In [None]:
#check that the data looks ok (want this to be less than 1): 
# expm1 undoes the log1p for the first row; its sum is then compared to med_ls.
np.abs(np.expm1(adatact_98.X[0]).sum()-med_ls) 

In [None]:
# Load the saved CT_98 training split (uncomment the next line to make a new
# split instead). `ad.read_h5ad` replaces the deprecated `ad.read` alias.
#trainct_98, testct_98 = train_test_split(adatact_98, 0.2)
trainct_98 = ad.read_h5ad('/data/peer/adamsj5/cell_typing/train_test_data/CT_98_train.h5ad')
indatact_98 = trainct_98.X  # expression matrix passed to the models
labelsct_98 = trainct_98.obs["Harmonised_detailed_type"]  # cell type label per cell
genesct_98 = trainct_98.var_names  # gene names matching the columns of X

In [None]:
# Cytopus genes that are also present in the CT_98 training data, as a sorted
# array without duplicates.
cp_and_ct_genes_98 = np.unique(
    list(set(cp_genes).intersection(trainct_98.var_names))
)

In [None]:
# Keep only the Cytopus gene columns of the CT_98 training data, then pull out
# the matrix, labels and gene names of that subset.
trainct_98_cp = trainct_98[:, cp_and_ct_genes_98]
indatact_98_cp, labelsct_98_cp, genesct_98_cp = (
    trainct_98_cp.X,
    trainct_98_cp.obs["Harmonised_detailed_type"],
    trainct_98_cp.var_names,
)

In [None]:
# Save the CT_98 train/test split to disk.
# NOTE(review): testct_98 is only assigned by the train_test_split call that is
# commented out in the loading cell, so this cell raises a NameError unless a
# fresh split was made first. The train path is the same file trainct_98 is read from.
trainct_98.write_h5ad('/data/peer/adamsj5/cell_typing/train_test_data/CT_98_train.h5ad') #lilac location
testct_98.write_h5ad('/data/peer/adamsj5/cell_typing/train_test_data/CT_98_Test.h5ad') #lilac location

### COV_PBMC
The last 192 features in this matrix are antibodies, not genes. The matrix is normalized to the number of counts per gene, excluding the antibodies, and since we don't want to include antibodies in our model either, I will be removing those columns. 

In [7]:
# Full COV_PBMC dataset. `ad.read_h5ad` replaces the deprecated `ad.read` alias.
#adata_COV = ad.read_h5ad('../../Data/haniffa21.processed.h5ad') #local location
adata_COV = ad.read_h5ad('/data/peer/adamsj5/cell_typing/haniffa21.processed.h5ad') #lilac location

In [8]:
#remove antibody columns
# Column positions of every feature whose name does not contain 'AB_', then the
# matching names, then the column subset of the dataset.
rna_col_id = [pos for pos, name in enumerate(adata_COV.var_names) if 'AB_' not in name]
rna_only = [adata_COV.var_names[pos] for pos in rna_col_id]
adata_COV = adata_COV[:, np.asarray(rna_col_id)]

In [None]:
# Load the saved COV_PBMC training split (uncomment the next line to make a new
# split instead). `ad.read_h5ad` replaces the deprecated `ad.read` alias.
#train_COV, test_COV = train_test_split(adata_COV, 0.2)
train_COV = ad.read_h5ad('/data/peer/adamsj5/cell_typing/train_test_data/COV_Train.h5ad')
indata_COV = train_COV.X  # expression matrix passed to the models
labels_COV = train_COV.obs["full_clustering"]  # cell type label per cell
genes_COV = train_COV.var_names  # gene names matching the columns of X

In [None]:
# Cytopus genes that are also present in the COV_PBMC training data, as a
# sorted array without duplicates.
cp_and_ct_genes_COV = np.unique(
    list(set(cp_genes).intersection(train_COV.var_names))
)

In [None]:
# Keep only the Cytopus gene columns of the COV_PBMC training data, then pull
# out the matrix, labels and gene names of that subset.
train_COV_cp = train_COV[:, cp_and_ct_genes_COV]
indata_COV_cp, labels_COV_cp, genes_COV_cp = (
    train_COV_cp.X,
    train_COV_cp.obs["full_clustering"],
    train_COV_cp.var_names,
)

In [None]:
# Save the COV_PBMC train/test split to disk.
# NOTE(review): test_COV is only assigned by the train_test_split call that is
# commented out in the loading cell, so this cell raises a NameError unless a
# fresh split was made first.
# NOTE(review): the training split is read from 'COV_Train.h5ad' but written
# here to 'train_COV.h5ad' -- confirm which file name is intended.
train_COV.write('/data/peer/adamsj5/cell_typing/train_test_data/train_COV.h5ad')# lilac location
test_COV.write('/data/peer/adamsj5/cell_typing/train_test_data/test_COV.h5ad')# lilac location

### Glasner

This dataset combines the cell type labels from 4 datasets: the overall coarsely annotated data, finely annotated endothelial cell data, finely annotated fibroblast data, and finely annotated myeloid cell data. The coarse dataset is the "base" that the other annotations were added to. Because most immune cell types only have very high-level labels here, and cytopus only contains information about immune cells, at a much higher resolution of labels, I won't make models 3 & 4 for this data, which rely on that information.  

Additionally, this dataset is normalized to the median library size and then log transformed with a pseudocount of 0.1 (not 1). 

##### To integrate cell type labels

In [None]:
# Coarsely annotated Glasner dataset. `ad.read_h5ad` replaces the deprecated
# `ad.read` alias.
#adata_g = ad.read_h5ad('../../Data/glasner_etal_globalAnndata_20230112.vHTA.h5ad') #annotations too coarse, local location
adata_g = ad.read_h5ad('../../Data/cell_typing/glasner_etal_globalAnndata_20230112.vHTA.h5ad') #annotations too coarse, lilac location

In [None]:
# Display the loaded object.
adata_g

In [None]:
# Re-index the gene table by its 'gene_name' column.
adata_g.var = adata_g.var.set_index('gene_name')

In [None]:
# Finely annotated endothelial, fibroblast and myeloid datasets.
# `ad.read_h5ad` replaces the deprecated `ad.read` alias.
#adata_g_endo = ad.read_h5ad('../../Data/ad_endo_LS_20211026.results.h5ad') #local location
#adata_g_fib = ad.read_h5ad('../../Data/ad_fib_scranLogNorm_filt_20220113.h5ad') #local location
#adata_g_myl = ad.read_h5ad('../../Data/glasner_ad_myeloid_celltypist_20230606.h5ad') #local location

adata_g_endo = ad.read_h5ad('../../Data/cell_typing/ad_endo_LS_20211026.results.h5ad') #lilac location
adata_g_fib = ad.read_h5ad('../../Data/cell_typing/ad_fib_scranLogNorm_filt_20220113.h5ad') #lilac location
adata_g_myl = ad.read_h5ad('../../Data/cell_typing/glasner_ad_myeloid_celltypist_20230606.h5ad') #lilac location

In [None]:
# Work on a copy so the finer labels are added without modifying adata_g.
adata_glas = adata_g.copy()

In [None]:
def _first_label_per_cell(adata, column):
    """Map every cell name in ``adata`` to its value in ``adata.obs[column]``.

    If a cell name occurs more than once, the first occurrence wins.
    """
    lookup = {}
    for cell_name, label in zip(adata.obs_names, adata.obs[column]):
        lookup.setdefault(cell_name, label)
    return lookup


# Fixes relative to the previous version of this cell:
#   * the endothelial branch was missing a closing parenthesis (SyntaxError);
#   * the endothelial lookup sliced the gene axis (`[:, endo_idx]`) instead of
#     the cell axis, so it could not return the label of the matched cell;
#   * every cell triggered an `np.where` scan and an AnnData slice; labels are
#     now read through dictionaries built once.

# Coarse lineage label of every cell in the base dataset.
coarse_labels = _first_label_per_cell(adata_glas, "cell_lineage")

# Finer annotations, listed in the order in which they take precedence when a
# cell is present in more than one of the finely annotated datasets.
fine_label_sources = [
    _first_label_per_cell(adata_g_endo, "granular_cell_type"),
    _first_label_per_cell(adata_g_myl, "cell_type"),
    _first_label_per_cell(adata_g_fib, "granular_cell_type"),
]

finer_cell_types = []
orig_cell_types = []

for x in adata_g.obs_names:
    coarse = coarse_labels[x]
    orig_cell_types.append(coarse)
    # Take the first fine annotation that contains this cell; cells without a
    # finer annotation keep their coarse lineage label.
    fine = next((labels[x] for labels in fine_label_sources if x in labels), coarse)
    finer_cell_types.append(fine)

In [None]:
# Store both label lists as new columns of the cell annotation table.
adata_glas.obs["finer_cell_types"] = finer_cell_types
adata_glas.obs["orig_cell_types"] = orig_cell_types

In [None]:
#confirm that they're in generally the right order
# Compares the stored coarse labels with the coarse labels collected in the
# loop; average = None requests one score per class instead of a single average.
f1_score(adata_glas.obs["cell_lineage"], adata_glas.obs["orig_cell_types"], average = None)

In [None]:
# Save the dataset together with the added label columns.
adata_glas.write('../../Data/cell_typing/glasner_fine_annot.h5ad')

##### More granular dataset: 

In [None]:
# Glasner dataset with the finer annotations already integrated.
# `ad.read_h5ad` replaces the deprecated `ad.read` alias.
adata_glas = ad.read_h5ad('/data/peer/adamsj5/cell_typing/glasner_fine_annot.h5ad') #lilac location

In [None]:
# Load the saved Glasner training split (uncomment the next line to make a new
# split instead). `ad.read_h5ad` replaces the deprecated `ad.read` alias.
#train_glas, test_glas = train_test_split(adata_glas)
train_glas = ad.read_h5ad('/data/peer/adamsj5/cell_typing/train_glas.h5ad')
indata_glas = train_glas.X  # expression matrix passed to the models
labels_glas = train_glas.obs['finer_cell_types']  # cell type label per cell
genes_glas = train_glas.var_names  # gene names matching the columns of X

In [None]:
# Cytopus genes that are also present in the Glasner training data, as a sorted
# array without duplicates.
cp_and_ct_genes_glas = np.unique(
    list(set(cp_genes).intersection(train_glas.var_names))
)

In [None]:
# Keep only the Cytopus gene columns of the Glasner training data, then pull
# out the matrix, labels and gene names of that subset.
train_glas_cp = train_glas[:, cp_and_ct_genes_glas]
indata_glas_cp, labels_glas_cp, genes_glas_cp = (
    train_glas_cp.X,
    train_glas_cp.obs["finer_cell_types"],
    train_glas_cp.var_names,
)

In [None]:
# Save the Glasner train/test split to disk.
# NOTE(review): test_glas is only assigned by the train_test_split call that is
# commented out in the loading cell, so this cell raises a NameError unless a
# fresh split was made first. The train path is the same file train_glas is read from.
train_glas.write('/data/peer/adamsj5/cell_typing/train_glas.h5ad')# lilac location
test_glas.write('/data/peer/adamsj5/cell_typing/test_glas.h5ad')# lilac location

### HBCA

In [None]:
# Full HBCA dataset. `ad.read_h5ad` replaces the deprecated `ad.read` alias.
adata_HBCA = ad.read_h5ad('/data/peer/adamsj5/cell_typing/Kumar2023_breast.h5ad')

In [None]:
# Re-index the gene table by its 'feature_name' column.
adata_HBCA.var = adata_HBCA.var.set_index('feature_name')

## Models - All at Once

In [None]:
# Train every model variant on CT_45.
# Fixed: the CT_45 dataset is loaded under the name `adatact_45`; the name
# `adata_45` used here before is never assigned in this notebook.
make_all_models(
    adatact_45,
    annot_col='Manually_curated_celltype',
    abrev='CT_45',
    percent_train=0.20,
    write_loc='New Models/CT_45 Models/',
    train_data='/data/peer/adamsj5/cell_typing/train_test_data/CT_45_Train.h5ad',
)


In [None]:
# Train every model variant on CT_98.
# Fixed: the CT_98 dataset is loaded under the name `adatact_98`; the name
# `adata_98` used here before is never assigned in this notebook.
make_all_models(
    adatact_98,
    annot_col='Harmonised_detailed_type',
    abrev='CT_98',
    percent_train=0.20,
    write_loc='New Models/CT_98 Models/',
    train_data='/data/peer/adamsj5/cell_typing/train_test_data/CT_98_train.h5ad',
)


In [None]:
# Train every model variant on COV_PBMC.
make_all_models(
    adata_COV,
    annot_col='full_clustering',
    abrev='COV',
    percent_train=0.20,
    write_loc='New Models/COV_PBMC Models/',
    train_data='/data/peer/adamsj5/cell_typing/train_test_data/COV_Train.h5ad',
)


🍳 Preparing data before training
👀 The input training data is processed as an array-like object
✂️ 1224 non-expressed genes are filtered out
🔬 Input data has 129473 cells and 23513 genes
⚖️ Scaling input data
🏋️ Training data using mini-batch SGD logistic regression
⏳ Epochs: [1/10]
⏳ Epochs: [2/10]
⏳ Epochs: [3/10]
⏳ Epochs: [4/10]
⏳ Epochs: [5/10]
⏳ Epochs: [6/10]
⏳ Epochs: [7/10]
⏳ Epochs: [8/10]


In [None]:
# Train every model variant on the Glasner data (percent_train left at its default).
make_all_models(
    adata_glas,
    annot_col='finer_cell_types',
    abrev='g',
    write_loc='New Models/Glasner Models/',
    train_data='/data/peer/adamsj5/cell_typing/train_test_data/train_glas.h5ad',
)


In [None]:
# Train every model variant on HBCA.
make_all_models(
    adata_HBCA,
    annot_col='cell_type',
    abrev='HBCA',
    percent_train=0.15,
    write_loc='New Models/HBCA Models/',
    train_data='/data/peer/adamsj5/cell_typing/train_test_data/HBCA_Train.h5ad',
)


## Models - One At A Time

### Model 0
Retrain basic celltypist model on this data

In [None]:
# Model 0 (basic CellTypist model) on the CT_45 training split.
model_ct = make_model(
    model_ver=0,
    X=indatact_45,
    labels=labelsct_45,
    genes=genesct_45,
    check_expression=True,
    write_loc='New Models/CT_45 Models/ct_model_0',
)

In [None]:
# Model 0 (basic CellTypist model) on the CT_98 training split.
model_ct_98 = make_model(
    model_ver=0,
    X=indatact_98,
    labels=labelsct_98,
    genes=genesct_98,
    write_loc='New Models/CT_98 Models/98_model_0',
)

In [None]:
# Model 0 (basic CellTypist model) on the COV_PBMC training split.
model_COV = make_model(
    model_ver=0,
    X=indata_COV,
    labels=labels_COV,
    genes=genes_COV,
    write_loc='New Models/COV_PBMC Models/COV_model_0',
)

In [None]:
#model_g = make_model(model_ver = 0, X = indata_glas, labels = labels_glas, genes = genes_glas, write_loc = 'New Models/Glasner Models/g_model_0')
# Model 0 on the Glasner training split: train_1 is called directly with the
# expression check switched off, and the model is then written to disk.
model_g = train_1(
    X=indata_glas,
    labels=labels_glas,
    genes=genes_glas,
    check_expression=False,
    use_SGD=True,
    mini_batch=True,
    balance_cell_type=True,
    feature_selection=True,
)
model_g.write('New Models/Glasner Models/g_model_0')

### Model 1
Train only once

In [None]:
# Model 1 (trained only once, no feature selection) on the CT_45 training split.
model_nofs = make_model(
    model_ver=1,
    X=indatact_45,
    labels=labelsct_45,
    genes=genesct_45,
    check_expression=True,
    write_loc='New Models/CT_45 Models/ct_model_1',
)

In [None]:
# Model 1 (trained only once, no feature selection) on the CT_98 training split.
model_nofs_98 = make_model(
    model_ver=1,
    X=indatact_98,
    labels=labelsct_98,
    genes=genesct_98,
    write_loc='New Models/CT_98 Models/98_model_1',
)

In [None]:
# Model 1 (trained only once, no feature selection) on the COV_PBMC training split.
model_nofs_COV = make_model(
    model_ver=1,
    X=indata_COV,
    labels=labels_COV,
    genes=genes_COV,
    write_loc='New Models/COV_PBMC Models/COV_model_1',
)

### Model 2
Use L1 regularization instead of L2

In [None]:
# Model 2 (L1 instead of L2 regularization) on the CT_45 training split.
model_L1 = make_model(
    model_ver=2,
    X=indatact_45,
    labels=labelsct_45,
    genes=genesct_45,
    check_expression=True,
    write_loc='New Models/CT_45 Models/ct_model_2',
)

In [None]:
# Model 2 (L1 instead of L2 regularization) on the CT_98 training split.
model_L1_98 = make_model(
    model_ver=2,
    X=indatact_98,
    labels=labelsct_98,
    genes=genesct_98,
    write_loc='New Models/CT_98 Models/98_model_2',
)

In [None]:
# Model 2 (L1 instead of L2 regularization) on the COV_PBMC training split.
model_L1_COV = make_model(
    model_ver=2,
    X=indata_COV,
    labels=labels_COV,
    genes=genes_COV,
    write_loc='New Models/COV_PBMC Models/COV_model_2',
)

In [None]:
# Model 2 (L1 instead of L2 regularization) on the Glasner training split.
model_L1_g = make_model(
    model_ver=2,
    X=indata_glas,
    labels=labels_glas,
    genes=genes_glas,
    write_loc='New Models/Glasner Models/g_model_2',
)

### Model 3
Use only cytopus genes

In [None]:
# Model 3 (Cytopus genes only) on the CT_45 training split.
model_cp = make_model(
    model_ver=3,
    X=indatact_45_cp,
    labels=labelsct_45_cp,
    genes=genesct_45_cp,
    write_loc='New Models/CT_45 Models/ct_model_3',
)

In [None]:
# Model 3 (Cytopus genes only) on the CT_98 training split.
model_98_cp = make_model(
    model_ver=3,
    X=indatact_98_cp,
    labels=labelsct_98_cp,
    genes=genesct_98_cp,
    write_loc='New Models/CT_98 Models/98_model_3',
)

In [None]:
# Model 3 (Cytopus genes only) on the COV_PBMC training split.
model_cp_COV = make_model(
    model_ver=3,
    X=indata_COV_cp,
    labels=labels_COV_cp,
    genes=genes_COV_cp,
    write_loc='New Models/COV_PBMC Models/COV_model_3',
)

In [None]:
# Model 3 (Cytopus genes only) on the Glasner training split.
model_cp_g = make_model(
    model_ver=3,
    X=indata_glas_cp,
    labels=labels_glas_cp,
    genes=genes_glas_cp,
    write_loc='New Models/Glasner Models/g_model_3',
)

### Model 4
Confirm cytopus genes are included in feature selection

In [None]:
# Model 4 (Cytopus genes kept in the feature selection) on the CT_45 training split.
model_ct_fs = make_model(
    model_ver=4,
    X=indatact_45,
    labels=labelsct_45,
    genes=genesct_45,
    check_expression=True,
    cyto_genes=cp_and_ct_genes_45,
    write_loc='New Models/CT_45 Models/ct_model_4',
)

In [None]:
# Model 4 (Cytopus genes kept in the feature selection) on the CT_98 training split.
model_cp_fs_98 = make_model(
    model_ver=4,
    X=indatact_98,
    labels=labelsct_98,
    genes=genesct_98,
    cyto_genes=cp_and_ct_genes_98,
    write_loc='New Models/CT_98 Models/98_model_4',
)

In [None]:
# Model 4 (Cytopus genes kept in the feature selection) on the COV_PBMC training split.
model_cp_fs_COV = make_model(
    model_ver=4,
    X=indata_COV,
    labels=labels_COV,
    genes=genes_COV,
    cyto_genes=cp_and_ct_genes_COV,
    write_loc='New Models/COV_PBMC Models/COV_model_4',
)

In [None]:
# Model 4 (Cytopus genes kept in the feature selection) on the Glasner training split.
model_cp_fs_glas = make_model(
    model_ver=4,
    X=indata_glas,
    labels=labels_glas,
    genes=genes_glas,
    cyto_genes=cp_and_ct_genes_glas,
    write_loc='New Models/Glasner Models/g_model_4',
)

### Model 5

In [None]:
# Model 5 (combination of models 2 and 4) on the CT_45 training split.
model_ct_combo = make_model(
    model_ver=5,
    X=indatact_45,
    labels=labelsct_45,
    genes=genesct_45,
    check_expression=True,
    cyto_genes=cp_and_ct_genes_45,
    write_loc='New Models/CT_45 Models/ct_model_5',
)

In [None]:
# Model 5 (combination of models 2 and 4) on the CT_98 training split.
model_98_combo = make_model(
    model_ver=5,
    X=indatact_98,
    labels=labelsct_98,
    genes=genesct_98,
    cyto_genes=cp_and_ct_genes_98,
    write_loc='New Models/CT_98 Models/98_model_5',
)

In [None]:
# Model 5 (combination of models 2 and 4) on the COV_PBMC training split.
model_COV_combo = make_model(
    model_ver=5,
    X=indata_COV,
    labels=labels_COV,
    genes=genes_COV,
    cyto_genes=cp_and_ct_genes_COV,
    write_loc='New Models/COV_PBMC Models/COV_model_5',
)

In [None]:
# Model 5 (combination of models 2 and 4) on the Glasner training split.
model_glas_combo = make_model(
    model_ver=5,
    X=indata_glas,
    labels=labels_glas,
    genes=genes_glas,
    cyto_genes=cp_and_ct_genes_glas,
    write_loc='New Models/Glasner Models/g_model_5',
)