Tutorial from https://chanzuckerberg.github.io/cellxgene-census/notebooks/experimental/pytorch.html

In [50]:
import cellxgene_census
import cellxgene_census.experimental.ml as census_ml
import tiledbsoma as soma
import torch

import pandas as pd

import os
import pickle



In [2]:
cellxgene_census.__version__

'1.11.1'

## Build the Experiment and the DataPipe

In [25]:
# gene and cell type info stored on Turbo
os.chdir('/nfs/turbo/umms-welchjd/mccell')

First, let's load the gene list and cell type list that we want from the Census. Then we construct the ```var_val_filter``` and ```obs_val_filter``` for querying the census.

In [53]:
# load gene list
biomart = pd.read_csv('mart_export.txt')

coding_only = biomart[biomart['Gene type'] == 'protein_coding']

gene_list = coding_only['Gene stable ID'].to_list()

var_val_filter = '''feature_id in {}'''.format(gene_list)

# load the cell type list
cell_type_list_name = 'cell_type_list.txt'
with open(cell_type_list_name,'rb') as fp:
    cell_type_list = pickle.load(fp)

obs_val_filter = '''assay == "10x 3\' v3" and is_primary_data == True and cell_type_ontology_term_id in {}'''.format(cell_type_list_name)


In [55]:
#organism = "Homo sapiens"
col_names = {"obs": ["cell_type_ontology_term_id"]}


In [56]:
experiment = census["census_data"]["homo_sapiens"]


In [58]:
experiment_datapipe = census_ml.ExperimentDataPipe(
    experiment,
    measurement_name="RNA",
    X_name="raw",
    obs_query=soma.AxisQuery(value_filter=obs_val_filter),
    var_query=soma.AxisQuery(value_filter=var_val_filter),
    obs_column_names=["cell_type_ontology_term_id"],
    batch_size=128,
    shuffle=True,
    soma_chunk_size=10_000,
)


In [59]:
experiment_datapipe.shape

SOMAError: SOMAError: `in` operator syntax must be written as `attr in ['l', 'i', 's', 't']`

At:
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/tiledbsoma/_query_condition.py(141): init_query_condition
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/tiledbsoma/_dataframe.py(359): read
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/somacore/query/query.py(731): _load_joinids
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/somacore/query/query.py(707): obs
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/somacore/query/query.py(167): obs_joinids
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/cellxgene_census/experimental/ml/pytorch.py(494): _init
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/cellxgene_census/experimental/ml/pytorch.py(651): shape
  /tmp/ipykernel_2529267/1568377421.py(1): <module>
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py(3508): run_code
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py(3448): run_ast_nodes
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py(3269): run_cell_async
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/IPython/core/async_helpers.py(129): _pseudo_sync_runner
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py(3064): _run_cell
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py(3009): run_cell
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/ipykernel/zmqshell.py(540): run_cell
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/ipykernel/ipkernel.py(422): do_execute
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/ipykernel/kernelbase.py(729): execute_request
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/ipykernel/kernelbase.py(409): dispatch_shell
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/ipykernel/kernelbase.py(502): process_one
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/ipykernel/kernelbase.py(513): dispatch_queue
  /sw/pkgs/arc/python3.10-anaconda/2023.03/lib/python3.10/asyncio/events.py(80): _run
  /sw/pkgs/arc/python3.10-anaconda/2023.03/lib/python3.10/asyncio/base_events.py(1906): _run_once
  /sw/pkgs/arc/python3.10-anaconda/2023.03/lib/python3.10/asyncio/base_events.py(603): run_forever
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/tornado/platform/asyncio.py(195): start
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/ipykernel/kernelapp.py(725): start
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/traitlets/config/application.py(1043): launch_instance
  /home/fujoshua/cell_classification/env/lib/python3.10/site-packages/ipykernel_launcher.py(17): <module>
  /sw/pkgs/arc/python3.10-anaconda/2023.03/lib/python3.10/runpy.py(86): _run_code
  /sw/pkgs/arc/python3.10-anaconda/2023.03/lib/python3.10/runpy.py(196): _run_module_as_main


In [9]:
experiment = census["census_data"]["homo_sapiens"]


In [10]:
experiment_datapipe = census_ml.ExperimentDataPipe(
    experiment,
    measurement_name="RNA",
    X_name="raw",
    obs_query=soma.AxisQuery(value_filter="tissue_general == 'tongue' and is_primary_data == True"),
    obs_column_names=["cell_type"],
    batch_size=128,
    shuffle=True,
    soma_chunk_size=10_000,
)


In [11]:
experiment_datapipe.shape

(15020, 60664)

In [12]:
train_datapipe, test_datapipe = experiment_datapipe.random_split(weights={"train": 0.8, "test": 0.2}, seed=1)


In [13]:
experiment_dataloader = census_ml.experiment_dataloader(train_datapipe)


In [15]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()  # noqa: UP008
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        outputs = torch.sigmoid(self.linear(x))
        return outputs


In [16]:
def train_epoch(model, train_dataloader, loss_fn, optimizer, device):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0

    for batch in train_dataloader:
        optimizer.zero_grad()
        X_batch, y_batch = batch

        X_batch = X_batch.float().to(device)

        # Perform prediction
        outputs = model(X_batch)

        # Determine the predicted label
        probabilities = torch.nn.functional.softmax(outputs, 1)
        predictions = torch.argmax(probabilities, axis=1)

        # Compute the loss and perform back propagation

        # Exclude the cell_type labels, which are in the second column
        y_batch = y_batch[:, 1]
        y_batch = y_batch.to(device)

        train_correct += (predictions == y_batch).sum().item()
        train_total += len(predictions)

        loss = loss_fn(outputs, y_batch.long())
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    train_loss /= train_total
    train_accuracy = train_correct / train_total
    return train_loss, train_accuracy


In [18]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The size of the input dimension is the number of genes
input_dim = experiment_datapipe.shape[1]

# The size of the output dimension is the number of distinct cell_type values
cell_type_encoder = experiment_datapipe.obs_encoders["cell_type"]
output_dim = len(cell_type_encoder.classes_)

model = LogisticRegression(input_dim, output_dim).to(device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-05)

for epoch in range(3):
    train_loss, train_accuracy = train_epoch(model, experiment_dataloader, loss_fn, optimizer, device)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.7f} Accuracy {train_accuracy:.4f}")


Epoch 1: Train Loss: 0.0185056 Accuracy 0.2145
Epoch 2: Train Loss: 0.0162072 Accuracy 0.2665
Epoch 3: Train Loss: 0.0149051 Accuracy 0.3136


In [19]:
experiment_dataloader = census_ml.experiment_dataloader(test_datapipe)
X_batch, y_batch = next(iter(experiment_dataloader))


In [20]:
model.eval()

model.to(device)
outputs = model(X_batch.to(device))

probabilities = torch.nn.functional.softmax(outputs, 1)
predictions = torch.argmax(probabilities, axis=1)

display(predictions)


tensor([1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1,
        7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 7, 1, 1, 1, 1, 1, 7,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 7, 7, 1, 1, 1, 1, 1])

In [3]:
census = cellxgene_census.open_soma(uri = "/scratch/welchjd_root/welchjd99/fujoshua/soma")

In [4]:
with cellxgene_census.open_soma(uri = "/scratch/welchjd_root/welchjd99/fujoshua/soma") as census:
    cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
        value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
        column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
    )


In [7]:
cell_metadata

<tiledbsoma._read_iters.TableReadIter at 0x152b7f2edff0>

In [None]:
census

In [5]:
census["census_data"]["homo_sapiens"]

<Experiment 'file:///scratch/welchjd_root/welchjd99/fujoshua/soma/census_data/homo_sapiens' (CLOSED for 'r')>

In [6]:
census["census_data"]["homo_sapiens"].obs.read(
        value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
        column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
    )

TileDBError: [TileDB::Array] Error: Cannot get array schema; Array is not open