In [29]:
# Prerequisites (run once in a shell, not in this notebook):
# mamba install -y mallet
# pip install -U scmallet

import scmallet

# Display the installed scmallet version so the run is traceable.
scmallet.__version__

'0.0.4.dev1+gaa90db5'

In [30]:
# NOTE: Mallet 2.0 does not recognise `--version` (see "Unrecognized command" in the output);
# the usage listing it prints instead still shows that the `mallet` executable can be run.
!mallet --version

Unrecognized command: --version
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into te

In [2]:
from scmallet import Mallet, binarize_topics

In [3]:
# Create the Mallet wrapper; "cistopic" is passed as its output directory.
mallet = Mallet(output_dir="cistopic")

In [4]:
import anndata
import numpy as np

In [5]:
# Synthetic binary matrix for the demo: 100 cells (rows) x 10,000 regions (columns).
# A seeded Generator makes the data identical on every Restart & Run All;
# the original unseeded np.random.randint call produced different data each run.
X = np.random.default_rng(seed=0).integers(0, 2, size=(100, 10000))
# Wrap the matrix in an AnnData object; this is the `data` argument given to mallet.fit below.
adata = anndata.AnnData(X=X)

## Fit

In [6]:
mallet.fit(
    num_topics=[5, 10],
    data=adata,  # can also be an adata path or a sparse matrix; the data is binarized internally
    cpu_per_task=8,
    mem_gb=16,  # not really memory intensive
    # any other arguments go into mallet.train
    iterations=300,
)

2024-04-11 01:30:25,216	INFO worker.py:1752 -- Started a local Ray instance.


### Re-fit with more iterations, or resume from an interruption

In [8]:
# once the mallet model has been fit, it can be re-fit with more iterations
# (here 500 instead of the 300 used in the first fit)
mallet.fit(
    num_topics=[5, 10],
    data=adata,
    cpu_per_task=8,
    mem_gb=16,  # not really memory intensive
    # any other arguments go into mallet.train
    iterations=500,
)

### Note: if the model has already been trained for enough iterations, no further fitting is done

In [9]:
# this will not trigger a re-fit: the 400 iterations requested here are fewer
# than the 500 iterations requested in the previous fit

mallet.fit(
    num_topics=[5, 10],
    data=adata,
    cpu_per_task=8,
    mem_gb=16,
    iterations=400,
)

## Topics

In [10]:
# Fetch the cell-topic and region-topic matrices of the 5-topic model
# and report the shape of each.
num_topics = 5

cell_topics = mallet.get_cell_topics(num_topics)
region_topics = mallet.get_region_topics(num_topics)

for label, matrix in (("cell_topics", cell_topics), ("region_topics", region_topics)):
    print(label, matrix.shape)

cell_topics (100, 5)
region_topics (10000, 5)


## Inference In Parallel

In [12]:
# Larger synthetic binary matrix for inference: 1,000 cells (rows) x 10,000 regions (columns).
# A seeded Generator makes the data identical on every Restart & Run All;
# the original unseeded np.random.randint call produced different data each run.
X = np.random.default_rng(seed=1).integers(0, 2, size=(1000, 10000))
# Wrap the matrix in an AnnData object; the bare last line displays its summary.
large_adata = anndata.AnnData(X=X)
large_adata

AnnData object with n_obs × n_vars = 1000 × 10000

In [13]:
# once the mallet model has been fit, it can be used to infer topics for new cells;
# the results appear under the 'cistopic.topic*' keys of large_adata.obsm (see the repr further down)
mallet.infer_adata(large_adata)

In [18]:
# Optionally attach the region-topic matrix of every trained model to `varm`,
# one entry per topic count, keyed as "cistopic.topic<N>".
for n_topics in mallet.trained_num_topics:
    varm_key = f"cistopic.topic{n_topics}"
    region_topic_matrix = mallet.get_region_topics(n_topics)
    large_adata.varm[varm_key] = region_topic_matrix

In [19]:
# Display the object again: obsm and varm now both list the 'cistopic.topic*' entries.
large_adata

AnnData object with n_obs × n_vars = 1000 × 10000
    obsm: 'cistopic.topic10', 'cistopic.topic5'
    varm: 'cistopic.topic10', 'cistopic.topic5'

## Binarize topics

In [28]:
import pandas as pd

# Put the 10-topic cell embedding into a DataFrame indexed by observation name,
# binarize it, and preview the first rows of the boolean result.
cell_topic_df = pd.DataFrame(
    data=large_adata.obsm["cistopic.topic10"],
    index=large_adata.obs_names,
)
cell_binary_topics = binarize_topics(cell_topic_df)
cell_binary_topics.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,True,False,False,True,True,True,False,False,True,True
1,False,True,True,True,True,False,False,False,False,False
2,False,True,True,False,True,False,False,True,True,False
3,True,False,True,False,True,False,False,True,False,True
4,False,False,True,False,True,True,True,True,False,True
