# Tokenization

In [1]:
from stFormer.tokenization.median_estimator import MedianEstimator

## 1.1 Create Gene Medians and T-Digests
1. Compute T-Digest: compact summary of full distribution of normalized expression data
2. Compute Gene Medians (Measure of Central Tendency): Summary of typical expression for each gene
3. Write files to output directory

In [None]:
# Collect the MedianEstimator settings in one place, then build the object.
estimator_settings = {
    'data_dir': 'data',        # directory holding the anndata/loom files used for tokenization/model creation
    'extension': '.h5ad',      # input file type, one of '.h5ad' or '.loom'
    'out_path': 'output',      # directory where output files are written
    'merge_tdigests': True,    # merge gene distributions across samples (set False if only one dataset)
}
estimator = MedianEstimator(**estimator_settings)

In [None]:
# Build the per-gene t-digests, then display them as the cell output.
estimator.compute_tdigests() # worker for computing tdigests (merges distributions if more than one dataset)
estimator.tdigests # show tdigests

In [None]:
# Summarize gene expression into per-gene medians, then display them as the cell output.
estimator.get_median_dict() # worker for summarizing gene expression 
estimator.median # show the computed medians

In [None]:
# Persist both results to the estimator's out_path ('output' as configured when the estimator was created).
estimator.write_tdigests() # write t-digests to out_path
estimator.write_medians() # write gene medians to out_path

## 1.2 Tokenize Spot Level Resolution (55um)
This option takes each cell's (spot's) gene expression and tokenizes the dataset through the following steps:
1. Selects non-zero genes
2. Normalizes counts
3. Maps genes -> tokens
4. Creates ranks for genes
5. Truncates the token sequence to a set length (e.g., 2048 tokens, i.e., "genes")
6. Bundles the result into a Hugging Face Dataset, optionally augmented with cell metadata (e.g., sample_ID, coordinates, etc.)

In [None]:
from stFormer.tokenization.SpatialTokenize import SpatialTokenizer, create_token_dictionary
from pathlib import Path

In [None]:
# Create the token dictionary from the gene medians and save it for the tokenizer.
import pickle  # needed for both loading the medians and saving the dictionary

# `median_dict` was never defined in this notebook, so reload the medians from disk.
# NOTE(review): assumes estimator.write_medians() produced 'output/gene_medians.pickle'
# (the same path the tokenizer below receives as gene_median_file) -- confirm.
with open('output/gene_medians.pickle', 'rb') as file:
    median_dict = pickle.load(file)

token_dictionary = create_token_dictionary(median_dict=median_dict)

# Written to the path that SpatialTokenizer receives as token_dict_file.
with open('output/token_dictionary.pickle', 'wb') as file:
    pickle.dump(token_dictionary, file)

In [None]:
# Spot-level tokenizer (55um resolution).
# Metadata mapping handed to the tokenizer as custom_meta
# (presumably input column -> output column; verify against SpatialTokenizer).
spot_custom_meta = {'patient_id':'sample','classification':'classification','subtype':'subtype'}

tok_spot = SpatialTokenizer(
    mode='spot',                                       # spot resolution (55um)
    gene_length=2048,                                  # truncate each cell's sequence to 2048 tokens
    custom_meta=spot_custom_meta,
    nproc=16,                                          # number of CPUs for data I/O
    gene_median_file='output/gene_medians.pickle',     # location of the gene medians
    token_dict_file='output/token_dictionary.pickle',  # location of the token dictionary
)

# Tokenize every h5ad file in the input directory and write the result under output/spot.
tok_spot.tokenize(
    data_dir=Path('data'),          # location of the h5ad/loom data
    out_dir=Path('output/spot'),    # where the tokenized data is written
    prefix='visium_spot',           # prefix of the output files
    file_format='h5ad',             # input file format, one of 'h5ad' or 'loom'
)

## 1.3 Tokenize Neighbor Level Resolution
Performs the same steps as spot-level tokenization, but with key changes intended to model both spot-level and neighborhood-level expression (think of this as niche/neighborhood tokenization):
1. Calculates gene ranks for spot
2. Creates connectivities matrix to locate neighbors based upon coordinate information
3. Calculates gene ranks for each neighboring spot and averages ranks
4. Concatenates truncated tokens (spot) + truncated tokens (neighborhood)


**Note:** gene_length = 2048 means that both spot and neighbor tokens are truncated to 2048, so the final embedding size should be 4097

In [None]:
# Neighborhood-level tokenizer: same settings as the spot-level run except for the mode.
# Metadata mapping handed to the tokenizer as custom_meta
# (presumably input column -> output column; verify against SpatialTokenizer).
neighbor_custom_meta = {'patient_id':'sample','classification':'classification','subtype':'subtype'}

tok_neighbor = SpatialTokenizer(
    mode='neighborhood',                               # neighborhood (niche) tokenization
    gene_length=2048,                                  # truncation length passed to the tokenizer
    custom_meta=neighbor_custom_meta,
    nproc=16,                                          # same worker count as the spot-level run
    gene_median_file='output/gene_medians.pickle',     # location of the gene medians
    token_dict_file='output/token_dictionary.pickle',  # location of the token dictionary
)

# Tokenize the same input directory and write the result under output/neighborhood.
tok_neighbor.tokenize(
    data_dir=Path('data'),
    out_dir=Path('output/neighborhood'),
    prefix='visium_neighborhood',
    file_format='h5ad',
)

## 1.4 Visualize Embeddings

from datasets import load_from_disk
import pandas as pd
import numpy as np

In [None]:
# Load the spot-level and neighborhood-level tokenized datasets written above.
dss, dsn = (
    load_from_disk(dataset_path)
    for dataset_path in (
        'output/spot/visium_spot.dataset',
        'output/neighborhood/visium_neighborhood.dataset',
    )
)

# Convert both to pandas DataFrames for inspection.
dfs, dfn = dss.to_pandas(), dsn.to_pandas()

# Display the neighborhood-level table as the cell output.
dfn