# GenePT and scGPT cell classification performance on Tabula Sapiens

This notebook downloads (if necessary) the [Tabula Sapiens data set](https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5)
and uses GenePT and scGPT pretrained embeddings to embed the cells and then tests classification performance.  Tabula Sapiens is a benchmark dataset, so the models we train cannot be used for real-world applications. Rather, we are training these classifiers to benchmark our GenePT embeddings and pretrained scGPT embeddings on a large dataset with a high number of cell types.


In [1]:
%run notebook_setup.ipynb

autoreload enabled
repo_dir set to /Users/rj/personal/GenePT-tools
File already exists at /Users/rj/personal/GenePT-tools/data/GenePT_emebdding_v2.zip
Extracting files...
Extracting GenePT_emebdding_v2/
Skipping GenePT_emebdding_v2/NCBI_UniProt_summary_of_genes.json - already exists with same size
Skipping GenePT_emebdding_v2/GenePT_gene_embedding_ada_text.pickle - already exists with same size
Skipping GenePT_emebdding_v2/GenePT_gene_protein_embedding_model_3_text.pickle. - already exists with same size
Skipping GenePT_emebdding_v2/NCBI_summary_of_genes.json - already exists with same size
Extraction complete!
Skipping embedding_original_ada_text.parquet - already exists
Skipping embedding_original_large_3.parquet - already exists
Skipping embedding_associations_age_cell_type_drugs_pathways_openai_large.parquet - already exists
Skipping embedding_associations_age_drugs_pathways_openai_large.parquet - already exists
Skipping embedding_associations_cell_type_openai_large.parquet - alrea

In [2]:
import pandas as pd

embedded_100k_ts_files = {
    name: data_dir / f"{name}.parquet"
    for name in (
        "tabula_sapiens_100k_scgpt_embedding",
        "tabula_sapiens_100k_genept_embedding_original_ada_text",
        "tabula_sapiens_100k_genept_embedding_original_large_3",
        "tabula_sapiens_100k_genept_embedding",
        "tabula_sapiens_100k_genept_embedding_v2",
        "tabula_sapiens_100k_genept_embedding_v3",
        "tabula_sapiens_100k_genept_embedding_cell_type_tissue_drug_pathway_openai_large"
    )
}


In [3]:
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import umap
from sklearn.decomposition import PCA


def umap_embed(embed_pdf, n_samples=10000):
    # Then apply UMAP to the PCA results
    reducer = umap.UMAP(random_state=42)
    np.random.seed(42)
    random_indices = np.random.choice(embed_pdf.shape[0], size=n_samples, replace=False)
    umap_sample_pdf = embed_pdf.iloc[random_indices].drop(
        columns=["cell_type", "donor_id", "broad_cell_class"]
    )
    umap_embeddings = reducer.fit_transform(umap_sample_pdf)

    umap_df = pd.DataFrame(
        umap_embeddings, columns=["UMAP1", "UMAP2"], index=umap_sample_pdf.index
    ).merge(embed_pdf, left_index=True, right_index=True)
    return umap_df


umap_embeddings = {}

for name, file in embedded_100k_ts_files.items():
    embed_pdf = pd.read_parquet(file)
    umap_embeddings[name] = umap_embed(embed_pdf)
    del embed_pdf


  from .autonotebook import tqdm as notebook_tqdm
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [4]:
for name, embed_pdf in umap_embeddings.items():
    # Create the plot
    fig = px.scatter(
        embed_pdf,
        x="UMAP1",
        y="UMAP2",
        color="cell_type",
        opacity=0.7,
        title=name,
    )

    # Update layout to make the data area square
    fig.update_layout(
        title={"y": 0.95, "x": 0.5, "xanchor": "center", "yanchor": "top"},
        width=1400,  # Wider to accommodate legend
        height=800,
    )

    # Make the plot area square by adjusting margins
    fig.update_layout(
        margin=dict(r=200),  # Add right margin for legend
        xaxis=dict(domain=[0, 0.8]),  # Restrict plot area width to make it square
        yaxis=dict(scaleanchor="x", scaleratio=1),
    )

    fig.show()

In [5]:
embed_scgpt_pdf = pd.read_parquet(embedded_100k_ts_files["tabula_sapiens_100k_scgpt_embedding"])

In [6]:
import plotly.express as px

px.histogram(embed_scgpt_pdf.broad_cell_class.sort_values())

In [7]:
# Create a cross-tabulation of donor_id and cell_type
heatmap_data = pd.crosstab(embed_scgpt_pdf.donor_id, embed_scgpt_pdf.broad_cell_class)

# Create heatmap using plotly
import numpy as np
import plotly.express as px

# Apply log10 transform to the data (adding 1 to avoid log(0))
log_data = np.log10(heatmap_data.values + 1)

# Create regular heatmap with log-transformed data
fig = px.imshow(
    log_data,
    labels=dict(x="Cell Type", y="Donor ID", color="Count"),
    x=heatmap_data.columns,
    y=heatmap_data.index,
    color_continuous_scale="Viridis",
    title="Cell Type Distribution Across Donors (Log Scale)",
    aspect="auto",
)

# Update hover template to show both log and linear values
fig.data[0].customdata = heatmap_data.values
fig.data[0].hovertemplate = (
    "Cell Type: %{x}<br>Donor ID: %{y}<br>Count: %{customdata:.0f}<br>Log10 Count: %{z:.2f}<extra></extra>"
)

# Create tick values for the colorbar (in log space)
tick_values = np.linspace(log_data.min(), log_data.max(), 6)
# Convert tick values back to linear space for labels
tick_labels = [f"{int(10**x - 1)}" for x in tick_values]

# Update layout and colorbar
fig.update_layout(
    xaxis_title="Cell Type",
    yaxis_title="Donor ID",
    height=700,  # Adjusted height (increase as needed)
    coloraxis=dict(
        colorbar=dict(title="Count", tickvals=tick_values, ticktext=tick_labels)
    ),
)

fig.show()

In [8]:
def create_cell_type_groups(df, min_samples=600):
    """Create grouped cell types, combining rare types into 'other'

    Args:
        df: DataFrame containing 'broad_cell_class' column
        min_samples: Minimum number of samples required to keep a category

    Returns:
        Series with grouped cell types
    """
    # Get value counts and identify small categories
    category_counts = pd.Series(df.broad_cell_class.value_counts())
    small_categories = category_counts[category_counts < min_samples].index

    # Get existing categories and add 'other'
    existing_categories = df.broad_cell_class.cat.categories
    new_categories = pd.Index(existing_categories).append(pd.Index(["other"]))

    # Create new column with expanded categories
    cell_type_grouped = df.broad_cell_class.astype(str).astype("category")
    cell_type_grouped = cell_type_grouped.cat.set_categories(new_categories)

    # Assign the 'other' category
    cell_type_grouped.loc[df.broad_cell_class.isin(small_categories)] = "other"

    return cell_type_grouped


# Apply the function to both dataframes
embed_scgpt_pdf["cell_type_grouped"] = create_cell_type_groups(embed_scgpt_pdf)
# embed_genept_pdf["cell_type_grouped"] = create_cell_type_groups(embed_genept_pdf)

In [9]:
embed_scgpt_pdf.shape

(100000, 516)

In [10]:

embed_scgpt_pdf.cell_type_grouped.value_counts()


cell_type_grouped
t cell                             14053
stromal cell                       13013
myeloid leukocyte                   8565
lymphocyte of b lineage             8499
contractile cell                    7916
fibroblast                          6995
endothelial cell                    6019
stem cell                           5937
granulocyte                         5797
intestinal epithelial cell          5764
transitional epithelial cell        5384
other                               3180
innate lymphoid cell                2507
glandular epithelial cell           1988
epithelial cell                     1810
cardiac endothelial cell            1088
epithelial cell of lung              819
endo-epithelial cell                 666
ecto-epithelial cell                   0
ciliated epithelial cell               0
conjunctival epithelial cell           0
dendritic cell                         0
stratified epithelial cell             0
duct epithelial cell                   

In [11]:
train_test_counts =pd.merge(
    embed_scgpt_pdf.cell_type_grouped.value_counts(),
    embed_scgpt_pdf[embed_scgpt_pdf.donor_id == "TSP1"].cell_type_grouped.value_counts().rename("TSP1"),
    how="outer",
    left_index=True,
    right_index=True,
).merge(
    embed_scgpt_pdf[embed_scgpt_pdf.donor_id != "TSP1"].cell_type_grouped.value_counts().rename("not_TSP1"),
    how="outer",
    left_index=True,
    right_index=True,
).merge(
    embed_scgpt_pdf[embed_scgpt_pdf.donor_id == "TSP2"].cell_type_grouped.value_counts().rename("TSP2"),
    how="outer",
    left_index=True,
    right_index=True,
).merge(
    embed_scgpt_pdf[embed_scgpt_pdf.donor_id != "TSP2"].cell_type_grouped.value_counts().rename("not_TSP2"),
    how="outer",
    left_index=True,
    right_index=True,
).merge(
    embed_scgpt_pdf[embed_scgpt_pdf.donor_id == "TSP14"].cell_type_grouped.value_counts().rename("TSP14"),
    how="outer",
    left_index=True,
    right_index=True,
).merge(
    embed_scgpt_pdf[embed_scgpt_pdf.donor_id != "TSP14"].cell_type_grouped.value_counts().rename("not_TSP14"),
    how="outer",
    left_index=True,
    right_index=True,
)

train_test_counts

Unnamed: 0_level_0,count,TSP1,not_TSP1,TSP2,not_TSP2,TSP14,not_TSP14
cell_type_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
adventitial cell,0,0,0,0,0,0,0
cardiac endothelial cell,1088,0,1088,0,1088,57,1031
ciliated epithelial cell,0,0,0,0,0,0,0
conjunctival epithelial cell,0,0,0,0,0,0,0
contractile cell,7916,243,7673,884,7032,633,7283
dendritic cell,0,0,0,0,0,0,0
duct epithelial cell,0,0,0,0,0,0,0
ecto-epithelial cell,0,0,0,0,0,0,0
endo-epithelial cell,666,6,660,101,565,24,642
endothelial cell,6019,559,5460,2783,3236,1296,4723


In [12]:
# X = pd.DataFrame(ref_embed_adata.obsm["X_scGPT"])
# y = ref_embed_adata.obs["broad_cell_class"]
# X["donor_id"] = ref_embed_adata.obs.donor_id.cat.codes.to_numpy()

# # print("Shape of embedding features indicator:", embedding_features_indicator.shape)
# print("Shape of filtered features matrix:", X.shape)

In [13]:
# y == "t cell"

In [14]:
# def get_mask_for_label_excluding_donor(y, label, test_donor):
#     return (y == label) & (X.donor_id != test_donor)


# y = embed_scgpt_pdf.broad_cell_class
# test_donor = "TSP14"
# label = "endo-epithelial cell"
# mask = get_mask_for_label_excluding_donor(y, label, test_donor)
# y.index[mask]
# # embed_scgpt_pdf[mask]
# sample_count = mask.sum()
# n_samples = min(1000, sample_count)

# sampled_indices = pd.Index(
#     np.random.choice(pd.Series(y[mask].index), size=n_samples, replace=False)
# )

In [15]:
# from sklearn.model_selection import GroupShuffleSplit

# # Create group-wise split
# gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
# train_idx, test_idx = next(gss.split(X, y, groups=X.donor_id))

# # Split the data using the indices
# X_train = X.drop(columns=['donor_id']).iloc[train_idx]
# X_test = X.drop(columns=['donor_id']).iloc[test_idx]
# y_train = y.iloc[train_idx]
# y_test = y.iloc[test_idx]

In [16]:
# (y == "t cell").index

In [17]:
# (X.donor_id != test_donor).index

In [18]:
# del combined_embedding_pdf

In [19]:
# for comparison of different embeddings we don't need to look at the combined embeddings

# embed_scgpt_pdf.index = embed_genept_pdf.index
# combined_embedding_pdf = embed_scgpt_pdf.drop(
#     columns=["donor_id", "cell_type", "broad_cell_class", "cell_type_grouped"]
# ).merge(embed_genept_pdf, left_index=True, right_index=True)
# combined_embedding_pdf.shape

In [20]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Define the donors we want to evaluate
test_donors = ["TSP1", "TSP2", "TSP14"]
results = []


def get_mask_for_label_excluding_donor(y, label, test_donor):
    return (y == label) & (X.donor_id != test_donor)


# for embed_pdf, embed_name in zip(
#     [combined_embedding_pdf, embed_scgpt_pdf, embed_genept_pdf],
#     ["combined", "scGPT", "GenePT"],
# ):

for embed_name, file in embedded_100k_ts_files.items():
    print("Processing embedding:", embed_name)
    embed_pdf = pd.read_parquet(file)

    X = embed_pdf.drop(columns=["cell_type", "broad_cell_class"])
    y = create_cell_type_groups(embed_pdf)

    # Perform cross-validation, holding out one donor at a time
    for test_donor in test_donors:
        print(f"\n=== Cross Validation Fold: Testing on Donor {test_donor} ===")

        # Create initial train/test split based on donor
        train_mask = X.donor_id != test_donor
        test_indices = X[~train_mask].index

        # Subsample training data to get 200 samples per cell type
        train_indices = []
        clipped_train_indices = []
        for class_label in y.unique():

            mask = get_mask_for_label_excluding_donor(y, class_label, test_donor)
            sample_count = mask.sum()
            print(f"{class_label}: {sample_count}/{(y == class_label).sum()}")

            # Randomly sample up to 1000 indices
            if len(mask) > 0:
                train_indices.extend(y[mask].index)

                n_samples = min(1000, sample_count)
                np.random.seed(42)
                sampled_indices = pd.Index(
                    np.random.choice(
                        pd.Series(y[mask].index), size=n_samples, replace=False
                    )
                )
                clipped_train_indices.extend(sampled_indices)
            else:
                print(f"warning: class '{class_label}' has no samples!")

        # Create the final train/test splits
        X_train = X.drop(columns=["donor_id"]).iloc[train_indices]
        X_train_clipped = X.drop(columns=["donor_id"]).iloc[clipped_train_indices]
        X_test = X.drop(columns=["donor_id"]).iloc[test_indices]
        y_train = y.iloc[train_indices]
        y_train_clipped = y.iloc[clipped_train_indices]
        y_test = y.iloc[test_indices]

        print(y_train.value_counts().sort_index())
        print(y_test.value_counts().sort_index())

        print(f"Training set size: {len(X_train)}")
        print(f"Clipped Training set size: {len(X_train_clipped)}")
        print(f"Test set size: {len(X_test)}")
        print("\nTraining class distribution:")
        print(y_train.value_counts().sort_index())

        # Train and evaluate models
        models = {
            # "KNN": KNeighborsClassifier(n_neighbors=10),
            # "Random Forest": RandomForestClassifier(random_state=42),
            "LightGBM": LGBMClassifier(random_state=42, class_weight="balanced"),
        }

        for name, model in models.items():
            print(f"\n{name} Results:")
            print("-" * 50)
            if name == "Random Forest":
                model.fit(X_train, y_train)
            else:
                model.fit(X_train_clipped, y_train_clipped)
            y_pred = model.predict(X_test)

            valid_classes = sorted(set(y_test))
            # Generate report only for classes that exist in the data
            report = classification_report(
                y_test,
                y_pred,
                # labels=valid_classes,
                zero_division=0,
                output_dict=True,
            )
            # Store results
            results.append(
                {
                    "embed_name": embed_name,
                    "test_donor": test_donor,
                    "model": name,
                    # 'accuracy': report['accuracy'],
                    "macro_avg_f1": report["macro avg"]["f1-score"],
                    "weighted_avg_f1": report["weighted avg"]["f1-score"],
                    "train_size": len(X_train),
                    "test_size": len(X_test),
                    "report": report,
                }
            )

            print(classification_report(y_test, y_pred))

    del embed_pdf


# Convert results to DataFrame for easy viewing
results_df = pd.DataFrame(results)
print("\nSummary of Results:")
print(results_df.round(3))

Processing embedding: tabula_sapiens_100k_scgpt_embedding

=== Cross Validation Fold: Testing on Donor TSP1 ===
t cell: 13636/14053
lymphocyte of b lineage: 8332/8499
innate lymphoid cell: 2498/2507
endothelial cell: 5460/6019
other: 2693/3180
contractile cell: 7673/7916
granulocyte: 5651/5797
myeloid leukocyte: 8195/8565
cardiac endothelial cell: 1088/1088
glandular epithelial cell: 518/1988
epithelial cell: 1800/1810
epithelial cell of lung: 722/819
stem cell: 5500/5937
stromal cell: 12969/13013
fibroblast: 6762/6995
endo-epithelial cell: 660/666
intestinal epithelial cell: 5764/5764
transitional epithelial cell: 5228/5384
broad_cell_class
adventitial cell                       0
cardiac endothelial cell            1088
ciliated epithelial cell               0
conjunctival epithelial cell           0
contractile cell                    7673
dendritic cell                         0
duct epithelial cell                   0
ecto-epithelial cell                   0
endo-epithelial cell  


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.53      0.56      0.55       243
        endo-epithelial cell       0.04      0.83      0.08         6
            endothelial cell       0.93      0.84      0.88       559
             epithelial cell       0.25      0.70      0.37        10
     epithelial cell of lung       0.99      0.87      0.92        97
                  fibroblast       0.37      0.62      0.47       233
   glandular epithelial cell       0.55      0.14      0.22      1470
                 granulocyte       0.87      0.84      0.85       146
        innate lymphoid cell       0.06      0.67      0.10         9
  intestinal epithelial cell       0.00      0.00      0.00         0
     lymphocyte of b lineage       0.99      0.90      0.94       167
           myeloid leukocyte       0.83      0.80      0.82       370
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.72      0.85      0.78       884
        endo-epithelial cell       0.05      0.05      0.05       101
            endothelial cell       0.92      0.71      0.80      2783
             epithelial cell       0.87      0.61      0.72       234
     epithelial cell of lung       0.93      0.99      0.96       246
                  fibroblast       0.19      0.79      0.31       811
   glandular epithelial cell       0.42      0.75      0.54        51
                 granulocyte       0.88      0.90      0.89       583
        innate lymphoid cell       0.42      0.89      0.57       560
  intestinal epithelial cell       0.88      0.91      0.89      2201
     lymphocyte of b lineage       0.99      0.98      0.99      2563
           myeloid leukocyte       0.96      0.90      0.93      2219
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.65      0.44      0.52       243
        endo-epithelial cell       0.00      0.17      0.01         6
            endothelial cell       0.89      0.85      0.87       559
             epithelial cell       0.13      0.70      0.23        10
     epithelial cell of lung       0.96      0.77      0.86        97
                  fibroblast       0.42      0.69      0.52       233
   glandular epithelial cell       0.74      0.59      0.66      1470
                 granulocyte       0.75      0.72      0.73       146
        innate lymphoid cell       0.05      0.78      0.09         9
  intestinal epithelial cell       0.00      0.00      0.00         0
     lymphocyte of b lineage       0.93      0.89      0.91       167
           myeloid leukocyte       0.70      0.77      0.73       370
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.64      0.82      0.72       884
        endo-epithelial cell       0.00      0.00      0.00       101
            endothelial cell       0.89      0.65      0.75      2783
             epithelial cell       0.49      0.51      0.50       234
     epithelial cell of lung       0.97      0.97      0.97       246
                  fibroblast       0.22      0.76      0.34       811
   glandular epithelial cell       0.30      0.37      0.33        51
                 granulocyte       0.84      0.84      0.84       583
        innate lymphoid cell       0.33      0.84      0.47       560
  intestinal epithelial cell       0.83      0.85      0.84      2201
     lymphocyte of b lineage       0.95      0.95      0.95      2563
           myeloid leukocyte       0.94      0.88      0.91      2219
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.65      0.45      0.53       243
        endo-epithelial cell       0.09      0.17      0.12         6
            endothelial cell       0.88      0.87      0.88       559
             epithelial cell       0.12      0.70      0.21        10
     epithelial cell of lung       0.94      0.79      0.86        97
                  fibroblast       0.43      0.71      0.54       233
   glandular epithelial cell       0.79      0.84      0.82      1470
                 granulocyte       0.88      0.78      0.83       146
        innate lymphoid cell       0.06      0.78      0.11         9
  intestinal epithelial cell       0.00      0.00      0.00         0
     lymphocyte of b lineage       0.97      0.89      0.93       167
           myeloid leukocyte       0.78      0.83      0.80       370
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.56      0.83      0.67       884
        endo-epithelial cell       0.00      0.01      0.01       101
            endothelial cell       0.90      0.66      0.76      2783
             epithelial cell       0.58      0.53      0.55       234
     epithelial cell of lung       0.96      0.97      0.97       246
                  fibroblast       0.22      0.76      0.35       811
   glandular epithelial cell       0.47      0.57      0.51        51
                 granulocyte       0.86      0.87      0.86       583
        innate lymphoid cell       0.35      0.85      0.50       560
  intestinal epithelial cell       0.82      0.87      0.85      2201
     lymphocyte of b lineage       0.98      0.96      0.97      2563
           myeloid leukocyte       0.96      0.90      0.93      2219
                   

Exception ignored on calling ctypes callback function: <function _log_callback at 0x327f2c5e0>
Traceback (most recent call last):
  File "/Users/rj/personal/GenePT-tools/venv3.10/lib/python3.10/site-packages/lightgbm/basic.py", line 257, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
                              precision    recall  f1-score   support

    cardiac endothelial cell       0.07      0.12      0.09        57
            contractile cell       0.84      0.67      0.75       633
        endo-epithelial cell       0.03      0.17      0.05        24
            endothelial cell       0.94      0.81      0.87      1296
             epithelial cell       0.52      0.91      0.66       473
     epithelial cell of lung       0.97      0.81      0.88       476
                  fibroblast       0.80      0.63      0.70      1300
   glandular epithelial cell       0.63      0.17      0.26       304
                 granulocyte       0.91      0.96      0.94      3840
        innate lymphoid cell       0.51      0.79      0.62       999
  intestinal epithelial cell       0.88      0.81      0.84      1244
     lymphocyte of b lineage       0.99      0.94      0.97      3362
           myeloid leukocyte       


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.64      0.46      0.53       243
        endo-epithelial cell       0.00      0.00      0.00         6
            endothelial cell       0.90      0.87      0.89       559
             epithelial cell       0.13      0.70      0.22        10
     epithelial cell of lung       0.94      0.85      0.89        97
                  fibroblast       0.45      0.69      0.55       233
   glandular epithelial cell       0.83      0.88      0.85      1470
                 granulocyte       0.87      0.82      0.84       146
        innate lymphoid cell       0.06      0.78      0.10         9
  intestinal epithelial cell       0.00      0.00      0.00         0
     lymphocyte of b lineage       0.96      0.90      0.93       167
           myeloid leukocyte       0.79      0.82      0.80       370
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.58      0.83      0.68       884
        endo-epithelial cell       0.00      0.00      0.00       101
            endothelial cell       0.90      0.68      0.78      2783
             epithelial cell       0.63      0.55      0.59       234
     epithelial cell of lung       0.95      0.96      0.96       246
                  fibroblast       0.22      0.74      0.34       811
   glandular epithelial cell       0.49      0.69      0.57        51
                 granulocyte       0.86      0.88      0.87       583
        innate lymphoid cell       0.36      0.82      0.50       560
  intestinal epithelial cell       0.83      0.86      0.84      2201
     lymphocyte of b lineage       0.98      0.95      0.97      2563
           myeloid leukocyte       0.94      0.91      0.92      2219
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.68      0.48      0.56       243
        endo-epithelial cell       0.05      0.17      0.08         6
            endothelial cell       0.91      0.88      0.89       559
             epithelial cell       0.11      0.70      0.20        10
     epithelial cell of lung       0.94      0.79      0.86        97
                  fibroblast       0.43      0.73      0.54       233
   glandular epithelial cell       0.81      0.90      0.85      1470
                 granulocyte       0.87      0.81      0.84       146
        innate lymphoid cell       0.05      0.78      0.10         9
  intestinal epithelial cell       0.00      0.00      0.00         0
     lymphocyte of b lineage       0.94      0.90      0.92       167
           myeloid leukocyte       0.79      0.80      0.79       370
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.63      0.85      0.72       884
        endo-epithelial cell       0.01      0.01      0.01       101
            endothelial cell       0.91      0.68      0.78      2783
             epithelial cell       0.63      0.56      0.59       234
     epithelial cell of lung       0.95      0.95      0.95       246
                  fibroblast       0.22      0.76      0.35       811
   glandular epithelial cell       0.48      0.63      0.54        51
                 granulocyte       0.87      0.89      0.88       583
        innate lymphoid cell       0.35      0.84      0.50       560
  intestinal epithelial cell       0.82      0.88      0.85      2201
     lymphocyte of b lineage       0.98      0.96      0.97      2563
           myeloid leukocyte       0.95      0.90      0.93      2219
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.65      0.46      0.54       243
        endo-epithelial cell       0.00      0.00      0.00         6
            endothelial cell       0.89      0.86      0.88       559
             epithelial cell       0.10      0.70      0.18        10
     epithelial cell of lung       0.94      0.81      0.87        97
                  fibroblast       0.45      0.68      0.54       233
   glandular epithelial cell       0.82      0.88      0.85      1470
                 granulocyte       0.81      0.82      0.81       146
        innate lymphoid cell       0.05      0.78      0.10         9
  intestinal epithelial cell       0.00      0.00      0.00         0
     lymphocyte of b lineage       0.95      0.89      0.92       167
           myeloid leukocyte       0.79      0.80      0.80       370
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.63      0.83      0.71       884
        endo-epithelial cell       0.01      0.01      0.01       101
            endothelial cell       0.90      0.67      0.77      2783
             epithelial cell       0.63      0.53      0.58       234
     epithelial cell of lung       0.96      0.96      0.96       246
                  fibroblast       0.22      0.77      0.35       811
   glandular epithelial cell       0.42      0.57      0.48        51
                 granulocyte       0.86      0.89      0.88       583
        innate lymphoid cell       0.35      0.83      0.49       560
  intestinal epithelial cell       0.82      0.87      0.84      2201
     lymphocyte of b lineage       0.99      0.96      0.97      2563
           myeloid leukocyte       0.95      0.90      0.92      2219
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.67      0.48      0.56       243
        endo-epithelial cell       0.00      0.00      0.00         6
            endothelial cell       0.90      0.88      0.89       559
             epithelial cell       0.09      0.70      0.16        10
     epithelial cell of lung       0.95      0.84      0.89        97
                  fibroblast       0.44      0.73      0.55       233
   glandular epithelial cell       0.83      0.91      0.87      1470
                 granulocyte       0.87      0.85      0.86       146
        innate lymphoid cell       0.05      0.78      0.09         9
  intestinal epithelial cell       0.00      0.00      0.00         0
     lymphocyte of b lineage       0.96      0.90      0.93       167
           myeloid leukocyte       0.80      0.82      0.81       370
                   


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



                              precision    recall  f1-score   support

    cardiac endothelial cell       0.00      0.00      0.00         0
            contractile cell       0.63      0.85      0.72       884
        endo-epithelial cell       0.01      0.01      0.01       101
            endothelial cell       0.91      0.69      0.78      2783
             epithelial cell       0.60      0.55      0.57       234
     epithelial cell of lung       0.96      0.95      0.95       246
                  fibroblast       0.22      0.77      0.35       811
   glandular epithelial cell       0.45      0.59      0.51        51
                 granulocyte       0.84      0.90      0.87       583
        innate lymphoid cell       0.36      0.83      0.51       560
  intestinal epithelial cell       0.81      0.86      0.84      2201
     lymphocyte of b lineage       0.98      0.95      0.97      2563
           myeloid leukocyte       0.95      0.90      0.92      2219
                   

In [22]:
results_df.embed_name.unique()

array(['tabula_sapiens_100k_scgpt_embedding',
       'tabula_sapiens_100k_genept_embedding_original_ada_text',
       'tabula_sapiens_100k_genept_embedding_original_large_3',
       'tabula_sapiens_100k_genept_embedding',
       'tabula_sapiens_100k_genept_embedding_v2',
       'tabula_sapiens_100k_genept_embedding_v3',
       'tabula_sapiens_100k_genept_embedding_cell_type_tissue_drug_pathway_openai_large'],
      dtype=object)

In [None]:
results_df.report.iloc[0]["cardiac endothelial cell"]

{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0.0}

In [23]:
comparison_file_path = data_dir / "algorithm_comparison_v3.parquet"
results_df.to_parquet(comparison_file_path)

In [None]:
results_df.report.iloc[0]

{'cardiac endothelial cell': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 0.0},
 'contractile cell': {'precision': 0.528957528957529,
  'recall': 0.5637860082304527,
  'f1-score': 0.545816733067729,
  'support': 243.0},
 'endo-epithelial cell': {'precision': 0.03937007874015748,
  'recall': 0.8333333333333334,
  'f1-score': 0.07518796992481203,
  'support': 6.0},
 'endothelial cell': {'precision': 0.9268774703557312,
  'recall': 0.8389982110912343,
  'f1-score': 0.8807511737089202,
  'support': 559.0},
 'epithelial cell': {'precision': 0.25,
  'recall': 0.7,
  'f1-score': 0.3684210526315789,
  'support': 10.0},
 'epithelial cell of lung': {'precision': 0.9882352941176471,
  'recall': 0.865979381443299,
  'f1-score': 0.9230769230769231,
  'support': 97.0},
 'fibroblast': {'precision': 0.37467700258397935,
  'recall': 0.6223175965665236,
  'f1-score': 0.46774193548387094,
  'support': 233.0},
 'glandular epithelial cell': {'precision': 0.547945205479452,
  'recall'

# Reload results
So that we can skip the training when re-doing analysis

In [25]:
results_df = pd.read_parquet(comparison_file_path)

Validate that the support counts match the test counts

In [None]:
support_counts =[
    {key:value["support"] for key,value in results_df.report.iloc[i].items() if key not in ("accuracy", "macro avg", "weighted avg") }
    for i in [1,4,7]
]
train_test_counts.merge(
    pd.DataFrame(support_counts).T,
    how="outer",
    left_index=True,
    right_index=True,
)


Unnamed: 0,count,TSP1,not_TSP1,TSP2,not_TSP2,TSP14,not_TSP14,0,1,2
adventitial cell,0,0,0,0,0,0,0,,,
cardiac endothelial cell,1088,0,1088,0,1088,57,1031,0.0,0.0,0.0
ciliated epithelial cell,0,0,0,0,0,0,0,,,
conjunctival epithelial cell,0,0,0,0,0,0,0,,,
contractile cell,7916,243,7673,884,7032,633,7283,884.0,884.0,884.0
dendritic cell,0,0,0,0,0,0,0,,,
duct epithelial cell,0,0,0,0,0,0,0,,,
ecto-epithelial cell,0,0,0,0,0,0,0,,,
endo-epithelial cell,666,6,660,101,565,24,642,101.0,101.0,101.0
endothelial cell,6019,559,5460,2783,3236,1296,4723,2783.0,2783.0,2783.0


In [23]:
",".join(str(x) for x in train_test_counts[train_test_counts["count"] != 0].not_TSP14.to_list())

'1031,7283,642,4723,1337,343,5695,1684,1957,1508,4520,5137,5276,5685,13010,9461,5059,2414'

1088,7673,660,5460,1800,722,6762,518,5651,2498,5764,8332,8195,5500,12969,13636,5228,2693+

In [None]:
results_df[results_df.test_donor == "TSP2"]

Unnamed: 0,embed_name,test_donor,model,macro_avg_f1,weighted_avg_f1,train_size,test_size,report
1,tabula_sapiens_100k_scgpt_embedding,TSP2,LightGBM,0.604268,0.753227,77289,22711,"{'accuracy': 0.7355466514024042, 'cardiac endo..."
4,tabula_sapiens_100k_genept_embedding_original_...,TSP2,LightGBM,0.508016,0.698253,77289,22711,"{'accuracy': 0.6798908018140989, 'cardiac endo..."
7,tabula_sapiens_100k_genept_embedding_original_...,TSP2,LightGBM,0.530559,0.709601,77289,22711,"{'accuracy': 0.6945973316894897, 'cardiac endo..."
10,tabula_sapiens_100k_genept_embedding,TSP2,LightGBM,0.532596,0.714685,77289,22711,"{'accuracy': 0.7002773986174101, 'cardiac endo..."
13,tabula_sapiens_100k_genept_embedding_v2,TSP2,LightGBM,0.535761,0.720282,77289,22711,"{'accuracy': 0.7044603936418475, 'cardiac endo..."
16,tabula_sapiens_100k_genept_embedding_v3,TSP2,LightGBM,0.537372,0.719568,77289,22711,"{'accuracy': 0.7022588173131963, 'cardiac endo..."


In [23]:
(results_df['embed_name']
    .str.replace('tabula_sapiens_100k_', '')
    .str.replace('_embedding', ''))

0                                                 scgpt
1                                                 scgpt
2                                                 scgpt
3                              genept_original_ada_text
4                              genept_original_ada_text
5                              genept_original_ada_text
6                               genept_original_large_3
7                               genept_original_large_3
8                               genept_original_large_3
9                                                genept
10                                               genept
11                                               genept
12                                            genept_v2
13                                            genept_v2
14                                            genept_v2
15                                            genept_v3
16                                            genept_v3
17                                            ge

In [46]:
from plotly.subplots import make_subplots

# Create subplot figure
fig = make_subplots(
    rows=1, cols=2,
    horizontal_spacing=0.15
)

# Define name mapping
name_mapping = {
    'scgpt': 'scGPT',
    'genept_original_ada_text': 'GenePT ada (NCBI only)',
    'genept_original_large_3': 'GenePT large (NCBI only)',
    'genept': 'GenePT (gpt4o-mini) w/ Aging, Pathways, Drugs',
    'genept_v2': 'GenePT (gpt4o-mini) w/ Cell type, Pathways, Drugs',
    'genept_v3': 'GenePT (gpt4o-mini) w/ Cell type',
    'genept_cell_type_tissue_drug_pathway_openai_large': 'GenePT (gpt4o) w/ Cell type, Tissue, Drug, Pathway'
}

# Clean up embedding names and apply mapping
results_df['embed_name_clean'] = (results_df['embed_name']
    .str.replace('tabula_sapiens_100k_', '')
    .str.replace('_embedding', '')
    .map(name_mapping))

# Rest of the code remains the same
for i, metric in enumerate(['macro_avg_f1', 'weighted_avg_f1'], 1):
    temp_fig = px.bar(
        results_df,
        x='test_donor',
        y=metric,
        color='embed_name_clean',
        barmode='group',
        labels={
            'embed_name_clean': 'Embedding',
            metric: metric.replace("_", " ").title(),
            'test_donor': 'Test Donor'
        }
    )
    
    for trace in temp_fig.data:
        trace.showlegend = (i == 1)
        fig.add_trace(trace, row=1, col=i)

fig.update_layout(
    height=600,
    width=1000,
    showlegend=True,
    legend_title_text='Embedding',
    legend=dict(
        yanchor="top",    
        y=0.60,           # Changed from 0.35 to 0.45 to move legend higher
        xanchor="right",
        x=1.15,
        bgcolor='rgba(250, 250, 250, 0.95)',  # Semi-transparent white background
    )
)

fig.update_xaxes(tickangle=-45, title_text='Test Donor', row=1, col=1)
fig.update_xaxes(tickangle=-45, title_text='Test Donor', row=1, col=2)

fig.update_yaxes(title_text='Macro Average F1', row=1, col=1)
fig.update_yaxes(title_text='Weighted Average F1', row=1, col=2)

fig.show()

In [28]:
# Extract cell types and metrics from the nested report dictionary
results_list = []
for _, row in results_df[results_df.model != "KNN"].iterrows():
    report = row["report"]
    # Skip the aggregate metrics
    cell_types = [
        k for k in report.keys() if k not in ["accuracy", "macro avg", "weighted avg"]
    ]
    for cell_type in cell_types:
        metrics = report[cell_type]
        results_list.append(
            {
                "cell_type": f"{row['test_donor']} {cell_type} ({metrics['support']:.0f})",
                "embed_name": row["embed_name"],
                "model": row["model"],
                "test_donor": row["test_donor"],
                "precision": metrics["precision"],
                "recall": metrics["recall"],
                "f1-score": metrics["f1-score"],
                "support": metrics["support"],
            }
        )

# Convert to DataFrame and reshape for heatmap
results_flat = pd.DataFrame(results_list)
# First create separate pivots for each metric
metrics = ["precision", "recall", "f1-score"]
pivot_dfs = []

for metric in metrics:
    pivot = results_flat.pivot_table(
        columns="cell_type", index=["embed_name", "model"], values=metric
    )
    # Add metric name to index
    pivot.index = [f"{metric} {idx[0]} {idx[1]}" for idx in pivot.index]
    pivot_dfs.append(pivot)

# Concatenate all metric pivots
results_pivot = pd.concat(pivot_dfs)

In [30]:
results_pivot.to_csv(data_dir / "algorithm_comparison_pivot_v4.csv")

In [42]:
# First filter the rows
row_mask = (
    results_pivot.index.str.contains("scgpt", case=False) | 
    results_pivot.index.str.contains("tissue", case=False)
)
filtered_rows = results_pivot[row_mask]

# Then filter the columns
col_mask = filtered_rows.columns.str.contains("TSP1 ")
filtered_result = filtered_rows.loc[:, col_mask]

filtered_result

cell_type,TSP1 cardiac endothelial cell (0),TSP1 contractile cell (243),TSP1 endo-epithelial cell (6),TSP1 endothelial cell (559),TSP1 epithelial cell (10),TSP1 epithelial cell of lung (97),TSP1 fibroblast (233),TSP1 glandular epithelial cell (1470),TSP1 granulocyte (146),TSP1 innate lymphoid cell (9),TSP1 intestinal epithelial cell (0),TSP1 lymphocyte of b lineage (167),TSP1 myeloid leukocyte (370),TSP1 other (487),TSP1 stem cell (437),TSP1 stromal cell (44),TSP1 t cell (417),TSP1 transitional epithelial cell (156)
precision tabula_sapiens_100k_genept_embedding_cell_type_tissue_drug_pathway_openai_large LightGBM,0.0,0.67052,0.0,0.89781,0.0875,0.952941,0.444737,0.828183,0.873239,0.05036,0.0,0.961538,0.801587,0.251101,0.626943,0.125,0.865979,1.0
precision tabula_sapiens_100k_scgpt_embedding LightGBM,0.0,0.528958,0.03937,0.926877,0.25,0.988235,0.374677,0.547945,0.865248,0.056075,0.0,0.986928,0.833803,0.137339,0.678571,0.0,0.86646,1.0
recall tabula_sapiens_100k_genept_embedding_cell_type_tissue_drug_pathway_openai_large LightGBM,0.0,0.477366,0.0,0.880143,0.7,0.835052,0.725322,0.911565,0.849315,0.777778,0.0,0.898204,0.818919,0.234086,0.276888,0.022727,0.604317,0.435897
recall tabula_sapiens_100k_scgpt_embedding LightGBM,0.0,0.563786,0.833333,0.838998,0.7,0.865979,0.622318,0.136054,0.835616,0.666667,0.0,0.904192,0.8,0.459959,0.217391,0.0,0.669065,0.75641
f1-score tabula_sapiens_100k_genept_embedding_cell_type_tissue_drug_pathway_openai_large LightGBM,0.0,0.557692,0.0,0.888889,0.155556,0.89011,0.551387,0.867876,0.861111,0.094595,0.0,0.928793,0.81016,0.242295,0.384127,0.038462,0.711864,0.607143
f1-score tabula_sapiens_100k_scgpt_embedding LightGBM,0.0,0.545817,0.075188,0.880751,0.368421,0.923077,0.467742,0.217984,0.850174,0.103448,0.0,0.94375,0.816552,0.21152,0.329289,0.0,0.755074,0.861314


In [49]:
filtered_rows.to_csv(data_dir / "algorithm_comparison_pivot_v4_filtered.csv")