In [None]:
# Core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# ML libraries  
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Single-cell libraries
import anndata as ad
import scanpy as sc

# Modlyn and LaminDB
import modlyn as mn
import lamindb as ln

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Setup
sns.set_theme()
%config InlineBackend.figure_formats = ['svg']

# Lamin tracking (keeping from original notebook)
project = ln.Project(name="ArrayLoader-Validation")
project.save()
ln.track(project="ArrayLoader-Validation")
run = ln.track()


In [None]:
# Fetch the benchmark artifact from the `laminlabs/arrayloader-benchmarks`
# LaminDB instance and load it into memory
print("Loading dataset from arrayloader-benchmarks...")
artifact = ln.Artifact.using("laminlabs/arrayloader-benchmarks").get("RymV9PfXDGDbM9ek0000")
adata = artifact.load()

# Number of cells per cell line, computed once and reused for the filter below
cells_per_line = adata.obs["cell_line"].value_counts()

print(f"Loaded: {adata}")
print(f"Cell lines: {cells_per_line}")

print("\nPreprocessing...")
# Keep only cell lines represented by at least `min_cells` cells
min_cells = 10
keep_lines = cells_per_line.loc[cells_per_line >= min_cells].index
is_kept = adata.obs["cell_line"].isin(keep_lines)
adata = adata[is_kept].copy()

# log(1 + x) transform of the expression matrix (scanpy updates `adata` itself)
sc.pp.log1p(adata)

print(f"Final shape: {adata.shape}")
print(f"Cell lines: {adata.obs['cell_line'].nunique()}")


In [None]:
# Modlyn logistic-regression model predicting `cell_line`
modlyn_model = mn.models.SimpleLogReg(
    adata=adata,
    label_column="cell_line",
    learning_rate=1e-2,
    weight_decay=0.3,
)

# Positional 80/20 split: the first 80% of observations train the model,
# the remaining 20% are used for validation.
# NOTE(review): the split is not shuffled — if observations are ordered by
# cell line, the validation slice may contain classes absent from training.
# Confirm the ordering of `adata.obs` before trusting validation metrics.
split_at = int(0.8 * adata.n_obs)
loader_options = {"batch_size": 512, "num_workers": 0}

print("Training model...")
modlyn_model.fit(
    adata_train=adata[:split_at],
    adata_val=adata[split_at:],
    train_dataloader_kwargs=loader_options,
    max_epochs=100,
)
print("Training complete!")

# Learned weights as a DataFrame; its index is reported as the class list
df_modlyn = modlyn_model.get_weights()
print(f"Modlyn results shape: {df_modlyn.shape}")
print(f"Classes: {df_modlyn.index.tolist()}")
df_modlyn.head()


In [None]:
# Training diagnostics for the modlyn model fitted in the previous cell
print("Creating training history visualization...")

# Delegate the loss plot to the model's own plotting helper
modlyn_model.plot_losses()


In [None]:
# Method 2: scikit-learn LogisticRegression, used as the reference
# implementation for the weight comparison.

# Densify sparse matrices; array inputs without `.toarray` pass through as-is
X = adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X
le = LabelEncoder()
y = le.fit_transform(adata.obs["cell_line"])

# Same positional 80/20 split as the modlyn cell so both models see the same rows
n_train = int(0.8 * adata.n_obs)
X_train, X_val = X[:n_train], X[n_train:]
y_train, y_val = y[:n_train], y[n_train:]

print(f"Training data: {X_train.shape}")

# One-vs-rest, which the original author chose to mirror the modlyn model.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (the suggested replacement is wrapping the estimator in OneVsRestClassifier);
# deprecation warnings are silenced at the top of this notebook, so check the
# installed version before upgrading.
sklearn_model = LogisticRegression(
    max_iter=1000,
    multi_class='ovr',
    solver='lbfgs',
    random_state=42
)
sklearn_model.fit(X_train, y_train)

# Label the coefficient rows with the classes the model was actually fitted on.
# `le.classes_` lists every label in the full dataset, but the positional
# training slice may not contain all of them; indexing by `le.classes_` would
# then fail with a shape mismatch.
fitted_labels = le.inverse_transform(sklearn_model.classes_)
if sklearn_model.coef_.shape[0] == 1 and len(fitted_labels) == 2:
    # Binary problem: scikit-learn stores a single coefficient row, which
    # describes the second (positive) class.
    fitted_labels = fitted_labels[1:]

df_sklearn = pd.DataFrame(
    sklearn_model.coef_,
    columns=adata.var_names,
    index=fitted_labels,
)
df_sklearn.attrs["method_name"] = "sklearn_logreg"

print(f"Sklearn results shape: {df_sklearn.shape}")
print(f"Classes: {df_sklearn.index.tolist()}")


In [None]:
# Compare the modlyn and sklearn weight tables with modlyn's evaluation helper
evaluator = mn.eval.CompareScores(
    dataframes=[df_modlyn, df_sklearn],
    n_top_values=[50, 100, 200],
)

# Weight-correlation figure plus the table of correlations behind it
print("Creating weight correlation visualization...")
fig, corr_df = evaluator.plot_weight_correlation(figsize=(12, 6))

# Header and the first ten rows of the correlation table
print("\nDetailed correlation results:", corr_df.head(10), sep="\n")

# Average of the `correlation` column, reported as a percentage
mean_correlation = corr_df["correlation"].mean()
print(f"\nFinal validation: {mean_correlation:.1%} correlation achieved!")


In [None]:
# Manual per-cell-line comparison of modlyn vs. sklearn weights, followed by a
# 2x2 summary figure (correlation histogram, weight scatter, top-gene overlap,
# training accuracy).
cell_line_categories = df_modlyn.index

# Correlations must pair the SAME gene from both methods. When the two tables
# already share identical columns they are used unchanged; otherwise both are
# restricted to the shared genes, in df_modlyn's order, so that weights are
# matched by gene name rather than by column position.
if df_modlyn.columns.equals(df_sklearn.columns):
    modlyn_aligned, sklearn_aligned = df_modlyn, df_sklearn
else:
    shared_genes = df_modlyn.columns.intersection(df_sklearn.columns)
    modlyn_aligned = df_modlyn.loc[:, shared_genes]
    sklearn_aligned = df_sklearn.loc[:, shared_genes]

correlations = []
comparison_data = []

for cell_line in cell_line_categories:
    # Only cell lines present in both tables can be compared
    if cell_line not in df_sklearn.index:
        continue

    modlyn_weights = modlyn_aligned.loc[cell_line].values
    sklearn_weights = sklearn_aligned.loc[cell_line].values

    # Pearson correlation between the two weight vectors
    correlation = np.corrcoef(modlyn_weights, sklearn_weights)[0, 1]
    correlations.append(correlation)

    # Largest-magnitude gene of each method, taken from its own full table
    modlyn_abs = df_modlyn.loc[cell_line].abs()
    sklearn_abs = df_sklearn.loc[cell_line].abs()
    comparison_data.append({
        "cell_line": cell_line,
        "correlation": correlation,
        "modlyn_top_gene": modlyn_abs.idxmax(),
        "sklearn_top_gene": sklearn_abs.idxmax(),
        "modlyn_top_weight": modlyn_abs.max(),
        "sklearn_top_weight": sklearn_abs.max(),
    })

# Without any shared cell line the summary below cannot be computed; fail with
# an explicit message instead of an opaque KeyError on the empty table.
if not correlations:
    raise ValueError(
        "No cell lines are shared between df_modlyn and df_sklearn; nothing to compare."
    )

comparison_df = pd.DataFrame(comparison_data)
print(f"\nWeight correlations between methods:")
print(comparison_df[['cell_line', 'correlation', 'modlyn_top_gene', 'sklearn_top_gene']])

corr_mean = np.mean(correlations)
print(f"\nMean correlation: {corr_mean:.4f}")
print(f"Min correlation: {np.min(correlations):.4f}")
print(f"Max correlation: {np.max(correlations):.4f}")

# Count cell lines whose weights are (almost) perfectly correlated
identical_threshold = 0.99
identical_count = sum(1 for corr in correlations if corr > identical_threshold)
print(f"\nResults with correlation > {identical_threshold}: {identical_count}/{len(correlations)}")

if identical_count == len(correlations):
    print("SUCCESS: All results are essentially identical!")
elif corr_mean > 0.95:
    print("Results are highly similar but not identical - may need hyperparameter tuning")
else:
    print("Results differ significantly - investigation needed")

# ---- 2x2 summary figure ----
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Top-left: distribution of per-cell-line correlations with the mean marked
axes[0, 0].hist(correlations, bins=20, alpha=0.7, edgecolor='black')
axes[0, 0].axvline(corr_mean, color='red', linestyle='--',
                   label=f'Mean: {corr_mean:.3f}')
axes[0, 0].set_xlabel('Weight Correlation')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('Modlyn vs Sklearn Weight Correlations')
axes[0, 0].legend()

# The two detail panels use the first cell line of the modlyn table
first_cell_line = cell_line_categories[0]
top_n = 10
if first_cell_line in df_sklearn.index:
    # Top-right: gene-matched weight scatter for that cell line
    modlyn_w = modlyn_aligned.loc[first_cell_line].values
    sklearn_w = sklearn_aligned.loc[first_cell_line].values

    axes[0, 1].scatter(modlyn_w, sklearn_w, alpha=0.6, s=10)
    # Dashed guide joins (min, min) to (max, max) of the plotted data; this is
    # the diagonal of the data's bounding box, not the y = x identity line.
    axes[0, 1].plot([modlyn_w.min(), modlyn_w.max()],
                    [sklearn_w.min(), sklearn_w.max()], 'r--', alpha=0.8)
    axes[0, 1].set_xlabel('Modlyn Weights')
    axes[0, 1].set_ylabel('Sklearn Weights')
    axes[0, 1].set_title(f'Weight Comparison: {first_cell_line}')

    # Bottom-left: overlap of the `top_n` largest-magnitude genes per method
    modlyn_top_genes = df_modlyn.loc[first_cell_line].abs().nlargest(top_n).index.tolist()
    sklearn_top_genes = df_sklearn.loc[first_cell_line].abs().nlargest(top_n).index.tolist()

    overlap = len(set(modlyn_top_genes) & set(sklearn_top_genes))
    axes[1, 0].bar(['Modlyn Only', 'Overlap', 'Sklearn Only'],
                   [top_n - overlap, overlap, top_n - overlap])
    axes[1, 0].set_title(f'Top {top_n} Gene Overlap: {first_cell_line}')
    axes[1, 0].set_ylabel('Gene Count')

# Bottom-right: accuracy of both models on the training split
y_train_pred_sklearn = sklearn_model.predict(X_train)
acc_sklearn = (y_train_pred_sklearn == y_train).mean()

# NOTE(review): comparing argmax indices with `y_train` assumes the modlyn
# model orders its output classes the same way as the LabelEncoder — confirm.
with torch.no_grad():
    modlyn_pred = modlyn_model(torch.tensor(X_train, dtype=torch.float32))
    y_train_pred_modlyn = modlyn_pred.argmax(dim=1).numpy()
    acc_modlyn = (y_train_pred_modlyn == y_train).mean()

methods = ['Modlyn', 'Sklearn']
axes[1, 1].bar(methods, [acc_modlyn, acc_sklearn])
axes[1, 1].set_title('Training Accuracy Comparison')
axes[1, 1].set_ylabel('Accuracy')
axes[1, 1].set_ylim(0, 1)

plt.tight_layout()
plt.show()

print(f"\nTraining Accuracies:")
print(f"Modlyn: {acc_modlyn:.4f}")
print(f"Sklearn: {acc_sklearn:.4f}")


In [None]:
## Compare with Scanpy methods
def _scanpy_score_table(method, key, method_name):
    """Rank genes per cell line with scanpy and return a group x gene score table.

    Runs ``sc.tl.rank_genes_groups`` on the notebook-level ``adata`` grouped by
    ``cell_line``, storing results under ``key``; reads them back as a long
    DataFrame; pivots ``scores`` into one row per group and one column per
    gene name; and tags the table with ``method_name`` in ``attrs``.
    """
    sc.tl.rank_genes_groups(adata, 'cell_line', method=method, key_added=key)
    ranked = sc.get.rank_genes_groups_df(adata, group=None, key=key)
    table = ranked.pivot(index='group', columns='names', values='scores')
    table.attrs["method_name"] = method_name
    return table


# Scanpy logistic regression
df_scanpy_logreg = _scanpy_score_table('logreg', 'sc_logreg', "scanpy_logreg")

# Scanpy Wilcoxon
df_scanpy_wilcoxon = _scanpy_score_table('wilcoxon', 'sc_wilcoxon', "scanpy_wilcoxon")

print("Scanpy methods complete")


In [None]:
# Cross-method comparison with modlyn.eval: modlyn weights against both
# scanpy score tables
compare = mn.eval.CompareScores(
    dataframes=[df_modlyn, df_scanpy_logreg, df_scanpy_wilcoxon],
)

# Jaccard comparison: compute first, then plot
compare.compute_jaccard_comparison()
compare.plot_jaccard_comparison()

# Heatmap view of the same three tables
compare.plot_heatmaps()

In [None]:
# Close out the LaminDB tracking started with `ln.track` in the setup cell
ln.finish()