# Import libraries

In [None]:
import numpy as np
import scanpy as sc
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway
import phate, scprep, session_info
from statsmodels.stats.multicomp import MultiComparison

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load data

In [None]:
# Read the AnnData object from disk and display its summary as the cell output.
h5ad_path = "../_m/at2.hlca_core.dataset.h5ad"
adata = sc.read_h5ad(h5ad_path)
adata

In [None]:
# Densify the expression matrix into a DataFrame indexed by `adata.obs` and
# labelled by `adata.var.feature_name`.
# NOTE(review): wrapping `feature_name` in a list appears to produce tuple-style
# column labels; later cells strip "('...',)" from them, so the wrapper is kept.
hlca_counts = pd.DataFrame(
    data=adata.X.toarray(),
    index=adata.obs.index,
    columns=[adata.var.feature_name],
)
matrix_shape = hlca_counts.shape
print(matrix_shape)
hlca_counts.head(2)

In [None]:
# Plot the library-size distribution before any filtering or normalization.
scprep.plot.plot_library_size(hlca_counts)

# Preprocessing: Filtering, Normalizing, and Transforming

## Filtering

These data, however, have already undergone extensive filtering and QC. Based on the core model, there are no cells with high mitochondrial percentages and only a few outliers, so we skip the outlier-removal
step, which would eliminate very few cells (< 1%).

In [None]:
# Drop rarely detected genes; the thresholds are named so they are easy to
# find and adjust. The resulting matrix shape is shown as the cell output.
expression_cutoff = 0
min_expressing_cells = 6
hlca_counts = scprep.filter.filter_rare_genes(
    hlca_counts,
    cutoff=expression_cutoff,
    min_cells=min_expressing_cells,
)
hlca_counts.shape

## Normalization

In [None]:
# Library-size normalize the filtered matrix (scprep defaults are used) and
# preview the first two rows.
hlca_counts = scprep.normalize.library_size_normalize(hlca_counts)
hlca_counts.head(2)

## Transformation

In [None]:
# Square-root transform the normalized matrix; `hlca_counts` is overwritten,
# so all downstream cells see transformed values rather than raw counts.
hlca_counts = scprep.transform.sqrt(hlca_counts)

# PHATE

In [None]:
# Build the PHATE operator and embed the transformed expression matrix.
# `phate_operator` is reused later for k-means clustering, and `Y_phate`
# holds the embedding coordinates used by every plot below.
# NOTE(review): no random_state is passed, so the embedding may not be
# exactly reproducible between runs — confirm whether a seed is wanted.
phate_operator = phate.PHATE(n_jobs=-2)
Y_phate = phate_operator.fit_transform(hlca_counts)

    Calculated affinities in 0.21 seconds.


  Calculated graph and diffusion operator in 7.17 seconds.


  Calculating optimal t...


    Automatically selected t = 15


  Calculated optimal t in 1.54 seconds.


  Calculating diffusion potential...


  Calculated diffusion potential in 0.66 seconds.


  Calculating metric MDS...


  Calculated metric MDS in 3.11 seconds.


Calculated PHATE in 12.49 seconds.


In [None]:
# Draw the uncolored PHATE embedding and let scprep write it to PDF.
scprep.plot.scatter2d(
    Y_phate,
    figsize=(5,5),
    ticks=None,
    label_prefix="PHATE",
    filename="at2_phate_clustering.pdf",
    dpi=300,
)

# Plotting

## Angiotensin II receptor type 2 (AGTR2)

In [None]:
# Color the PHATE embedding by AGTR2 expression.
fig, ax = plt.subplots(1, figsize=(4,4))
scprep.plot.scatter2d(Y_phate, ax=ax, c=hlca_counts['AGTR2'],
                      title='AGTR2', ticks=False, label_prefix='PHATE')
# Apply the layout first and save afterwards: passing `filename` to
# scatter2d would write the PDF before tight_layout() has run, so the saved
# file would not reflect the adjusted layout.
fig.tight_layout()
fig.savefig("at2_phate_clustering.AGTR2.pdf", dpi=300)

## KDE plot of AT2

In [None]:
# Filled 2-D kernel density estimate of the cells in PHATE space.
fig, ax = plt.subplots(1, figsize=(4,4))
# `levels` is the current seaborn keyword for the number of contour levels
# (`n_levels` is the legacy spelling).
kde_plot = sns.kdeplot(x=Y_phate[:, 0], y=Y_phate[:, 1], fill=True, ax=ax, zorder=0,
                       levels=100, cmap="inferno")

# Hide tick marks; the PHATE axes are labelled by name only.
ax.set_xticks([]); ax.set_yticks([])
ax.set_xlabel('PHATE 1', fontsize=18); ax.set_ylabel('PHATE 2', fontsize=18)
ax.set_title('KDE - AT2', fontsize=20)
fig.tight_layout()

# `fig` already owns `ax`, so it is saved directly rather than re-fetched
# from the axes object.
fig.savefig("at2_phate_kde.pdf")

## Clustering data using PHATE

In [None]:
# Cluster the cells with k-means on the fitted PHATE operator (6 clusters).
# NOTE(review): no random_state is passed, so cluster numbering may change
# between runs — confirm whether a seed is wanted.
clusters = phate.cluster.kmeans(phate_operator, n_clusters=6)

# Create the figure explicitly so that tight_layout() and savefig() act on
# this plot; previously `fig` still referred to the figure from an earlier
# cell, so the layout call never touched the cluster plot.
fig, ax = plt.subplots(1, figsize=(4.3,4))
scprep.plot.scatter2d(Y_phate, ax=ax, c=clusters, s=1, ticks=None, label_prefix='PHATE',
                      legend_anchor=(1,1), fontsize=12, title='PHATE clusters')

fig.tight_layout()
fig.savefig("at2_phate_clustering.kmeans.pdf", dpi=300)

# Comparison of PHATE clusters and AGTR2 expression

## Subset data

In [None]:
# Recover plain gene names from the DataFrame's column labels. The labels
# may be 1-tuples (they stringify as "('GENE',)"), so take the first element
# directly instead of stripping characters from the repr, which breaks on
# gene names that contain quote characters.
gene_list = [
    str(label[0]) if isinstance(label, tuple) else str(label)
    for label in hlca_counts.columns
]

# Restrict `adata` to the retained cells and genes. Copy the subset so that
# the assignments below modify a real object rather than a view of the
# original AnnData.
adata = adata[hlca_counts.index, adata.var.feature_name.isin(gene_list)].copy()
adata.obsm["X_phate"] = Y_phate
adata.obs["phate_clusters"] = ["Group_"+str(clu) for clu in clusters]
adata

## Qualitative review

In [None]:
# Attach the cluster label and donor id to the expression table.
hlca_counts.loc[:, "PHATE"] = clusters
hlca_counts.loc[:, "patient"] = adata.obs["donor_id"]

# Move the three columns of interest into polars and restore plain column
# names (the pandas labels arrive as "('NAME',)" strings).
agtr2_subset = hlca_counts.loc[:, ["AGTR2", "PHATE", "patient"]]
plain_names = {
    "('AGTR2',)": "AGTR2",
    "('PHATE',)": "PHATE",
    "('patient',)": "patient",
}
df = pl.from_pandas(agtr2_subset).rename(plain_names)

# Per-cluster summary of AGTR2, shown as the cell output.
agtr2_summaries = [
    pl.sum("AGTR2").name.suffix("_sum"),
    pl.mean("AGTR2").name.suffix("_mean"),
    pl.count("AGTR2").name.suffix("_count"),
    pl.std("AGTR2").name.suffix("_std"),
]
df.group_by("PHATE").agg(agtr2_summaries).sort("PHATE")

In [None]:
# Average AGTR2 per (cluster, donor) so each donor contributes one point
# per cluster.
dx = df.group_by(["PHATE", "patient"]).agg(pl.mean("AGTR2"))

# Create the figure explicitly so that tight_layout() and savefig() act on
# this plot; previously tight_layout() ran on a `fig` left over from an
# earlier cell, before the boxplot's own figure was fetched.
fig, ax = plt.subplots()
sns.boxplot(data=dx, x="PHATE", y="AGTR2", fill=False, width=0.5, ax=ax)
box_plot = sns.stripplot(data=dx, x="PHATE", y="AGTR2", ax=ax)
ax.set_xlabel('PHATE Clusters', fontsize=12)
ax.set_ylabel('Normalized Expression (AGTR2)', fontsize=12)
fig.tight_layout()

fig.savefig("at2_phate.boxplot_AGTR2.pdf")

## Statistical comparison

In [None]:
# Collect the per-donor mean AGTR2 values for each of the six clusters
# (labels 0-5), then run a one-way ANOVA across them.
cluster_ids = range(6)
c0, c1, c2, c3, c4, c5 = (
    dx.filter(pl.col("PHATE") == cluster_id).to_pandas().AGTR2.values
    for cluster_id in cluster_ids
)

f_oneway(c0, c1, c2, c3, c4, c5)

In [None]:
# Pairwise Tukey HSD comparison of AGTR2 between PHATE clusters.
# NOTE(review): this uses every row of `hlca_counts`, whereas the ANOVA
# above uses per-donor means (`dx`) — confirm the difference is intended.
agtr2_values = hlca_counts.loc[:, "AGTR2"]
cluster_labels = hlca_counts.loc[:, "PHATE"]
model = MultiComparison(agtr2_values, cluster_labels)
res_mod = model.tukeyhsd()
print(res_mod)

In [None]:
# Write the Tukey HSD table to a TSV file. The table is read through the
# public `summary()` accessor rather than the private `_results_table`
# attribute; its first row holds the column headers.
tukey_table = res_mod.summary().data
pd.DataFrame(data=tukey_table[1:], columns=tukey_table[0])\
  .to_csv("at2_phate_clustering.tukeyhsd.tsv",
           sep="\t", index=False)

## Choose a root cell for diffusion pseudotime

In [None]:
# Use the first cell whose `leiden` label is '11' as the pseudotime root.
# NOTE(review): assumes `adata.obs` has a `leiden` column containing '11';
# indexing [0] fails if no cell matches — confirm against the dataset.
root_candidates = np.flatnonzero(adata.obs["leiden"] == '11')
adata.uns["iroot"] = root_candidates[0]

## Compute diffusion pseudotime

In [None]:
# Compute diffusion pseudotime from the root set in `adata.uns["iroot"]`,
# then plot clusters, one gene, and pseudotime on the graph layout.
# NOTE(review): no neighbors / diffmap / draw_graph computation is visible
# in this notebook — presumably they come with the loaded file; confirm.
sc.tl.dpt(adata)
pseudotime_panels = ["phate_clusters", "ENSG00000180772", "dpt_pseudotime"]
sc.pl.draw_graph(
    adata,
    color=pseudotime_panels,
    legend_loc="on data",
    save=".at2_phate.diff_pseudotime.pdf",
)

# Session information

In [None]:
# Record the package versions used in this session for reproducibility.
session_info.show()