# Import packages and data 

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.stats
import anndata
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.axes._axes import _log as matplotlib_axes_logger
from scipy import sparse
# Raise the threshold of matplotlib's axes logger so only ERROR-level (and above) messages are emitted.
matplotlib_axes_logger.setLevel('ERROR')

In [None]:
# Scanpy verbosity scale: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 1

# High-resolution figures (on screen and when saved) so annotations stay legible.
sc.settings.set_figure_params(
    dpi=120,
    dpi_save=1000,
)

# Print the package versions in use for this run.
sc.logging.print_versions()

In [None]:
# Load the input AnnData object from disk.
adata = sc.read('raw_data.h5ad')

In [None]:
# Display the AnnData summary to check what was loaded.
adata

# Pre-process and add dimensionality reductions

In [None]:
# Display the AnnData summary again before any transformation is applied.
adata

In [None]:
# Normalise the counts per cell; no target_sum is passed, so scanpy's default applies.
sc.pp.normalize_total(adata)

In [None]:
# Log-transform the normalised counts (natural log of 1 + x).
sc.pp.log1p(adata)

# Snapshot the normalised, log-transformed matrix in adata.raw before the
# scaling and clipping steps further down overwrite adata.X. The differential
# expression section describes the log-normalised data as "saved as adata.raw",
# but nothing else in this notebook assigns it. A copy is stored so that later
# in-place changes to adata.X cannot leak into the snapshot.
adata.raw = adata.copy()

In [None]:
# find variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) 

In [None]:
# scale the data 
sc.pp.scale(adata, max_value=10)

In [None]:
# scale the negative values in adata.X (which have been normalised, logged and scaled) to zero 
adata.X = np.where(adata.X < 0, 0, adata.X)

In [None]:
# run pca
sc.tl.pca(adata, n_comps=50)

In [None]:
# have a look at pc's in order to choose number of pcs that will be used downstream 
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
# run snn neighbourhood graph
sc.pp.neighbors(adata, n_neighbors=40, n_pcs=10) 

In [None]:
# run umap
sc.tl.umap(adata) 

In [None]:
sc.pl.umap(adata, color=['cell.labels', 'orig.ident'], size=3) 

In [None]:
# run fdg  
sc.tl.draw_graph(adata, init_pos='X_pca', layout='fa')

In [None]:
# run diffmap
sc.tl.diffmap(adata, n_comps=10)

In [None]:
## PCA
sc.pl.pca(adata, color='cell.labels', save='_pre-harmony_cell_labels_legoff.pdf', show=False)

## UMAP
sc.pl.umap(adata, color='cell.labels', save='_pre-harmony_cell_labels_legoff.pdf', show=False, size=3) 
sc.pl.umap(adata, color='cell.labels', save='_pre-harmony_cell_labels_legon.pdf', show=False, size=3, legend_loc="on data", legend_fontsize=6)

## FDG
sc.pl.draw_graph(adata, color='cell.labels', layout='fa', show=False, save='_pre-harmony_cell_labels_legoff.pdf')
sc.pl.draw_graph(adata, color='cell.labels', legend_loc='on data', show=False, save='_pre-harmony_cell_labels_legon.pdf', legend_fontsize=6)

## DIFFUSION MAP
sc.pl.scatter(adata, basis='diffmap', show=False, save='_pre-harmony_cell_labels_legoff.pdf', color='cell.labels') 

In [None]:
# plot pre-harmony umaps coloured by different variables

## UMAP by cell labels
sc.pl.umap(adata, color='cell.labels', save='_pre-harmony_cell_labels_legoff.pdf', size=3, show=False) 
sc.pl.umap(adata, color='cell.labels', save='_pre-harmony_cell_labels_legon.pdf', size=3, show=False, legend_loc="on data", legend_fontsize=6)

## UMAP by sequencing type 
sc.pl.umap(adata, color='sequencing.type', save='_pre-harmony_seq_type_legoff.pdf', size=3, show=False) 

## UMAP by sample 
sc.pl.umap(adata, color='orig.ident', save='_pre-harmony_sample_type_legoff.pdf', size=3, show=False) 

# Run harmony batch correction

In [None]:
# Harmony inputs: the per-cell sample label (used as the batch variable) and
# the PCA embedding. Both names are passed into the R cell below via `-i`,
# so they must keep these exact names.
batch = adata.obs["orig.ident"]
pca = adata.obsm["X_pca"]

In [None]:
# Load the rpy2 IPython extension; it provides the %%R cell magic used to run harmony in R.
%load_ext rpy2.ipython

In [None]:
%%R -i pca -i batch -o hem

# Inputs from Python: `pca` and `batch`. Output returned to Python: `hem`.
library(harmony)
# NOTE(review): nothing in this cell appears to use magrittr - confirm whether the import is still needed.
library(magrittr)

# Run harmony directly on the supplied PCA embedding (do_pca=FALSE: no PCA is
# recomputed), with `batch` as the grouping variable and theta=3.
hem = HarmonyMatrix(pca, batch, theta=3, verbose=TRUE, do_pca=FALSE)
# Convert the result to a data.frame for the hand-off back to Python.
hem = data.frame(hem)

In [None]:
# Keep the uncorrected PCA under "X_orig_pca", then replace "X_pca" with the
# harmony-corrected embedding. The order matters: the original has to be
# stashed before it is overwritten.
adata.obsm["X_orig_pca"] = adata.obsm["X_pca"]
adata.obsm["X_pca"] = hem.values

# Add dimensional reduction post-harmony

In [None]:
# run snn neighbourhood graph
sc.pp.neighbors(adata, n_neighbors=40, n_pcs=10) 

In [None]:
# run umap
sc.tl.umap(adata) 

In [None]:
#plot umap post harmony
sc.pl.umap(adata, color=['cell.labels', 'orig.ident'], size=3)

In [None]:
# run fdg 
sc.tl.draw_graph(adata, init_pos='X_pca', layout='fa')

In [None]:
# run diffmap
sc.tl.diffmap(adata, n_comps=10)

In [None]:
## PCA
sc.pl.pca(adata, color='cell.labels', save='_post-harmony_cell_labels_legoff.pdf', show=False)

## UMAP
sc.pl.umap(adata, color='cell.labels', save='_post-harmony_cell_labels_legoff.pdf', show=False, size=3) 
sc.pl.umap(adata, color='cell.labels', save='_post-harmony_cell_labels_legon.pdf', show=False, size=3, legend_loc="on data", legend_fontsize=6)

## FDG
sc.pl.draw_graph(adata, color='cell.labels', layout='fa', show=False, save='_post-harmony_cell_labels_legoff.pdf')
sc.pl.draw_graph(adata, color='cell.labels', legend_loc='on data', show=False, save='_post-harmony_cell_labels_legon.pdf', legend_fontsize=6)

## DIFFUSION MAP
sc.pl.scatter(adata, basis='diffmap', show=False, save='_post-harmony_cell_labels_legoff.pdf', color='cell.labels') 

In [None]:
# plot post-harmony umaps coloured by different variables

## UMAP by cell labels
sc.pl.umap(adata, color='cell.labels', save='_post-harmony_cell_labels_legoff.pdf', size=3, show=False) 
sc.pl.umap(adata, color='cell.labels', save='_post-harmony_cell_labels_legon.pdf', show=False, size=3, legend_loc="on data", legend_fontsize=6)

## UMAP by sequencing type 
sc.pl.umap(adata, color='sequencing.type', save='_post-harmony_seq_type_legoff.pdf', show=False, size=3) 

## UMAP by sample 
sc.pl.umap(adata, color='orig.ident', save='_post-harmony_sample_type_legoff.pdf', show=False, size=3) 

# Perform leiden clustering

In [None]:
sc.pl.umap(adata, color=['cell.labels', 'orig.ident'], legend_loc="on data", size=3, legend_fontsize=5)

In [None]:
sc.tl.leiden(adata, resolution=2, random_state=26, n_iterations=-1)

sc.pl.umap(adata, color='leiden', legend_loc="on data", size=3, legend_fontsize=15, title="leiden at res 2")
leiden_res_2 = len(adata.obs["leiden"].unique())

# plot a dendogram to show relationships between clusters
sc.tl.dendrogram(adata, groupby='leiden', n_pcs=20, use_rep='X_pca') 
sc.pl.dendrogram(adata, groupby='leiden', save='dendogram_for_initial_leiden_clusters.pdf')

leiden_res_2

## Restrict leiden clustering to subset

In [None]:
# Sub-cluster only cluster "21" of the existing "leiden" partition. When
# restrict_to is used, scanpy stores the refined labels under "leiden_R"
# instead of overwriting "leiden"; the key is spelled out here so that the
# destination column is explicit.
sc.tl.leiden(
    adata,
    resolution=0.5,
    random_state=26,
    n_iterations=-1,
    restrict_to=("leiden", ["21"]),
    key_added="leiden_R",
)
# NOTE(review): the cluster-count, DEG and annotation cells further down still
# group by "leiden"; point them at "leiden_R" if the sub-clusters should feed
# into those steps.
# Plot the refined labels; colouring by "leiden" would only redraw the
# partition from before the sub-clustering.
sc.pl.umap(adata, color="leiden_R", legend_loc="on data", size=3, legend_fontsize=15, title="")

In [None]:
# Raise pandas' display limits so larger tables print in full.
pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# Count cells for each (leiden cluster, existing cell label) pair.
cell_numbers = (
    adata.obs
    .groupby(["leiden", "cell.labels"])
    .apply(len)
)
cell_numbers

# Calculate differentially expressed genes

In [None]:
# run degs on normalised and logged data (saved as adata.raw), not on additionally scaled data (saved as adata.X).
# calculate degs using wilcoxon rank sum test with benjamini-hochberg correction. 
#based on ln transformed count data 
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon', use_raw=False, log_transformed=True)

In [None]:
# filter the degs for those which are expressed in at least 25% of cells in cluster. log2fc of the ln-transformed
# data will be shown.
sc.tl.filter_rank_genes_groups(adata, min_in_group_fraction=0.25, min_fold_change=0.25, use_raw=False)

In [None]:
# save df for unfiltered degs
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
degs_by_cluster = pd.DataFrame({group + '_' + key[:1]: result[key][group]
    for group in groups for key in ['names', 'logfoldchanges']})

# the degs_by_cluster df shows the log2 fold change for each gene ordered by z-score underlying the computation 
# of a p-value for each gene for each group
degs_by_cluster.to_csv("degs_by_cluster.csv")
degs_by_cluster[:10]

In [None]:
# save df for filtered degs
result = adata.uns['rank_genes_groups_filtered']
groups = result['names'].dtype.names
degs_by_cluster_filtered = pd.DataFrame({group + '_' + key[:1]: result[key][group]
    for group in groups for key in ['names', 'logfoldchanges']})
# the degs_by_cluster df shows the log2 fold change for each gene ordered by z-score underlying the computation 
# of a p-value for each gene for each group
degs_by_cluster_filtered.to_csv("degs_by_cluster_filtered.csv")
degs_by_cluster_filtered[:10]

# Add in annotations

In [None]:
# read in annotation.csv which contains: 'Cluster' column for leiden clusters and 'Category' column for annotations
annotation = pd.read_csv("annot.csv", index_col=False)

In [None]:
# convert the clusters column to a string (so that it can be input into the replace function, which acts on strings)
annotation.Cluster = annotation.Cluster.apply(str)
# make a temp metadata column for new cell labels from the leiden clusters
adata.obs["new.cell.labels"] = adata.obs["leiden"]

In [None]:
# save the clusters/categories columns as named lists 
clusters = annotation["Cluster"].tolist()
categories = annotation["Category"].tolist()
# replace all values in new.cell.labels column matchin clusters values with categories values
adata.obs["new.cell.labels"] = adata.obs["new.cell.labels"].replace(clusters, categories)

In [None]:
sc.pl.umap(adata, color=['leiden', 'cell.labels'], legend_loc="on data", size=20, legend_fontsize=5)

# Save the data

In [None]:
# Write the AnnData object to disk, including the embeddings, leiden clusters and DEG results computed above.
adata.write('data.h5ad')