#### Fig4E - CC score
#### University of Cambridge
#### Mariana Quiroga Londoño
#### 20-02-20

In [1]:
#SETTING ENVIRONMENT 
import rpy2
#import tzlocal
#%load_ext rpy2.ipython

import numpy as np
import pandas as pd
import scanpy as sc

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
#sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80, dpi_save=1000)

  numba.core.entrypoints.init_all()



In [2]:
from matplotlib.colors import LinearSegmentedColormap
cmap = LinearSegmentedColormap.from_list(name='gene_cmap', colors=['lightgrey', 'thistle', 'red', 'darkred']) 

#### Incorporate new annotations

In [3]:
#To calculate the cc score, I will need to use the adata that contains the cc genes we removed to generate the landscape! So it's not the result of h5da_part4, but part 3 instead.
#The adata from part 4 that doesn't include cc genes has a size of 35273 cells × 24153 genes.
#Whereas the adata from part 4 which has the same amount of cells, but includes cc genes has a size of: 35273 cells × 24247 genes
adata = sc.read("./h5da_part3/020221_mq224_part3_mRNA_Progenitors_only_postHarmony_DEG_FBMS2-3-H3-F3-E5_CB-G7_FL-A7-A6-C5.h5ad")

In [4]:
adata

AnnData object with n_obs × n_vars = 35273 × 24247
    obs: 'lanes', 'mad_prd', 'auto_prd', 'barcode', 'status', 'assignment', 'log_prob_singleton', 'log_prob_doublet', 'cluster0', 'cluster1', 'cluster2', 'Sample', 'Stage', 'Tissue', 'cluster3', 'batch', 'n_genes', 'percent_mito', 'n_counts', 'louvain', 'leiden', 'old.broad_annotations'
    var: 'feature_types', 'gene_ids-0', 'gene_ids-1', 'gene_ids-2', 'gene_ids-3', 'gene_ids-4', 'gene_ids-5', 'gene_ids-6', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Sample_colors', 'Stage_colors', 'Tissue_colors', 'assignment_colors', 'draw_graph', 'hvg', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_draw_graph_fa', 'X_orig_pca', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [5]:
#However, we do want the clusters generated in part 4 after annotating and sorting these annotations
#so we read those and incorporate them in adata.
adata_clhighres= sc.read("h5da_part4/050221_mq224_mRNA_Progenitors_only_postHarmony_ccfiltered_DEG_P4_sorted_annotations_FBMS2-3-H3-F3-E5_CB-G7_FL-A7-A6-C5.h5ad")

In [6]:
adata_clhighres

AnnData object with n_obs × n_vars = 35273 × 24153
    obs: 'lanes', 'mad_prd', 'auto_prd', 'barcode', 'status', 'assignment', 'log_prob_singleton', 'log_prob_doublet', 'cluster0', 'cluster1', 'cluster2', 'Sample', 'Stage', 'Tissue', 'cluster3', 'batch', 'n_genes', 'percent_mito', 'n_counts', 'louvain', 'leiden', 'old.broad_annotations', 'leiden.P4', 'Cell.labels.P4', 'Cell.labels.P4.sorted'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Cell.labels.P4.sorted_colors', 'Cell.labels.P4_colors', 'Sample_colors', 'Stage_colors', 'Tissue_colors', 'assignment_colors', 'draw_graph', 'hvg', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_draw_graph_fa', 'X_orig_pca', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [7]:
adata.obs["Cell.labels.P4.sorted"]= adata_clhighres.obs["Cell.labels.P4.sorted"]

### Import cell cycle genes

In [8]:
#Score cell cycle and visualize the effect:
# load file in
cc_genes_file = './test/cell_cycle_makosco.csv'
cc_genes = pd.read_csv(cc_genes_file, delimiter=',')
# removing na from s and g2m list
s_genes = cc_genes['S'].dropna()
g2m_genes = cc_genes['G2/M'].dropna()

y_genes = s_genes.tolist()
no_trail = []

for x in y_genes:
    y = x.strip()
    no_trail.append(y)
    
s_genes = no_trail 

y_genes = g2m_genes.tolist()
no_trail = []

for x in y_genes:
    y = x.strip()
    no_trail.append(y)
    
g2m_genes = no_trail 

s_genes_mm = adata.var_names[np.isin(adata.var_names, s_genes)]

g2m_genes_mm = adata.var_names[np.isin(adata.var_names, g2m_genes)]
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes_mm, g2m_genes=g2m_genes_mm)
adata.obs["phase_combined"] = adata.obs["phase"].replace(["G2M", "S"], ["G2M/S", "G2M/S"])
adata.obs["S/G2M_score_combined"] = (adata.obs["G2M_score"] + adata.obs["S_score"])/2

calculating cell cycle phase
computing score 'S_score'
    finished: added
    'S_score', score of gene set (adata.obs).
    1471 total control genes are used. (0:00:05)
computing score 'G2M_score'
    finished: added
    'G2M_score', score of gene set (adata.obs).
    1469 total control genes are used. (0:00:06)
-->     'phase', cell cycle phase (adata.obs)


In [9]:
print(len(s_genes_mm))
print(len(g2m_genes_mm))

106
129


### Prepare metadata

In [10]:
# save cell cycle scores within metadata as a dataframe (called enrichement scores)
meta = pd.DataFrame(adata.obs)
enrichment_scores_df = meta[meta.columns[-2:]]
enrichment_scores_df["Tissue"] =meta["Tissue"] 
enrichment_scores_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.



Unnamed: 0,phase_combined,S/G2M_score_combined,Tissue
AAACGAACACTGGATT-1-0,G2M/S,0.181827,FBM
AAACGAAGTCCATAGT-1-0,G2M/S,-0.059521,FBM
AAACGCTGTTATGTGC-1-0,G1,-0.028844,FBM
AAACGCTTCATACAGC-1-0,G1,-0.021353,FBM
AACAACCTCCTGCTAC-1-0,G2M/S,0.004942,FBM
...,...,...,...
TTTGATCAGGTTCACT-1-6,G1,-0.082312,FL
TTTGGTTCAACACTAC-1-6,G2M/S,0.038933,FL
TTTGGTTCACCCTAGG-1-6,G2M/S,0.112441,FL
TTTGGTTGTCCCGGTA-1-6,G1,-0.063144,FL


In [14]:
useful_metadata = enrichment_scores_df
useful_metadata["assignment"] = meta["assignment"]
useful_metadata["phase"] = meta["phase"]
useful_metadata["Sample"] = meta["Sample"]
useful_metadata["Cell.labels.P4.sorted"] = meta["Cell.labels.P4.sorted"]
useful_metadata

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

Unnamed: 0,phase_combined,S/G2M_score_combined,Tissue,assignment,phase,Sample,Cell.labels.P4.sorted
AAACGAACACTGGATT-1-0,G2M/S,0.181827,FBM,F,G2M,SIGAF3,Undetermined
AAACGAAGTCCATAGT-1-0,G2M/S,-0.059521,FBM,F,S,SIGAF3,LyP I (CLP)
AAACGCTGTTATGTGC-1-0,G1,-0.028844,FBM,F,G1,SIGAF3,LyP II (pro-B)
AAACGCTTCATACAGC-1-0,G1,-0.021353,FBM,F,G1,SIGAF3,LyP I (CLP)
AACAACCTCCTGCTAC-1-0,G2M/S,0.004942,FBM,F,S,SIGAF3,LyP II (pro-B)
...,...,...,...,...,...,...,...
TTTGATCAGGTTCACT-1-6,G1,-0.082312,FL,A,G1,SIGAC5,HSC/MPP II
TTTGGTTCAACACTAC-1-6,G2M/S,0.038933,FL,A,G2M,SIGAC5,LyP II (pro-B)
TTTGGTTCACCCTAGG-1-6,G2M/S,0.112441,FL,A,G2M,SIGAC5,Early LyP
TTTGGTTGTCCCGGTA-1-6,G1,-0.063144,FL,A,G1,SIGAC5,HSC/MPP II


In [15]:
enrichment_scores_df.to_csv("05022021_mq224_ccscores_FL-FBM-CB_metadata.csv")

#Piece of code to run in R to reproduce the heatmap

library(ggplot2)
library(data.table)
library(RColorBrewer)
data = fread('./05022021_mq224_ccscores_FL-FBM-CB_metadata.csv')

colnames(data) = c('cellid', 'phase_comb', 'SG2Mscore', 'Tissue', 'assignment', 'phase', 'Sample', 'Cell.labels.P4.sorted')

dataS = data[,.(count = .N, meanscore = mean(SG2Mscore)), by = .(Cell.labels.P4.sorted, Tissue)]
dataSNA = dataS[count < 20, meanscore := NA]

level_order <- c("HSC/MPP I","HSC/MPP II","HSC/MPP III","HSC/MPP IV","MEP","MEP/MkP","EryP I",
                 "EryP II","EryP III", "EryP IV", "Late EryP I (Pro-erythroblast)","Late EryP II (Erythroblast)","EoBasoMC", "MyP", "Early LyP","LyP I (CLP)", "LyP II (pro-B)",
                 "LyP III (pro-B)", "LyP IV (pre-B)","DC progenitor I","DC progenitor II", "Undetermined")
g = ggplot(dataSNA, aes(x=factor(Cell.labels.P4.sorted, level = level_order), y= Tissue, fill= meanscore)) + 
  geom_tile() + scale_fill_distiller(palette = "RdYlBu",na.value="#C0C0C0") + 
  theme_classic() + 
  theme(axis.line=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  scale_y_discrete(expand=c(0,0)) +
  scale_x_discrete(expand=c(0,0))

ggsave('050221_mq224_FL-FBM-CB-G2M_meanscore_withNAs.pdf', g+xlab("Cell types"), height = 4)
