# 14 DGE TBI

## Notebook setup

In [1]:
import warnings
warnings.filterwarnings('ignore')

import scanpy as sc
import scanpy.external as sce
import numpy as np
import pandas as pd
import warnings, scipy.sparse as sp, matplotlib, matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.pyplot import rc_context
from collections import Counter
import matplotlib.font_manager
import openpyxl
import pyreadr
import rpy2
from rpy2.robjects.packages import importr
import rpy2.robjects as robjects
#import magic
#import seaborn as sns
import palantir
import loompy
import feather
import re
#from scipy.sparse import csgraph

# Font handling for exported figures: fonttype 42 for the PDF and PS backends,
# sans-serif family with Arial as the sans-serif face.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
})
matplotlib.rc('font', size=14)
import matplotlib.lines as lines

# Let pandas display up to 200 rows before truncating.
pd.set_option('display.max_rows', 200)

# scanpy figure defaults, logging level and version banner.
sc.set_figure_params(
    dpi=80,
    dpi_save=300,
    color_map='Spectral_r',
    vector_friendly=True,
    transparent=True,
)
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()

findfont: Font family ['Raleway'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Lato'] not found. Falling back to DejaVu Sans.


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.20.2 scipy==1.5.3 pandas==1.4.4 scikit-learn==1.1.2 statsmodels==0.13.2 python-igraph==0.9.11 pynndescent==0.5.7


In [2]:
# Sixteen hex colours used as a categorical palette, four per row.
user_defined_palette = [
    '#F6222E', '#FEAF16', '#3283FE', '#BDCDFF',
    '#3B00FB', '#F8A19F', '#1CFFCE', '#C4451C',
    '#2ED9FF', '#c1c119', '#8b0000', '#FE00FA',
    '#1CBE4F', '#B5EFB5', '#0e452b', '#AA0DFE',
]

In [3]:
# Colour stops running from pale blue (#E6E6FF) to pure blue (#0000FF).
marker_color_stops = [
    "#E6E6FF", "#CCCCFF", "#B2B2FF", "#9999FF", "#6666FF", "#3333FF", "#0000FF",
]
# Colour stops running from pure blue through pale blue/green to pure green (#00FF00).
deg_color_stops = [
    "#0000FF", "#3333FF", "#6666FF", "#9999FF", "#B2B2FF", "#CCCCFF", "#E6E6FF",
    "#E6FFE6", "#CCFFCC", "#B2FFB2", "#99FF99", "#66FF66", "#33FF33", "#00FF00",
]

# Both colormaps are created under the same internal name 'mycmap'.
user_defined_cmap_markers = LinearSegmentedColormap.from_list('mycmap', marker_color_stops)
user_defined_cmap_degs = LinearSegmentedColormap.from_list('mycmap', deg_color_stops)

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 

## Load data for Figure 5

In [5]:
# Relative path to the .h5ad file loaded in the next cell.
path_to_h5ad: str = '../output/anndata_io/Fig5ij.h5ad'

In [6]:
# Load the AnnData object from disk.
adata_d0147 = sc.read_h5ad(path_to_h5ad)

# Reset the recorded log1p base to None.
# NOTE(review): presumably a workaround so that scanpy functions reading
# uns['log1p']['base'] do not fail after the h5ad round-trip — confirm.
log1p_info = adata_d0147.uns['log1p']
log1p_info["base"] = None

### d4 vs d1 (02mo)

In [None]:
# Combined "<day>_<cell_type_subset>" label per cell (e.g. 'd4_0:arEC'), used
# below as the grouping key for the day-vs-d1 comparisons.
# Built with vectorised string concatenation instead of a row-wise join; the
# previous placeholder assignment of '' was overwritten immediately and is dropped.
adata_d0147.obs['day_and_subset'] = (
    adata_d0147.obs['day'].astype(str)
    + '_'
    + adata_d0147.obs['cell_type_subset'].astype(str)
)

In [None]:
# Cells from the '02mo' stage only. Taken as an explicit copy because
# rank_genes_groups later writes its results into .uns of this object, which a
# view of adata_d0147 cannot hold without being silently converted to a copy.
adata_d0147_02mo = adata_d0147[adata_d0147.obs['stage'] == '02mo'].copy()

In [None]:
# Wilcoxon rank-sum test of d4 against d1 within each cell subset (02mo),
# written to one Excel sheet per subset.
subsets = ["0:arEC", "1:capEC", "2:venEC", "3:capsFB", "4:intFB", "5:medFB", "6:MEC", "7:vSMC/PC", "8:nmSC", "9:Fat", "10:aaTEC1", "11:aaTEC2",
           "12:cTEC", "12:early Pr", "13:mTEC1", "14:mTEC-prol", "15:mTEC2", "16:mimetic(basal)", "17:mimetic(tuft)", "18:mimetic(neuroendo)", "19:mimetic(goblet)", "20:mimetic(microfold)"]
reported_fields = ['names', 'scores', 'logfoldchanges', 'pvals_adj']

# The context manager closes (and thereby saves) the workbook even if one of the
# comparisons raises; it replaces the deprecated explicit writer.save().
with pd.ExcelWriter('adata_d0147_02mo_d4vsd1_wilcox.xlsx', engine='xlsxwriter') as writer:
    for subset in subsets:
        sc.tl.rank_genes_groups(adata_d0147_02mo, 'day_and_subset', groups=['d4_' + subset], reference='d1_' + subset, method='wilcoxon', use_raw=False)
        result = adata_d0147_02mo.uns['rank_genes_groups']
        groups = result['names'].dtype.names
        # Sheet name is the part after the colon; '/' is replaced by '-' in it.
        sheet_name = re.search(r'.*:(.+)', subset).group(1).replace('/', '-')
        # Column headers are "<group>_<first letter of field>" (n, s, l, p).
        pd.DataFrame(
            {group + '_' + key[:1]: result[key][group]
             for group in groups for key in reported_fields}
        ).to_excel(writer, sheet_name=sheet_name)

### d7 vs d1 (02mo)

In [None]:
# Wilcoxon rank-sum test of d7 against d1 within each cell subset (02mo),
# written to one Excel sheet per subset.
subsets = ["0:arEC", "1:capEC", "2:venEC", "3:capsFB", "4:intFB", "5:medFB", "6:MEC", "7:vSMC/PC", "8:nmSC", "9:Fat", "10:aaTEC1", "11:aaTEC2",
           "12:cTEC", "12:early Pr", "13:mTEC1", "14:mTEC-prol", "15:mTEC2", "16:mimetic(basal)", "17:mimetic(tuft)", "18:mimetic(neuroendo)", "19:mimetic(goblet)", "20:mimetic(microfold)"]
reported_fields = ['names', 'scores', 'logfoldchanges', 'pvals_adj']

# The context manager closes (and thereby saves) the workbook even if one of the
# comparisons raises; it replaces the deprecated explicit writer.save().
with pd.ExcelWriter('adata_d0147_02mo_d7vsd1_wilcox.xlsx', engine='xlsxwriter') as writer:
    for subset in subsets:
        sc.tl.rank_genes_groups(adata_d0147_02mo, 'day_and_subset', groups=['d7_' + subset], reference='d1_' + subset, method='wilcoxon', use_raw=False)
        result = adata_d0147_02mo.uns['rank_genes_groups']
        groups = result['names'].dtype.names
        # Sheet name is the part after the colon; '/' is replaced by '-' in it.
        sheet_name = re.search(r'.*:(.+)', subset).group(1).replace('/', '-')
        # Column headers are "<group>_<first letter of field>" (n, s, l, p).
        pd.DataFrame(
            {group + '_' + key[:1]: result[key][group]
             for group in groups for key in reported_fields}
        ).to_excel(writer, sheet_name=sheet_name)

### d4 vs d1 (18mo)

In [None]:
# Cells from the '18mo' stage only. Taken as an explicit copy because
# rank_genes_groups later writes its results into .uns of this object, which a
# view of adata_d0147 cannot hold without being silently converted to a copy.
adata_d0147_18mo = adata_d0147[adata_d0147.obs['stage'] == '18mo'].copy()

In [None]:
# Wilcoxon rank-sum test of d4 against d1 within each cell subset (18mo),
# written to one Excel sheet per subset.
subsets = ["0:arEC", "1:capEC", "2:venEC", "3:capsFB", "4:intFB", "5:medFB", "6:MEC", "7:vSMC/PC", "8:nmSC", "9:Fat", "10:aaTEC1", "11:aaTEC2",
           "12:cTEC", "12:early Pr", "13:mTEC1", "14:mTEC-prol", "15:mTEC2", "16:mimetic(basal)", "17:mimetic(tuft)", "18:mimetic(neuroendo)", "19:mimetic(goblet)", "20:mimetic(microfold)"]
reported_fields = ['names', 'scores', 'logfoldchanges', 'pvals_adj']

# The context manager closes (and thereby saves) the workbook even if one of the
# comparisons raises; it replaces the deprecated explicit writer.save().
with pd.ExcelWriter('adata_d0147_18mo_d4vsd1_wilcox.xlsx', engine='xlsxwriter') as writer:
    for subset in subsets:
        sc.tl.rank_genes_groups(adata_d0147_18mo, 'day_and_subset', groups=['d4_' + subset], reference='d1_' + subset, method='wilcoxon', use_raw=False)
        result = adata_d0147_18mo.uns['rank_genes_groups']
        groups = result['names'].dtype.names
        # Sheet name is the part after the colon; '/' is replaced by '-' in it.
        sheet_name = re.search(r'.*:(.+)', subset).group(1).replace('/', '-')
        # Column headers are "<group>_<first letter of field>" (n, s, l, p).
        pd.DataFrame(
            {group + '_' + key[:1]: result[key][group]
             for group in groups for key in reported_fields}
        ).to_excel(writer, sheet_name=sheet_name)

### d7 vs d1 (18mo)

In [None]:
# Wilcoxon rank-sum test of d7 against d1 within each cell subset (18mo),
# written to one Excel sheet per subset.
subsets = ["0:arEC", "1:capEC", "2:venEC", "3:capsFB", "4:intFB", "5:medFB", "6:MEC", "7:vSMC/PC", "8:nmSC", "9:Fat", "10:aaTEC1", "11:aaTEC2",
           "12:cTEC", "12:early Pr", "13:mTEC1", "14:mTEC-prol", "15:mTEC2", "16:mimetic(basal)", "17:mimetic(tuft)", "18:mimetic(neuroendo)", "19:mimetic(goblet)", "20:mimetic(microfold)"]
reported_fields = ['names', 'scores', 'logfoldchanges', 'pvals_adj']

# The context manager closes (and thereby saves) the workbook even if one of the
# comparisons raises; it replaces the deprecated explicit writer.save().
with pd.ExcelWriter('adata_d0147_18mo_d7vsd1_wilcox.xlsx', engine='xlsxwriter') as writer:
    for subset in subsets:
        sc.tl.rank_genes_groups(adata_d0147_18mo, 'day_and_subset', groups=['d7_' + subset], reference='d1_' + subset, method='wilcoxon', use_raw=False)
        result = adata_d0147_18mo.uns['rank_genes_groups']
        groups = result['names'].dtype.names
        # Sheet name is the part after the colon; '/' is replaced by '-' in it.
        sheet_name = re.search(r'.*:(.+)', subset).group(1).replace('/', '-')
        # Column headers are "<group>_<first letter of field>" (n, s, l, p).
        pd.DataFrame(
            {group + '_' + key[:1]: result[key][group]
             for group in groups for key in reported_fields}
        ).to_excel(writer, sheet_name=sheet_name)

### Dot chart using score and FDR from the Wilcoxon test

In [None]:
# Load the rpy2 IPython extension; the %R / %%R magics below depend on it.
%load_ext rpy2.ipython

In [None]:
# Install pacman if it is not available, then use it to load the R packages
# needed by the R cells below.
%R if (!require("pacman")) install.packages("pacman")
%R pacman::p_load(scales, data.table, openxlsx, ggplot2, ggpubr, RColorBrewer, dichromat, readxl, pheatmap, dplyr, stringr, parallel)

In [None]:
%%R 

# Export GSEA pre-ranked (.rnk) files: one file per cell subset (Excel sheet)
# for each of the four Wilcoxon workbooks written by the Python cells.
#
# The workbook names match what the Python cells write
# ('..._d7vsd1_wilcox.xlsx' / '..._d4vsd1_wilcox.xlsx'); the previous names
# ('..._d7_wilcox.xlsx' / '..._d4_wilcox.xlsx') are not produced anywhere in this notebook.

rnk_output_prefix = '../output/gsea_items/input_ranks/d7d4vsd1/wilcox_result_'

# write.table does not create missing directories, so make sure the target exists.
dir.create(dirname(rnk_output_prefix), recursive = TRUE, showWarnings = FALSE)

# Write one .rnk file per sheet of `workbook`.
#   workbook:    path to an .xlsx file with one sheet per cell subset
#   file_suffix: tag appended to each output file name, e.g. '_d7vsd1_02mo'
# Returns a named list mapping sheet name -> sorted two-column table.
write_rnk_files <- function(workbook, file_suffix) {
    ranked_by_sheet = list()
    for (item in excel_sheets(workbook)) {
        wilcox_result <- read_excel(workbook, sheet = item)
        # Column 1 is the pandas row index; columns 2 and 3 are gene names and scores.
        rnk_item = na.omit(wilcox_result[, c(2, 3)])
        # `[[2]]` extracts the scores as a plain vector so order() does not
        # have to sort a one-column tibble.
        rnk_item_sorted = rnk_item[order(rnk_item[[2]], decreasing = TRUE), ]
        # Leading '#' comments out the header line in the written file.
        colnames(rnk_item_sorted) = c('#primerid', '#rank_score')
        ranked_by_sheet[[item]] = rnk_item_sorted
        write.table(rnk_item_sorted, file = paste0(rnk_output_prefix, item, file_suffix, '.rnk'), sep = '\t', row.names = FALSE, quote = FALSE)
    }
    ranked_by_sheet
}

# Workbook -> output suffix, processed in the same order as before.
rnk_jobs = c(
    'adata_d0147_02mo_d7vsd1_wilcox.xlsx' = '_d7vsd1_02mo',
    'adata_d0147_02mo_d4vsd1_wilcox.xlsx' = '_d4vsd1_02mo',
    'adata_d0147_18mo_d7vsd1_wilcox.xlsx' = '_d7vsd1_18mo',
    'adata_d0147_18mo_d4vsd1_wilcox.xlsx' = '_d4vsd1_18mo'
)

# As before, rnk_items_list ends up holding the tables of the last workbook only.
rnk_items_list = NULL
for (workbook in names(rnk_jobs)) {
    rnk_items_list = write_rnk_files(workbook, rnk_jobs[[workbook]])
}

In [None]:
%%R 

# Stack the significant rows (p_adj <= 0.05) of every sheet of the four
# Wilcoxon workbooks into one long table, tagged with subset and comparison.
comparison_workbooks <- c(
    'adata_d0147_02mo_d4vsd1_wilcox.xlsx',
    'adata_d0147_02mo_d7vsd1_wilcox.xlsx',
    'adata_d0147_18mo_d4vsd1_wilcox.xlsx',
    'adata_d0147_18mo_d7vsd1_wilcox.xlsx'
)

wilcox_results_combined = NULL

for (selected_comparison in comparison_workbooks) {
    for (item in excel_sheets(selected_comparison)) {
        sheet_table <- read_excel(selected_comparison, sheet = item)
        names(sheet_table) <- c('index', 'name', 'score', 'log2_fc', 'p_adj')

        # Exact-zero adjusted p-values are replaced by the smallest positive one
        # in the same sheet before taking -log10.
        smallest_positive_p <- min(sheet_table$p_adj[sheet_table$p_adj > 0])
        sheet_table$p_adj[sheet_table$p_adj == 0] <- smallest_positive_p
        sheet_table[['-log10(p_adj)']] <- -log(sheet_table$p_adj, 10)

        # The sheet name is the subset; the workbook file name is the comparison label.
        sheet_table$subset <- item
        sheet_table$comparison <- selected_comparison

        # Sort by descending score, then keep rows with p_adj <= 0.05.
        by_score <- order(sheet_table$score, decreasing = TRUE)
        sheet_sorted <- sheet_table[by_score, ]
        sheet_significant <- sheet_sorted[sheet_sorted$p_adj <= 0.05, ]

        wilcox_results_combined = bind_rows(wilcox_results_combined, sheet_significant)
    }
}

In [None]:
%%R

# Genes to show in the dot chart.
L <- c('Foxn1', 'Dll4', 'Cxcl12', 'Ccl19', 'Ccl21a', 'Ccl25',
       'Fgf7', 'Fgf1', 'Fgf2', 'Fgf10', 'Fgf18', 'Fgf21',
       'Bmp4', 'Bmp7', 'Flt3l', 'Kitl')

# Keep only the rows for those genes.
tt = wilcox_results_combined[wilcox_results_combined$name %in% L,]

# Display order of the genes (the listed order is reversed to form the factor levels).
gene_levels <- rev(c('Bmp7', 'Bmp4', 'Flt3l', 'Kitl',
                     'Fgf18', 'Fgf7', 'Fgf2', 'Fgf10', 'Fgf1', 'Fgf21',
                     'Dll4', 'Cxcl12', 'Ccl19', 'Ccl21a', 'Ccl25', 'Foxn1'))

# Display order of the subsets. Subsets not listed here (the workbooks also
# contain e.g. 'Fat', 'aaTEC2' and the mimetic subsets) become NA in the factor.
subset_levels <- c('capsFB', 'intFB', 'medFB', 'MEC', 'vSMC-PC', 'nmSC',
                   'arEC', 'capEC', 'venEC',
                   'aaTEC1', 'early Pr', 'cTEC', 'mTEC1', 'mTEC-prol', 'mTEC2')

tt$name <- factor(tt$name, levels = gene_levels)
tt$subset <- factor(tt$subset, levels = subset_levels)

# Clamp finite scores to [-3, 3] for the colour scale.
tt$squishedZ = squish(tt$score, range = c(-3, 3), only.finite = TRUE)


In [None]:
%%R -w 24 -h 24 -u cm

# Draw the dot chart of the selected genes into a PDF file: one facet per
# comparison, colour = clamped score, dot size = -log10(adjusted p-value).
pdf("dotplot_Zscore_d74vsd1_L_alt.pdf", width=6.75, height=7.15)

dot_chart <- ggdotchart(
    tt,
    x = 'name', y = 'subset', group = 'subset',
    rotate = TRUE,
    color = 'squishedZ',
    size = '-log10(p_adj)',
    facet.by = 'comparison',
    sorting = 'none',
    xlab = "", ylab = ""
)

dot_chart <- dot_chart +
    scale_color_gradientn(colours = dichromat::colorschemes$BluetoGreen.14) +
    theme_pubr() +
    theme(
        legend.position = 'right',
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    )

print(dot_chart)
dev.off()