### Protein analysis FL+FBM+CB - Endo - EXCLUDING CB as well 
#### Mariana Quiroga Londono 

In [1]:
#SETTING ENVIRONMENT 

from pathlib import Path

import rpy2
#import tzlocal
#%load_ext rpy2.ipython

import numpy as np
import pandas as pd
import scanpy as sc

# Figure defaults: `dpi` applies to rendered figures, `dpi_save` to figures written to disk.
sc.settings.set_figure_params(dpi=80, dpi_save=1000)
# Logging level 3 = hints; the scale is errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
#sc.logging.print_versions()

  numba.core.entrypoints.init_all()


In [2]:
from matplotlib.colors import LinearSegmentedColormap
# Four-stop colour ramp (light grey -> thistle -> red -> dark red);
# presumably used for expression plots, given the 'gene_cmap' name.
cmap = LinearSegmentedColormap.from_list(
    'gene_cmap',
    ['lightgrey', 'thistle', 'red', 'darkred'],
)

# Read protein data (usually called DSB_OUTPUT0)

In [3]:
# Show the working directory that the relative paths in this notebook resolve against.
%pwd

'/rds/project/bg200/rds-bg200-hphi-gottgens/users/mq224/project_citeseq/part2citeseq/PhD/Year_1/Collaborations/Haniffa_Lab/Comparison_HSPCs_CD34+_fromFBM_CB_and_FL/ADT'

In [4]:
# Load the AnnData object for the sinusoidal EC (endo) subset; the file name records the
# processing steps already applied (postQC, postDSB, postFDG, postHarmony, ADTonmRNA).
adata = sc.read(
    "20210419_mq224_sinusoidalEC-endo_postQC_postDSB_postFDG_postHarmony_ADTonmRNA_FBMS2-3-H3-F3-E5_CB-G7_FL-A7-A6-C5.h5ad"
)

In [5]:
# Display the AnnData summary (dimensions plus the obs/var/uns/obsm/varm/obsp fields present).
adata

AnnData object with n_obs × n_vars = 1469 × 198
    obs: 'Sample', 'batch', 'Tissue', 'assignment', 'leiden', 'louvain', 'FBM10X_prediction'
    var: 'bg_mean'
    uns: 'Sample_colors', 'Tissue_colors', 'assignment_colors', 'draw_graph', 'leiden', 'leiden_colors', 'louvain', 'neighbors', 'pca', 'umap'
    obsm: 'X_draw_graph_fa', 'X_orig_pca', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

# DEP (differentially expressed proteins)

In [6]:
# Clamp negative values in the expression matrix to zero; non-negative entries are unchanged.
adata.X = np.maximum(adata.X, 0)

In [7]:
# Build a combined "<FBM10X_prediction>_<Tissue>" label per cell, used as the grouping key below.
adata.obs["Celltype_Tissue"] = (
    adata.obs['FBM10X_prediction']
    .astype(str)
    .str.cat(adata.obs["Tissue"].astype(str), sep="_")
)

In [8]:
# List the distinct (sorted) Celltype_Tissue labels to check which groups are available.
np.unique(adata.obs["Celltype_Tissue"].to_numpy())

array(['sinusoidal EC_ref_train_FBM', 'sinusoidal EC_ref_train_FL'],
      dtype=object)

In [9]:
# Rank features per Celltype_Tissue group with a Wilcoxon rank-sum test
# (Benjamini-Hochberg correction of the p-values, per the original author's note).
# The original note states the test is based on ln-transformed count data.
# NOTE(review): log_transformed=False is passed although the note says the data are
# ln-transformed; confirm which is intended and whether the installed scanpy version
# actually consumes this keyword for method='wilcoxon'.
sc.tl.rank_genes_groups(
    adata,
    groupby="Celltype_Tissue",
    method='wilcoxon',
    use_raw=False,
    log_transformed=False,
    groups=['sinusoidal EC_ref_train_FBM', 'sinusoidal EC_ref_train_FL'],
)

ranking genes


... storing 'Celltype_Tissue' as categorical


    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:00)


In [10]:
# Filter the ranked features: keep those detected in at least 25% of the cells of their
# own group (min_in_group_fraction) that also pass min_fold_change=0.25.
# Per the original note, the log2FC of the ln-transformed data will be shown.
sc.tl.filter_rank_genes_groups(
    adata,
    use_raw=False,
    min_in_group_fraction=0.25,
    min_fold_change=0.25,
)

Filtering genes using: min_in_group_fraction: 0.25 min_fold_change: 0.25, max_out_group_fraction: 0.5


In [11]:
# Export the UNFILTERED ranking stored in adata.uns['rank_genes_groups'].
# For every group, four columns are produced: names, logfoldchanges, pvals and pvals_adj.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
# key[:7] truncates the statistic name, so the column suffixes are
# "_names", "_logfold", "_pvals" and "_pvals_a".
degs_by_cluster = pd.DataFrame({group + '_' + key[:7]: result[key][group]
    for group in groups for key in ['names', 'logfoldchanges', 'pvals', 'pvals_adj']})

# Per the original note: the table shows the log2 fold change for each gene, ordered by the
# z-score underlying the computation of a p-value for each gene for each group.
out_csv = Path("./20210420_DEP_R3/20210420_mq224_DEPWilcoxon_BHcorrection_FL-FBM_mRNA_SinuEC-endo.csv")
# Create the output folder if needed so a fresh "Restart & Run All" does not fail on a missing directory.
out_csv.parent.mkdir(parents=True, exist_ok=True)
degs_by_cluster.to_csv(out_csv)
degs_by_cluster.head(10)

Unnamed: 0,sinusoidal EC_ref_train_FBM_names,sinusoidal EC_ref_train_FBM_logfold,sinusoidal EC_ref_train_FBM_pvals,sinusoidal EC_ref_train_FBM_pvals_a,sinusoidal EC_ref_train_FL_names,sinusoidal EC_ref_train_FL_logfold,sinusoidal EC_ref_train_FL_pvals,sinusoidal EC_ref_train_FL_pvals_a
0,CD105,12.314472,5.515383e-100,3.640153e-98,CLEC1B,14.064296,3.484427e-136,6.899166e-134
1,CD106,7.794829,7.682532e-77,2.535235e-75,CD47,4.903946,2.137431e-104,2.116057e-102
2,CD200,5.208693,5.590293e-76,1.581254e-74,CD146,5.05035,1.115716e-82,5.522794000000001e-81
3,CD99,4.80548,2.875838e-61,4.745132e-60,Folate,4.534304,3.618526e-79,1.432936e-77
4,CD36,2.99799,1.422587e-57,2.011945e-56,Siglec8,2.878098,3.075474e-71,7.611798e-70
5,CD141,4.847965,1.947165e-48,1.927693e-47,EGFR,5.267,4.887028e-70,1.0751460000000002e-68
6,CD9,3.281663,6.066887999999999e-44,5.222799000000001e-43,CD4,4.860181,6.443088000000001e-66,1.275732e-64
7,TSLPR,5.541441,1.689673e-35,1.1536389999999999e-34,CD164,4.506919,3.9598129999999994e-64,7.127662999999999e-63
8,CD39,2.333376,1.333158e-30,8.515012e-30,CD95,3.922485,4.230184e-58,6.442895999999999e-57
9,CD102,1.922154,7.122138e-28,4.029095e-27,CD82,3.119332,1.0183009999999999e-56,1.344157e-55


In [12]:
# Re-check the working directory (the export above is written relative to it).
%pwd

'/rds/project/bg200/rds-bg200-hphi-gottgens/users/mq224/project_citeseq/part2citeseq/PhD/Year_1/Collaborations/Haniffa_Lab/Comparison_HSPCs_CD34+_fromFBM_CB_and_FL/ADT'