## Infer causal Structure on ScanPy Data

#### Structure:
A: Load Data from file & look at structure

B: Algorithms
1. GRNBoost2
2. GIES
3. DCDI

Dependencies:
 use a conda-env with:
 - scanpy python-igraph leidenalg

 GRNBoost:
 - conda install -c bioconda arboreto
 
 GIES:
 - pip install gies

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

import scp_infer as scpi



Current dir:  /home/jans/Documents/Masterarbeit/code/scp-infer
['/home/jans/Documents/Masterarbeit/code/scp-infer', '/home/jans/miniconda3/envs/py-infer/lib/python312.zip', '/home/jans/miniconda3/envs/py-infer/lib/python3.12', '/home/jans/miniconda3/envs/py-infer/lib/python3.12/lib-dynload', '', '/home/jans/miniconda3/envs/py-infer/lib/python3.12/site-packages', '/home/jans/Documents/Masterarbeit/code/scp-infer/algorithm_implementations']
PyTorch not installed. Please install it to use DCDI.


In [2]:
results_file = '../data/edited/Schraivogel_chr8-sc-scaled-20genes.h5ad'  # input .h5ad file; it is read (not written) by sc.read_h5ad in the next cell

1. Read File

In [3]:
# Load the AnnData object from the .h5ad file named by `results_file`.
adata = sc.read_h5ad(results_file)
# Bare last expression: display the AnnData summary as the cell output.
adata

AnnData object with n_obs × n_vars = 3638 × 20
    obs: 'replicate', 'tissue_type', 'cell_line', 'cancer', 'disease', 'celltype', 'organism', 'perturbation', 'perturbation_type', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'n_genes_by_counts', 'total_counts', 'n_genes', 'total_counts_mt', 'pct_counts_mt', 'non-targeting', 'multiplet', 'control', 'nan', 'gene_perturbation_mask'
    var: 'ncounts', 'ncells', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'mt', 'gene_perturbed', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'log1p'
    layers: 'perturbed_elem_mask'

Check what the count distribution looks like:

In [4]:
# Step 1: take the gene (variable) names and cell (observation) names from the AnnData object.
gene_names = adata.var_names
cell_names = adata.obs_names

# Print the totals plus a short sample of each so the naming scheme is visible.
print(len(gene_names),"genes: ", list(gene_names[:3]))
print(len(cell_names),"cells: ", list(cell_names[:1]))

# Step 2: per-cell metadata table; it contains the perturbation columns among others.
metadata = adata.obs
# Last expression of the cell, so the preview is rendered as a rich table.
metadata.head()

20 genes:  ['CCNE2', 'CPQ', 'CROPseq_dCas9_DS_chr8:103754850-103755402_5_+']
3638 cells:  ['TGATTGACAAACCTGAGAGCTATA-sample_14']


Unnamed: 0_level_0,replicate,tissue_type,cell_line,cancer,disease,celltype,organism,perturbation,perturbation_type,ncounts,...,n_genes_by_counts,total_counts,n_genes,total_counts_mt,pct_counts_mt,non-targeting,multiplet,control,nan,gene_perturbation_mask
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TGATTGACAAACCTGAGAGCTATA-sample_14,sample_14,cell_line,K562,True,chronic myelogenous leukemia,lymphoblasts,human,RIPK2,CRISPR,1247.0,...,61,1247.0,61,0.0,0.0,False,False,False,False,True
TGATTGACAAACCTGAGTCGAGTG-sample_14,sample_14,cell_line,K562,True,chronic myelogenous leukemia,lymphoblasts,human,DSCC1,CRISPR,2615.0,...,65,2615.0,65,0.0,0.0,False,False,False,False,True
TGATTGACAAACCTGCAACTTGAC-sample_14,sample_14,cell_line,K562,True,chronic myelogenous leukemia,lymphoblasts,human,OXR1,CRISPR,1445.0,...,63,1445.0,63,0.0,0.0,False,False,False,False,True
TGATTGACAAACCTGCAGTATCTG-sample_14,sample_14,cell_line,K562,True,chronic myelogenous leukemia,lymphoblasts,human,non-targeting,CRISPR,1711.0,...,72,1711.0,72,0.0,0.0,True,False,False,False,False
TGATTGACAAACCTGCATGCAATC-sample_14,sample_14,cell_line,K562,True,chronic myelogenous leukemia,lymphoblasts,human,STK3,CRISPR,974.0,...,60,974.0,60,0.0,0.0,False,False,False,False,True


In [5]:
# Perturbation labels of the first ten cells; `.iloc` makes the positional slice explicit.
print('Perturbations: ', list(adata.obs['perturbation'].iloc[:10]))

# Project helper from scp_infer; prints summary statistics of the expression values
# (see the cell output for the exact fields it reports).
scpi.adata.print_expression_mean_std(adata)

Perturbations:  ['RIPK2', 'DSCC1', 'OXR1', 'non-targeting', 'STK3', 'FAM83A', 'non-targeting', 'non-targeting', 'RIPK2', 'non-targeting']

Perturbed Gene Expression:
Mean:  -1.4113744075647467
Std:  1.4492590032606343
Min:  -4.547554016113281
Max:  2.776451587677002
95% percentile:  -3.999375104904175  -  0.5078186392784119

Non-Target Gene Expression:
Mean:  -7.267663181836982e-05
Std:  0.6138410054692522
Min:  -4.547995567321777
Max:  2.779264450073242
95% percentile:  -0.8693741917610159  -  0.73392196893692


# B. UMAP Visualization

In [9]:
# Build the k-nearest-neighbour graph (k=10) for `adata`.
# `adata` has only 20 variables, fewer than scanpy's PCA threshold (settings.N_PCS, default 50),
# so scanpy works on `.X` directly and an `n_pcs` value would be silently ignored.
# `use_rep="X"` states that choice explicitly instead of passing a misleading `n_pcs=40`.
sc.pp.neighbors(adata, n_neighbors=10, use_rep="X")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Gene names selected by the `gene_perturbed` column of `adata.var`
# (used here as a mask — presumably boolean; confirm against the data preparation step).
pert_genes = adata.var_names[adata.var["gene_perturbed"]]

In [None]:
# Compute the UMAP embedding for all cells in `adata` ...
sc.tl.umap(adata)
# ... and plot it without any colouring.
sc.pl.umap(adata)

In [None]:
# Plot the embedding once per obs column: replicate, nperts and perturbation.
sc.pl.umap(adata, color=["replicate", "nperts", "perturbation"])

## UMAP - only perturbed and non-targeting cells

In [None]:
# Row mask: cells flagged in `gene_perturbation_mask` OR flagged as `non-targeting`.
indices = adata.obs["gene_perturbation_mask"] | adata.obs["non-targeting"]
# Take an independent copy of the selected cells (all genes kept).
adata_pert = adata[indices, :].copy()
# Display the summary of the subset as the cell output.
adata_pert

In [None]:
# Build the k-nearest-neighbour graph (k=10) for the cell subset `adata_pert`.
# The subset keeps all genes of `adata` (20 variables per the loaded object), fewer than
# scanpy's PCA threshold (settings.N_PCS, default 50), so scanpy works on `.X` directly
# and an `n_pcs` value would be silently ignored.
# `use_rep="X"` states that choice explicitly instead of passing a misleading `n_pcs=40`.
sc.pp.neighbors(adata_pert, n_neighbors=10, use_rep="X")

In [None]:
# Compute the UMAP embedding for the cell subset `adata_pert` ...
sc.tl.umap(adata_pert)
# ... and plot it without any colouring.
sc.pl.umap(adata_pert)

In [None]:
# Colour the subset embedding by the obs columns replicate, nperts and perturbation.
sc.pl.umap(adata_pert, color=["replicate", "nperts", "perturbation"])
# Second set of panels, coloured by the obs columns percent_ribo, ncounts and ngenes.
sc.pl.umap(adata_pert, color=["percent_ribo", "ncounts", "ngenes"])

In [None]:
# Restrict the analysis to a single replicate: keep only cells whose `replicate` is 'sample_14'.
adata_sample = adata[adata.obs['replicate'] == 'sample_14', :].copy()
# Gene names selected by the `gene_perturbed` column of the subset's var table
# (used by the next cell to colour the embedding).
pert_genes = adata_sample.var_names[adata_sample.var["gene_perturbed"]]
# Neighbour graph (k=10) on `.X` directly: the subset keeps all genes of `adata`
# (20 variables per the loaded object), fewer than scanpy's PCA threshold
# (settings.N_PCS, default 50), so an `n_pcs` value would be silently ignored.
# `use_rep="X"` states that choice explicitly.
sc.pp.neighbors(adata_sample, n_neighbors=10, use_rep="X")
# Embed and plot the single-replicate subset without colouring.
sc.tl.umap(adata_sample)
sc.pl.umap(adata_sample)

In [None]:
# One panel per entry of `pert_genes`, each used as the colouring key for the embedding.
sc.pl.umap(adata_sample, color=pert_genes)