# task-perturb-multiomics-grn
## Creating resources
### by Jalil Nourisa

# Multiomics

In [6]:
import anndata as ad
import pandas as pd
import numpy as np
# Root directory of intermediate outputs; cells below both read from and write to it
work_dir = '../output/'

# Directory that receives the final resource files written by this notebook
resource_dir = '../resources/'

In [30]:
# Load the RNA and ATAC AnnData objects from the working directory
adata_rna = ad.read_h5ad(f'{work_dir}/scRNA/adata_rna.h5ad')
adata_atac = ad.read_h5ad(f'{work_dir}/scATAC/adata_atac.h5ad')



In [33]:
# Save the feature names (var_names) of the RNA object as plain text
np.savetxt(f'{work_dir}/benchmark/multiomics_genes.txt', adata_rna.var_names, fmt='%s')

In [28]:
# Reduce the ATAC obs table to a column-less frame indexed by obs_id.
# This notebook later overwrites the ATAC file it loaded, saving obs that is already
# indexed by obs_id (presumably the index name survives the h5ad round trip - TODO confirm).
# Accept both layouts so the cell does not fail with a KeyError when the notebook is re-run.
if 'obs_id' in adata_atac.obs.columns:
    atac_obs = adata_atac.obs.set_index('obs_id')
elif adata_atac.obs.index.name == 'obs_id':
    atac_obs = adata_atac.obs  # file produced by an earlier run of this notebook
else:
    raise KeyError("adata_atac.obs has neither an 'obs_id' column nor an 'obs_id' index")
# Drop every remaining column so only the obs_id index is kept
adata_atac.obs = atac_obs[[]]

In [29]:
# Tag every feature with its modality so the two can be told apart after concatenation
adata_rna.var['feature_type'] = 'GEX'
adata_atac.var['feature_type'] = 'ATAC'

In [30]:
# Restrict the ATAC object to the cells that are also present in the RNA object
mask = adata_atac.obs_names.isin(adata_rna.obs_names)
adata_atac = adata_atac[mask]

In [31]:
# Concatenate the two modalities along axis=1 (features); the displayed result
# carries only var['feature_type'], obs columns are attached in a later cell
adata = ad.concat([adata_rna, adata_atac], axis=1)
adata

AnnData object with n_obs × n_vars = 25034 × 158136
    var: 'feature_type'

In [32]:
# Re-attach the per-cell metadata to the concatenated object.
# An outer merge sorts the row labels, and assigning a DataFrame to `.obs` adopts its
# index as-is (no realignment to the data matrix). Reorder the merged table to the
# current obs_names so every metadata row stays attached to the right cell.
obs_merged = pd.merge(adata_rna.obs, adata_atac.obs, left_index=True, right_index=True, how='outer')
adata.obs = obs_merged.loc[adata.obs_names]

In [33]:
# Inspect the per-cell metadata now attached to the combined object
adata.obs

Unnamed: 0_level_0,cell_type,donor_id
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000225c1151ab841,B cells,donor_0
0003c40a54367871,T cells,donor_2
0004bf574b822c3c,T cells,donor_2
000d59b5478f28e2,B cells,donor_0
0011b7473923d7b5,T cells,donor_2
...,...,...
fff2ca1f64c10339,T cells,donor_0
fff87e64f509b570,T cells,donor_0
fff9778b31bc2539,Myeloid cells,donor_2
fffa92f71d2440de,T cells,donor_1


In [34]:
# Write the combined RNA + ATAC object to the resources directory
adata.write(f'{resource_dir}/multiomics.h5ad')

In [41]:
# Copy the per-cell metadata onto the ATAC object. Assignment to `.obs` is positional,
# so select the rows by ATAC cell name instead of relying on both objects sharing
# the same cell order.
adata_atac.obs = adata.obs.loc[adata_atac.obs_names]

In [42]:
# NOTE: this overwrites the same file that adata_atac was loaded from at the top of
# the notebook, now subset to the shared cells and carrying the merged obs table
adata_atac.write_h5ad(f'{work_dir}/scATAC/adata_atac.h5ad')

# Benchmark

## Gene names

In [9]:
# Load bulk_adata_integrated.h5ad from the preprocessing output
bulk_adata = ad.read_h5ad(f'{work_dir}/preprocess/bulk_adata_integrated.h5ad')

In [12]:
# Save the feature names (var_names) of the bulk object as plain text
np.savetxt(f'{work_dir}/benchmark/perturb_gene_names.txt', bulk_adata.var_names.values  , fmt='%s')

## HVGs

In [5]:
# Number of top-deviance genes kept as highly variable genes further down
n_hvgs = 3000

In [1]:
# Load the rpy2 IPython extension; the %%R cells below depend on it
%load_ext rpy2.ipython

In [4]:
%%R -i work_dir
# Packages used below: devianceFeatureSelection (scry), readH5AD/writeH5AD (zellkonverter)
library(scry)
library(zellkonverter)
library(SingleCellExperiment)
options(digits=5, max.print=100)  # Limit printed digits and output length; adjust as needed



# Read the bulk data (raw counts, per the original author's note)
adata = readH5AD(paste0(work_dir, "/preprocess/bulk_adata_f.h5ad")) # raw counts
# Run deviance-based feature selection on assay "X", passing plate_name as the batch variable
sce = devianceFeatureSelection(adata, assay="X", batch=colData(adata)$plate_name)
# Write the result back to h5ad; the next Python cell reads var['binomial_deviance'] from it
writeH5AD(sce, paste0(work_dir, "/preprocess/adata_sce.h5ad"))


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

  openrlib.rlib.R_tryEval(


Registered S3 methods overwritten by 'zellkonverter':
  method                                             from      
  py_to_r.numpy.ndarray                              reticulate
  py_to_r.pandas.core.arrays.categorical.Categorical reticulate
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    r

In [7]:
# Read the deviance scores written by the R cell and select the n_hvgs genes with the
# largest binomial deviance. A boolean mask is used so the selected genes keep their
# original order in `var`.
# `ad.read` is deprecated; use `ad.read_h5ad` like the other cells in this notebook.
adata_sce = ad.read_h5ad(f"{work_dir}/preprocess/adata_sce.h5ad")
binomial_deviance  = adata_sce.var['binomial_deviance']
# argsort is ascending, so the last n_hvgs positions are the largest values
indices = binomial_deviance.argsort()[-n_hvgs:]
mask = np.zeros(adata_sce.var.shape[0], dtype=bool)
mask[indices] = True
hvgs_sce = adata_sce[:, mask].var.index.values



In [8]:
# Save the selected highly variable gene names as plain text
np.savetxt( f'{work_dir}/benchmark/hvgs.txt',hvgs_sce, fmt='%s')

## Gene annotation

In [84]:
# Load the gene annotation, keep only the genes present in the multiomics data and
# the transcript types of interest, then relabel the transcript types for display.
from local_utils import annotation

multiomics_genes = np.loadtxt(f'{work_dir}/benchmark/multiomics_genes.txt', dtype=str)
annot_database = annotation.ensembl_gene_annotation()

transcript_types = ['protein_coding', 'lncRNA', 'miRNA']
rename_map = {'protein_coding':'Protein coding', 'lncRNA':'LncRNA',  'miRNA':'MicroRNA'}

annot_database_f = (
    annot_database
    .loc[annot_database.Gene.isin(multiomics_genes), ['Gene', 'Transcript_type']]
    .drop_duplicates()  # one row per (gene, transcript type) pair
    .loc[lambda frame: frame.Transcript_type.isin(transcript_types)]
    .assign(Transcript_type=lambda frame: frame.Transcript_type.map(rename_map))
    .reset_index(drop=True)
)
annot_database_f.head()

Unnamed: 0,Gene,Transcript_type
0,AC007325.4,Protein coding
1,AC107375.1,LncRNA
2,AC022306.3,LncRNA
3,ALDOC,Protein coding
4,HEMK1,Protein coding


In [85]:
 #only keep genes with one annotation
annot_size = annot_database_f.groupby('Gene').size()  # number of annotation rows per gene
kept_genes = annot_size[annot_size <= 1].index  # genes that carry a single annotation
annot_database_f = (
    annot_database_f
    .loc[annot_database_f['Gene'].isin(kept_genes)]
    .reset_index(drop=True)
)
annot_database_f.shape

(22139, 2)

In [86]:
# Number of retained genes per transcript type
annot_database_f.Transcript_type.value_counts()

Transcript_type
Protein coding    15282
LncRNA             6856
MicroRNA              1
Name: count, dtype: int64

In [87]:
# Save the filtered gene annotation table as CSV
annot_database_f.to_csv(f'{work_dir}/benchmark/gene_annotation.csv')

## Peak annotation

In [13]:
# Reload the ATAC object; the cells shown below only use its var_names (peak names)
adata_atac = ad.read_h5ad(f'{work_dir}/scATAC/adata_atac.h5ad')




In [14]:

## Check whether all peaks in the GRNs are present in the ATAC data
import re

def format_peak(peaks):
    """Normalise peak identifiers to the ``chr:start-end`` form.

    Each identifier is expected to be ``<chr><sep><start><sep><end>`` where
    ``<sep>`` is one of ``:``, ``-`` or ``_``. The two coordinate fields are
    taken from the right-hand side, so chromosome/contig names that contain a
    separator themselves (e.g. ``chr1_KI270706v1_random``) are kept intact
    instead of breaking the parse.

    Parameters
    ----------
    peaks : iterable of str
        Peak identifiers to normalise.

    Returns
    -------
    list of str
        The identifiers rewritten as ``chr:start-end``, in input order.

    Raises
    ------
    ValueError
        If an identifier does not contain two separators.
    """
    formatted_peaks = []
    for peak in peaks:
        # Greedy first group + separator-free coordinate groups: the last two
        # separators delimit start and end, everything before is the chromosome.
        match = re.fullmatch(r'(.*)[:\-_]([^:\-_]*)[:\-_]([^:\-_]*)', peak)
        if match is None:
            raise ValueError(f"Cannot parse peak identifier: {peak!r}")
        chr_, start, end = match.groups()
        formatted_peaks.append(f"{chr_}:{start}-{end}")
    return formatted_peaks


# ATAC peak names rewritten in the chr:start-end form
atac_peaks = format_peak(adata_atac.var_names)



In [16]:
# Split every "chr:start-end" name once into its chromosome and "start-end" range
split_peaks = [peak.split(':') for peak in atac_peaks]
peaks = pd.DataFrame({
    'chr': [parts[0] for parts in split_peaks],
    'range': [parts[1] for parts in split_peaks],
})
peaks

Unnamed: 0,chr,range
0,chr10,100001032-100001800
1,chr10,100006075-100006963
2,chr10,100009475-100010367
3,chr10,100013993-100014884
4,chr10,100020278-100021136
...,...,...
135353,chrY,7765105-7765991
135354,chrY,7814158-7815060
135355,chrY,7818681-7819599
135356,chrY,8535565-8536421


In [17]:
%%R -i peaks -o peaks_annotated_df
options(digits=5, max.print=100)  # Limit printed digits and output length; adjust as needed
set.seed(123)

# install.packages("IRanges")
# install.packages("GenomicRanges")
# install.packages("ggplot2")
# install.packages("TxDb.Hsapiens.UCSC.hg38.knownGene")

library(IRanges)
library(GenomicRanges)
library(ggplot2)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

# hg38 transcript database used as the annotation reference
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene


# Build genomic ranges from the chr and "start-end" columns passed in from Python
peaks = GRanges(peaks$chr, IRanges(peaks$range))
peaks_annotated = suppressMessages(ChIPseeker::annotatePeak(
    peaks,
    tssRegion = c(-1000, 1000), # promoter window: 1 kb upstream to 1 kb downstream of the TSS
    TxDb = txdb,
    level = "transcript", 
    assignGenomicAnnotation = TRUE,  # the default
    genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
                                "Downstream", "Intergenic"),  # the default
    annoDb = NULL,
    sameStrand = FALSE, # the default
    ignoreOverlap = FALSE, # the default
    ignoreUpstream = FALSE, # the default
    ignoreDownstream = FALSE, # the default
    overlap = "TSS", # the default
    verbose = TRUE # the default
))
# Convert to a data.frame; this is the object returned to Python via `-o peaks_annotated_df`
peaks_annotated_df = as.data.frame(peaks_annotated)
# write.table(peaks_annotated_df, paste0(temp_dir, name, '_annot.txt'), sep=',', row.names = FALSE)



>> preparing features information...		 2024-06-18 14:17:45 
>> identifying nearest features...		 2024-06-18 14:17:46 
>> calculating distance from peak to TSS...	 2024-06-18 14:17:48 
>> assigning genomic annotation...		 2024-06-18 14:17:48 
>> assigning chromosome lengths			 2024-06-18 14:18:10 
>> done...					 2024-06-18 14:18:10 


Loading required package: GenomicFeatures
Loading required package: AnnotationDbi


In [19]:
# Preview the first rows of the annotated peak table returned by the R cell
peaks_annotated_df.head()

Unnamed: 0,seqnames,start,end,width,strand,annotation,geneChr,geneStart,geneEnd,geneLength,geneStrand,geneId,transcriptId,distanceToTSS
1,chr10,100001032,100001800,769,*,"Intron (ENST00000324109.9/23268, intron 1 of 16)",10,99875577,100009947,134371,2,23268,ENST00000324109.9,8147.0
2,chr10,100006075,100006963,889,*,"Intron (ENST00000324109.9/23268, intron 1 of 16)",10,99875577,100009947,134371,2,23268,ENST00000324109.9,2984.0
3,chr10,100009475,100010367,893,*,Promoter,10,99875577,100009947,134371,2,23268,ENST00000324109.9,0.0
4,chr10,100013993,100014884,892,*,Distal Intergenic,10,99875577,100009947,134371,2,23268,ENST00000324109.9,-4046.0
5,chr10,100020278,100021136,859,*,Distal Intergenic,10,99875577,100009947,134371,2,23268,ENST00000324109.9,-10331.0


In [29]:
# Display label for each annotation, keyed by the first word of the annotation string
map_ = {
    'Intron': 'Intron',
    'Exon': 'Exon',
    'Promoter': 'Promoter',
    'Distal': 'Distal Intergenic',
    "3'": "3' UTR",
    'Downstream': 'Downstream (<=300)',
    "5'": "5' UTR",
}

# The first space-separated token identifies the annotation category
ann = peaks_annotated_df['annotation'].str.split(' ').str[0].map(map_)

# Rebuild the peak names as chr:start-end from the annotated coordinates
peaks = (
    peaks_annotated_df['seqnames'].astype(str)
    + ':'
    + peaks_annotated_df['start'].astype(str)
    + '-'
    + peaks_annotated_df['end'].astype(str)
)

df = pd.DataFrame({'annotation': ann, 'peak': peaks})
df.to_csv(f'{resource_dir}/benchmark/peak_annotation.csv')
