# MPRA analysis of CMPRA data

In [1]:
import warnings
warnings.filterwarnings('ignore')
import polars as pl
import polars.selectors as cs
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
from scipy import stats
pl.Config.set_fmt_str_lengths(50)
warnings.filterwarnings('ignore')

## Read files

In [2]:
config = "testbinorientations"
file = "../../mpra_capture_flow/results/CMPRA5/mpralm/"+config+"_mpralm_output.tsv"
mpralm = pl.read_csv(file, separator="\t")
tss_file = "/data/humangen_kircherlab/MPRA/CaptureCMPRA/resources/TSS_pos_v44.bed.gz"
ccres_file = "../../resources/GRCh38-cCREs.bed"
baits_file= "/data/humangen_kircherlab/MPRA/CaptureCMPRA/mpra_capture_flow/resources/Arnould_3X_Mod_1_Covered.bed"
tss = pl.read_csv("/data/humangen_kircherlab/MPRA/CaptureCMPRA/resources/TSS_pos_v44.bed.gz", separator="\t", has_header=False, new_columns=["chr","start","end","gene_id", "?", "orientation"]
				  ).select(pl.exclude("?"))
targets_file = "/data/humangen_kircherlab/MPRA/CaptureCMPRA/mpra_capture_flow/resources/target_regions.tsv"
digest_file = "/data/humangen_kircherlab/MPRA/CaptureCMPRA/mpra_capture_flow/resources/Digest_hg38_DpnII.txt"
digest = pl.read_csv(digest_file, skip_rows=1, separator="\t").select(~cs.matches("RE1|Restr"))
baits = pl.read_csv("/data/humangen_kircherlab/MPRA/CaptureCMPRA/mpra_capture_flow/resources/Arnould_3X_Mod_1_Covered.bed", separator="\t", has_header=False, new_columns=["chr","start","end","probe"])
nr_reads = pl.read_csv("../../mpra_capture_flow/results/CMPRA5/binned/"+config+"_readsperbin.tsv", separator="\t")
#short_enhancers = pl.read_csv("../../resources/short_enhancers.bed.gz", separator="\t", has_header=False)
#ccres = pl.read_csv("../../resources/GRCh38-cCREs.bed", separator="\t", has_header=False, new_columns=["chr","start","end","?", "enhancer_id", "description"])

In [3]:
mpralm.height

566

## Add features 

### Nr of reads per bin

In [4]:
with_nr_reads = mpralm.join(nr_reads, on="bin")

In [5]:
with_nr_reads = with_nr_reads.with_columns(pl.col("bin").str.split("::").list.to_struct()).unnest("bin").rename({
		"field_0": "left_bin", 
		"field_1": "right_bin"
		}) #.cast({cs.matches(".*_start|.*_end"): pl.Int32}, strict=False)
bins = pl.concat(with_nr_reads.select(cs.matches("_bin"))).str.split("-").list.to_struct().struct.unnest().rename({
		"field_0": "chr", 
		"field_1": "start", 
		"field_2": "end"
		}).cast({cs.matches("start|end"): pl.Int32}, strict=False).filter(pl.col("chr").str.contains("chr")).sort("chr", "start", "end").unique()
bins.write_csv("../../resources/temp.single_bins.tsv", separator="\t", include_header=False)
bin_file = "../../resources/temp.single_bins.tsv"


### TSS locations

In [6]:
!bedtools intersect -b $tss_file -a  $bin_file -wa | sort -k 1,1 -k2,2n | uniq > ../../resources/temp.tss_bins.tsv
#awk '{print $1"-"$4}' 
#awk 'print $ | uniq > ../../resources/temp.tss_bins.tsv


In [7]:
tss_bins = pl.read_csv("../../resources/temp.tss_bins.tsv", has_header=False, new_columns=["chr", "start", "end"], separator="\t").with_columns(tss=pl.lit("yes"))
tss_bins = tss_bins.select("tss", bin=pl.concat_str(["chr",  "start", "end"], separator="-"))

In [8]:
tss_data = with_nr_reads.join(tss_bins.rename({"tss": "tss_left"}), left_on="left_bin", right_on="bin",  how="left"
		  ).unique().join(tss_bins.rename({"tss": "tss_right"}), left_on="right_bin", right_on="bin",  how="left"
		  ).unique()
tss_data = tss_data.with_columns(cs.matches("tss").fill_null("no"))
tss_data = tss_data.with_columns(pl.when(pl.any_horizontal(cs.matches("tss") == "yes")).then(pl.lit("yes")).otherwise(pl.lit("no")).alias("any_tss"))

### Encode CCREs

In [9]:
!bedtools intersect -b $ccres_file -a $bin_file -wo | cut -f -3,8-9  > ../../resources/temp.ccres_bins.tsv # | sort -k1,1 -k2,2n| bedtools merge -i stdin -c 4,5 -o distinct

In [10]:
ccres_bins = pl.read_csv("../../resources/temp.ccres_bins.tsv", has_header=False, new_columns=["chr", "start", "end", "eh_id", "description"], separator="\t")
ccres_bins = ccres_bins.select("eh_id", "description", bin=pl.concat_str(["chr", "start", "end"], separator="-")).group_by("bin").agg("eh_id", "description"
								).with_columns(eh_id = pl.col("eh_id").list.unique().list.join(","), description = pl.col("description").list.unique().list.join(","))

In [11]:
data_ccres = tss_data.join(ccres_bins.rename({"description": "screen_left"}), left_on="left_bin", right_on="bin",  how="left"
		  ).unique().join(ccres_bins.rename({"description": "screen_right"}), left_on="right_bin", right_on="bin",  how="left"
		  ).unique()
data_ccres = data_ccres.with_columns(cs.matches("screen").fill_null("none"))
data_ccres.height

566

### Baited regions

In [12]:
!bedtools intersect -b $baits_file -a $bin_file -wo |  awk '$8 > 20' | cut -f -3,7 > ../../resources/temp.bait_bins.tsv # | sort -k1,1 -k2,2n | bedtools merge -i stdin -c 4 -o distinct 

In [13]:
bait_bins = pl.read_csv("../../resources/temp.bait_bins.tsv", has_header=False, new_columns=["chr", "start", "end", "bait"], separator="\t")
bait_bins = bait_bins.select("bait", bin=pl.concat_str(["chr",  "start", "end"], separator="-")).group_by("bin").agg("bait")
data_baits = data_ccres.join(bait_bins.rename({"bait": "bait_left"}), left_on="left_bin", right_on="bin",  how="left"
		  ).join(bait_bins.rename({"bait": "bait_right"}), left_on="right_bin", right_on="bin",  how="left"
		  )
data_baits = data_baits.with_columns(cs.matches("bait").fill_null(["none"]).list.unique().list.join(","))
data_baits = data_ccres


### Target regions and their labels

In [14]:
!bedtools intersect -b <(paste <(cut -f 8-10 $targets_file) <(cut -f 1-3 $targets_file) | tail -n +2) -a $bin_file -wo | cut -f -3,7-9 > ../../resources/temp.target_bins.tsv # | sort -k1,1 -k2,2n | bedtools merge -i stdin -c 4,5 -o distinct

In [15]:
target_bins = pl.read_csv("../../resources/temp.target_bins.tsv", has_header=False, new_columns=["chr", "start", "end", "target_gene", "label"], separator="\t")
target_bins = target_bins.select("target_gene", bin=pl.concat_str(["chr",  "start", "end"], separator="-"), targeted="label").group_by("bin").agg(
	"targeted", "target_gene").with_columns(
		pl.col('targeted').list.unique().list.join(","), 
		pl.col('target_gene').list.unique().list.join(","))
data_labeled = data_baits.join(target_bins.rename({"targeted": "targeted_left", "target_gene": "target_gene_left"}), left_on="left_bin", right_on="bin",  how="left"
		  ).join(target_bins.rename({"targeted": "targeted_right", "target_gene": "target_gene_right"}), left_on="right_bin", right_on="bin",  how="left"
		  )
data_labeled = data_labeled.with_columns(cs.matches("target").fill_null("unlabeled"))
data_labeled.height

566

In [16]:
data_labeled = data_labeled.with_columns(pl.when(pl.all_horizontal(cs.matches("targeted").str.contains("positive"))).then(pl.lit("positive - positive"))
						 .when(pl.all_horizontal(cs.matches("targeted").str.contains("negative"))).then(pl.lit("negative - negative"))
						 .when(pl.all_horizontal(cs.matches("targeted").str.contains("target"))).then(pl.lit("target - target"))
						 .when(pl.all_horizontal(cs.matches("targeted").str.contains("unlabeled"))).then(pl.lit("other - other"))
						 .when(pl.any_horizontal(cs.matches("targeted").str.contains("positive")) & pl.any_horizontal(cs.matches("targeted").str.contains("negative"))).then(pl.lit("positive - negative"))
						 .when(pl.any_horizontal(cs.matches("targeted").str.contains("positive")) & pl.any_horizontal(cs.matches("targeted").str.contains("target"))).then(pl.lit("positive - target"))
						 .when(pl.any_horizontal(cs.matches("targeted").str.contains("negative")) & pl.any_horizontal(cs.matches("targeted").str.contains("target"))).then(pl.lit("negative - target"))
						 .when(pl.any_horizontal(cs.matches("targeted").str.contains("unlabeled")) & pl.any_horizontal(cs.matches("targeted").str.contains("target"))).then(pl.lit("target - other"))
						 .when(pl.any_horizontal(cs.matches("targeted").str.contains("unlabeled")) & pl.any_horizontal(cs.matches("targeted").str.contains("positive"))).then(pl.lit("positive - other"))
						 .when(pl.any_horizontal(cs.matches("targeted").str.contains("unlabeled")) & pl.any_horizontal(cs.matches("targeted").str.contains("negative"))).then(pl.lit("negative - other"))
						 .alias("label"))

In [17]:
data = data_labeled.with_columns(pl.when(pl.all_horizontal(cs.matches("screen").str.contains("PLS"))).then(pl.lit("PLS - PLS"))
						 .when(pl.any_horizontal(cs.matches("screen").str.contains("ELS")) & pl.any_horizontal(cs.matches("screen").str.contains("PLS"))).then(pl.lit("PLS - ELS"))
						 .when(pl.all_horizontal(cs.matches("screen").str.contains("ELS"))).then(pl.lit("ELS - ELS"))
						 .when(pl.all_horizontal(pl.any_horizontal(cs.matches("screen").str.contains("PLS")) & pl.any_horizontal(cs.matches("screen").str.contains("ELS|PLS").not_())))
			 			.then(pl.lit("PLS - undefined"))
						.when(pl.all_horizontal(pl.any_horizontal(cs.matches("screen").str.contains("ELS")) & pl.any_horizontal(cs.matches("screen").str.contains("ELS|PLS").not_())))
						.then(pl.lit("ELS - undefined"))
						 .when(pl.all_horizontal(cs.matches("screen").str.contains("PLS|ELS").not_())).then(pl.lit("undefined"))
						 .alias("interaction"))


In [18]:
data = data.select("logFC", "adj.P.Val", "P.Value", "left_bin", "right_bin", "nr_reads", "any_tss", 
"screen_left", "screen_right", "targeted_left", "targeted_right", "target_gene_left", "target_gene_right", "label", "interaction")

### Distance between prom and enh

In [19]:
plotting_data = data.filter(pl.col("right_bin").str.contains("null").not_())
le20mb = plotting_data.with_columns(chr1 = pl.col("left_bin").str.split("-").list.get(0),
				   start1 = pl.col("left_bin").str.split("-").list.get(1).cast(pl.Int32),
				   end1 = pl.col("left_bin").str.split("-").list.get(2).cast(pl.Int32),
				   chr2 = pl.col("right_bin").str.split("-").list.get(0),
				   start2 = pl.col("right_bin").str.split("-").list.get(1).cast(pl.Int32),
				   end2 = pl.col("right_bin").str.split("-").list.get(2).cast(pl.Int32)).with_columns( 
				   dist = np.abs((pl.col("end1")+pl.col("start1"))/2 - (pl.col("end2") + pl.col("start2"))/2))
le20mb = le20mb.filter((pl.col("chr1") == pl.col("chr2")) & 
		       (pl.col("dist") <= 2000000))
le20mb = le20mb.select(~cs.matches("1|2"))
data = pl.concat([le20mb,
				 data.filter(pl.col("right_bin").str.contains("null")).with_columns(dist = None)
				 ])

### Activity relative to promoter only

In [20]:
baited_interactions = data.filter(pl.col("label") != "other - other")
baited_interactions = baited_interactions.with_columns(
	target_genes = pl.concat_str(pl.col("target_gene_left"), pl.col("target_gene_right"),
	separator=",").str.replace("unlabeled,", "").str.replace(",unlabeled", ""))
baited_interactions = baited_interactions.select(pl.exclude("target_gene_left", "target_gene_right"))
baited_interactions = baited_interactions.filter(pl.col("target_genes").str.contains(",").not_())

In [21]:
# giving promoter and other end labels.
baited_interactions = baited_interactions.with_columns(
	pl.when(pl.col("targeted_left") == "unlabeled")
	.then(pl.col("right_bin")).otherwise(pl.col("left_bin")).alias("promoter"), 
	pl.when(pl.col("targeted_left") == "unlabeled")
	.then(pl.col("left_bin")).otherwise(pl.col("right_bin")).alias("OE"))

In [22]:
prom_only_logfcs = baited_interactions.filter(
	pl.col("right_bin").str.contains("null")).select("promoter", promoter_only = "logFC")
prom_only_logfcs = prom_only_logfcs.group_by("promoter").median()
relative_activity = baited_interactions.join(prom_only_logfcs, on="promoter").filter(pl.col("right_bin").str.contains("null").not_())
#relative_activity = relative_activity.with_columns(std = pl.col("logFC").std().over("promoter"))
relative_activity = relative_activity.with_columns(std = np.sqrt((pl.col("logFC")- pl.col("promoter_only"))
																 .pow(2).sum().over("promoter")/(pl.col("OE").count().over("promoter")-1)))

In [23]:
sign_changes = relative_activity.with_columns(
	pl.when(pl.col("logFC") > pl.col("promoter_only") + 2*pl.col("std"))
	.then(pl.lit("upregulating"))
	.when(pl.col("logFC") < pl.col("promoter_only") - 2*pl.col("std"))
	.then(pl.lit("downregulating"))
	.otherwise(pl.lit("no effect")).alias("effect"), 
	z_score = (pl.col("logFC") - pl.col("promoter_only"))/pl.col("std"))

In [24]:
data = data.join(sign_changes, on=["logFC", "adj.P.Val", "P.Value", "left_bin", "right_bin", "nr_reads", "any_tss",
								 "screen_left", "screen_right", "targeted_left", "targeted_right", "label", "interaction", "dist"], how="left")


### Enhancers between 250 and 270 bp for testing with minimal promoter

In [18]:
short_enhancers = short_enhancers.select(bin = pl.concat_str(["column_1", "column_2", "column_3"], separator="-"), short_enhancer=pl.lit("yes"))

NameError: name 'short_enhancers' is not defined

In [19]:
data.height

259267

In [20]:
effect_short_enhancers = pl.concat([data.join(short_enhancers, left_on="left_bin", right_on="bin"), data.join(short_enhancers, left_on="right_bin", right_on="bin")]
).select("logFC", "adj.P.Val", "left_bin", "right_bin", "screen_left", "screen_right", "interaction", "short_enhancer").filter((pl.col("logFC") > 1) | (pl.col("logFC")< -1))

In [21]:
effect_short_enhancers.height

750

## Write to file

In [25]:
data.write_csv("../../results/MPRA_analysis/CMPRA5/labeled_data_"+config+".tsv", separator="\t")

In [26]:
data.head()

logFC,adj.P.Val,P.Value,left_bin,right_bin,nr_reads,any_tss,screen_left,screen_right,targeted_left,targeted_right,target_gene_left,target_gene_right,label,interaction,dist,target_genes,promoter,OE,promoter_only,std,effect,z_score
f64,f64,f64,str,str,i64,str,str,str,str,str,str,str,str,str,f64,str,str,str,f64,f64,str,f64
-1.024541,0.000103,4.1e-05,"""chr1_KI270711v1_random-30444-30696""","""chr1_KI270711v1_random-24033-24439""",5,"""no""","""none""","""none""","""unlabeled""","""unlabeled""","""unlabeled""","""unlabeled""","""other - other""","""undefined""",6334.0,,,,,,,
0.806423,0.000407,0.000185,"""chr1-12618708-12619501""","""chr1-12618115-12618644""",5,"""yes""","""pELS,CTCF-bound,PLS,CTCF-bound""","""PLS,CTCF-bound""","""target""","""target""","""DHRS3""","""DHRS3""","""target - target""","""PLS - PLS""",725.0,,,,,,,
0.046924,0.813128,0.782962,"""chr21-29073301-29073793""","""chr21-29073794-29074184""",11,"""yes""","""PLS,CTCF-bound""","""PLS,CTCF-bound,pELS,CTCF-bound""","""target""","""target""","""CCT8""","""CCT8""","""target - target""","""PLS - PLS""",442.0,,,,,,,
0.22131,0.2063,0.161537,"""chr8-120445458-120445652""","""chr8-120444710-120445493""",11,"""yes""","""PLS,CTCF-bound""","""pELS,CTCF-bound,PLS,CTCF-bound""","""target""","""target""","""MRPL13""","""MRPL13""","""target - target""","""PLS - PLS""",453.5,,,,,,,
-0.197629,0.455196,0.403725,"""chr20-45691412-45691692""","""chr20-45683896-45684648""",5,"""no""","""none""","""none""","""unlabeled""","""unlabeled""","""unlabeled""","""unlabeled""","""other - other""","""undefined""",7280.0,,,,,,,
