In [12]:
import json

import numpy as np
import pandas as pd
from pybedtools.bedtool import BedTool as pbt
from scipy import stats

### 1. Prep borders

In [207]:
# Load the annotated TAD boundary tables for the "plus" and "minus" boundary sets.
border_plus = pd.read_feather("/tank/projects/diana_hic/tads/boundaries_plus_ctcf_compartments_annotated.feather")
border_minus = pd.read_feather("/tank/projects/diana_hic/tads/boundaries_minus_ctcf_compartments_annotated.feather")
# Columns 0, 1, 2 and 5 of each table are handed to bedtools and every interval is
# extended by 15 kb on both sides (slop). These two objects are not referenced again
# in the visible cells; make_a_b / make_all_expressed redo the same slop internally.
boundaries_minus_sploped = pbt.from_dataframe(border_minus.iloc[:, [0,1,2, 5]]).slop(b=15000, g="/tank/projects/diana_hic/chrom.sizes")
boundaries_plus_sploped = pbt.from_dataframe(border_plus.iloc[:, [0,1,2, 5]]).slop(b=15000, g="/tank/projects/diana_hic/chrom.sizes")
# Row counts of the two tables, displayed as (minus, plus).
border_minus.shape[0], border_plus.shape[0]

(5278, 5302)

### 2. Prep genes for the intersection

In [219]:
def make_tss(pc_DE_genes_minus):
    """Replace every gene interval by its 1-bp strand-aware upstream flank.

    The frame is converted to a BedTool and passed through ``bedtools flank``
    with ``s=True, l=1, r=0``, i.e. one base on the strand-specific "left"
    side of each interval, which is used here as the TSS position.

    Parameters
    ----------
    pc_DE_genes_minus : pandas.DataFrame
        BED-like table of genes. Presumably chrom/start/end come first and the
        strand sits in the sixth column, as bedtools expects -- confirm for
        new inputs.

    Returns
    -------
    pandas.DataFrame
        The flanked intervals read back from the bedtools output file, with
        the same column names as the input frame.
    """
    column_names = pc_DE_genes_minus.columns.tolist()
    flanked = pbt.from_dataframe(pc_DE_genes_minus).flank(
        g="/tank/projects/diana_hic/chrom.sizes",
        s=True,
        l=1,
        r=0,
    )
    return pd.read_table(flanked.fn, header=None, names=column_names)
    
def prep_genes(path_gene, make_tss_from_genes = True):
    """Read a headerless, tab-separated 6-column gene file and optionally reduce it to TSSs.

    Parameters
    ----------
    path_gene : str
        Path to the tab-separated file (no header row, six columns).
    make_tss_from_genes : bool, default True
        When true, the table is passed through ``make_tss`` before being
        returned; otherwise the intervals are returned as stored in the file.

    Returns
    -------
    pandas.DataFrame
        Table with columns ``chrom2, start2, end2, gene, log2FC, strand``.
    """
    bed_columns = ["chrom2", "start2", "end2", "gene", "log2FC", "strand"]
    genes = pd.read_csv(path_gene, header=None, sep="\t")
    # Assigning .columns (rather than names=) keeps the hard failure on files
    # that do not have exactly six columns.
    genes.columns = bed_columns
    if not make_tss_from_genes:
        return genes
    return make_tss(genes)

In [126]:
def prep_full_annotation():
    """Join gene id / name / type metadata with the GENCODE v41 BED annotation.

    Reads the headerless GENCODE BED file (six columns, named here
    ``chr, start, end, gene.id, smth, strand``) and the RNA-seq counts table,
    then merges the ``gene.id``, ``gene.name`` and ``gene.type`` columns of
    the counts table with the BED rows on ``gene.id``.

    Returns
    -------
    pandas.DataFrame
        One row per matching ``gene.id`` pair, carrying name, type,
        coordinates and strand.
    """
    strand_table = pd.read_csv(
        "/tank/projects/diana_hic/rna-seq/rnaseq_results/GENCODE.v41.annotation.bed",
        sep="\t",
        header=None,
    )
    strand_table.columns = ["chr", "start", "end", "gene.id", "smth", "strand"]

    counts_meta = pd.read_csv(
        "/tank/projects/diana_hic/rna-seq/rnaseq_results/rnaseq.rizzardi2019.counts.GENCODE.autosomes_and_chrX.with_meta.tsv",
        sep="\t",
    )
    gene_labels = counts_meta[["gene.id", "gene.name", "gene.type"]]
    return gene_labels.merge(strand_table, on="gene.id")

In [216]:
def prep_expressed_rizzardi(make_tss_from_genes = True):
    """Load the expressed-gene table, attach strand, and optionally reduce it to TSSs.

    Strand is taken from the de-duplicated ``gene.name`` / ``strand`` pairs of
    ``prep_full_annotation()`` and merged onto the expressed-gene CSV on the
    columns the two tables share.

    Parameters
    ----------
    make_tss_from_genes : bool, default True
        When true, the merged table is passed through ``make_tss``.

    Returns
    -------
    pandas.DataFrame
        Expressed genes with a ``strand`` column, as stored intervals or as
        TSS positions depending on the flag.
    """
    strand_lookup = (
        prep_full_annotation()[["gene.name", "strand"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    expressed = pd.read_csv(
        "/tank/projects/diana_hic/rna-seq/rnaseq_results/rizzardi_all_genes_with_coordinates_expressed_short.csv"
    )
    expressed = expressed.merge(strand_lookup)
    if not make_tss_from_genes:
        return expressed
    return make_tss(expressed)

In [230]:
# DE gene sets reduced to TSS positions: "plus" is read from the upr_in_neu file,
# "minus" from the upr_in_gli file.
pc_DE_genes_plus = prep_genes('/tank/projects/diana_hic/rna-seq/rnaseq_results/rizzardi.pc_DE_genes.upr_in_neu.bed', make_tss_from_genes=True)
pc_DE_genes_minus = prep_genes('/tank/projects/diana_hic/rna-seq/rnaseq_results/rizzardi.pc_DE_genes.upr_in_gli.bed', make_tss_from_genes=True)

# The same two DE gene sets, kept as the intervals stored in the BED files.
pc_DE_genes_plus_genes = prep_genes('/tank/projects/diana_hic/rna-seq/rnaseq_results/rizzardi.pc_DE_genes.upr_in_neu.bed', make_tss_from_genes=False)
pc_DE_genes_minus_genes = prep_genes('/tank/projects/diana_hic/rna-seq/rnaseq_results/rizzardi.pc_DE_genes.upr_in_gli.bed', make_tss_from_genes=False)

# Expressed-gene table, once as TSS positions and once as stored intervals.
rizzardi_expressed = prep_expressed_rizzardi()
rizzardi_expressed_genes = prep_expressed_rizzardi(make_tss_from_genes = False)

In [224]:
def prep_de_genes(make_tss_from_genes = True, rizz_file = None, minus_de_file = None, plus_de_file = None):
    """Build the "expressed but not DE" protein-coding background for both DE sets.

    For each DE set, the expressed-gene table is filtered to rows whose
    ``gene.name`` is absent from that set's ``gene`` column, its six columns
    are renamed to ``chrom2, start2, end2, gene, type, strand``, and only rows
    with ``type == "protein_coding"`` are kept.

    Parameters
    ----------
    make_tss_from_genes : bool, default True
        Selects which notebook-level frames are used when a table is not
        passed explicitly: the TSS versions (``rizzardi_expressed``,
        ``pc_DE_genes_minus``, ``pc_DE_genes_plus``) when true, the
        ``*_genes`` versions otherwise.
    rizz_file : pandas.DataFrame, optional
        Expressed-gene table with six columns and a ``gene.name`` column.
    minus_de_file, plus_de_file : pandas.DataFrame, optional
        DE gene tables with a ``gene`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(expressed_NOT_de_plus, expressed_NOT_de_minus)``, in that order.
    """
    # Fall back to the notebook-level frames only for tables the caller did not supply.
    if rizz_file is None:
        rizz_file = rizzardi_expressed if make_tss_from_genes else rizzardi_expressed_genes
    if minus_de_file is None:
        minus_de_file = pc_DE_genes_minus if make_tss_from_genes else pc_DE_genes_minus_genes
    if plus_de_file is None:
        plus_de_file = pc_DE_genes_plus if make_tss_from_genes else pc_DE_genes_plus_genes

    background_columns = ["chrom2", "start2", "end2", "gene", "type", "strand"]

    def _not_de_protein_coding(de_frame):
        # Filter against the DE table selected above; previously the TSS globals
        # were always used here and the selected tables were left unused.
        is_not_de = ~rizz_file["gene.name"].isin(de_frame["gene"].tolist())
        not_de = rizz_file[is_not_de].copy()
        not_de.columns = background_columns
        return not_de[not_de["type"] == "protein_coding"]

    expressed_NOT_de_plus = _not_de_protein_coding(plus_de_file)
    expressed_NOT_de_minus = _not_de_protein_coding(minus_de_file)
    return expressed_NOT_de_plus, expressed_NOT_de_minus

In [231]:
# Build the non-DE protein-coding background twice: from stored gene intervals
# and from TSS positions. The row counts of each pair are printed as (plus, minus).
expressed_NOT_de_plus_genes, expressed_NOT_de_minus_genes = prep_de_genes(make_tss_from_genes = False)
print(expressed_NOT_de_plus_genes.shape[0], expressed_NOT_de_minus_genes.shape[0])
expressed_NOT_de_plus, expressed_NOT_de_minus = prep_de_genes(make_tss_from_genes = True)
print(expressed_NOT_de_plus.shape[0], expressed_NOT_de_minus.shape[0])

13198 13162
13198 13162


### 3. Intersection

#### A. All borders

In [173]:
def make_a_b(expressed_not_de, border_file_init):
    """Count genes outside / inside the 15 kb-extended boundaries.

    Columns 0, 1, 2 and 5 of ``border_file_init`` are converted to a BedTool,
    extended by 15 kb on both sides, and intersected with the gene table
    (``wa=True, wb=True``). Genes are identified by the ``gene`` column.

    Parameters
    ----------
    expressed_not_de : pandas.DataFrame
        BED-like gene table with a ``gene`` column.
    border_file_init : pandas.DataFrame
        Boundary table with at least six columns.

    Returns
    -------
    tuple
        ``(a, b, genes)`` where ``b`` is the number of distinct genes that
        overlap an extended boundary, ``a`` is the number of distinct input
        genes minus ``b``, and ``genes`` lists the overlapping gene names.
    """
    border_core = border_file_init.iloc[:, [0, 1, 2, 5]]
    widened_borders = pbt.from_dataframe(border_core).slop(
        b=15000, g="/tank/projects/diana_hic/chrom.sizes"
    )
    gene_bed = pbt.from_dataframe(expressed_not_de)
    hits_bed = widened_borders.intersect(gene_bed, wa=True, wb=True)

    # Each output row is a boundary record followed by the overlapping gene record.
    hit_columns = border_core.columns.tolist() + expressed_not_de.columns.tolist()
    hits = pd.read_table(hits_bed.fn, header=None, names=hit_columns)

    genes_in_border = hits["gene"].unique().tolist()
    b = hits["gene"].nunique()
    a = expressed_not_de["gene"].nunique() - b
    return a, b, genes_in_border

In [174]:
def make_analysis(expressed_NOT_de_plus,pc_DE_genes_plus,  border_plus):
    """Fisher's exact test of DE vs. non-DE genes at the given boundaries.

    Builds the 2x2 table ``[[a, b], [c, d]]`` from two ``make_a_b`` calls:
    the first row is the non-DE background (outside, inside the extended
    boundaries), the second row the DE genes (outside, inside). The odds
    ratio and p-value are printed.

    Parameters
    ----------
    expressed_NOT_de_plus : pandas.DataFrame
        Non-DE background genes.
    pc_DE_genes_plus : pandas.DataFrame
        DE genes.
    border_plus : pandas.DataFrame
        Boundary table passed on to ``make_a_b``.

    Returns
    -------
    tuple
        ``(list_up_border, table)``: the DE genes overlapping a boundary and
        the 2x2 numpy array.
    """
    background_counts = make_a_b(expressed_NOT_de_plus, border_plus)
    de_counts = make_a_b(pc_DE_genes_plus, border_plus)

    background_out, background_in = background_counts[0], background_counts[1]
    de_out, de_in, list_up_border = de_counts

    table = np.array([[background_out, background_in],
                      [de_out, de_in]])
    oddsratio, pvalue = stats.fisher_exact(table)
    print("OddsR: ", oddsratio, "p-Value:", pvalue)
    return list_up_border, table

In [135]:
# Fisher's exact test on all "plus" boundaries: non-DE background (row 1) vs. DE genes (row 2),
# columns are (outside, inside) the extended boundaries.
# make_a_b also returns the list of overlapping genes; only the two counts are needed here,
# so the result is sliced instead of unpacked directly.
a, b = make_a_b(expressed_NOT_de_plus, border_plus)[:2]
c, d = make_a_b(pc_DE_genes_plus, border_plus)[:2]
table = np.array([[   a,  b],
                  [  c, d]  ])
oddsratio, pvalue = stats.fisher_exact(table)
print("OddsR: ", oddsratio, "p-Value:", pvalue)

OddsR:  1.0847889701932285 p-Value: 0.08782128700718764


In [136]:
# Fisher's exact test on all "minus" boundaries: non-DE background (row 1) vs. DE genes (row 2),
# columns are (outside, inside) the extended boundaries.
# make_a_b also returns the list of overlapping genes; only the two counts are needed here,
# so the result is sliced instead of unpacked directly.
a, b = make_a_b(expressed_NOT_de_minus, border_minus)[:2]
c, d = make_a_b(pc_DE_genes_minus, border_minus)[:2]
table = np.array([[   a,  b],
                  [  c, d]  ])
oddsratio, pvalue = stats.fisher_exact(table)
print("OddsR: ", oddsratio, "p-Value:", pvalue)

OddsR:  0.8240541054241616 p-Value: 0.0003195781577038139


##### All DE in borders

##### Genes

In [238]:
# Gene-interval based overlap: collect the DE genes that intersect an extended boundary
# in each set and store both lists in one JSON file.
a, b, de_genes_in_border_plus = make_a_b(pc_DE_genes_plus_genes, border_plus)
a, b, de_genes_in_border_minus = make_a_b(pc_DE_genes_minus_genes, border_minus)
de_genes_in_border = {
    "plus": de_genes_in_border_plus,
    "minus": de_genes_in_border_minus,
}
with open("/tank/projects/diana_hic/hic_project_notebooks/temp_files/de_genes_in_border_basedOnGene.json", 'w') as f:
    json.dump(de_genes_in_border, f)


##### TSS

In [186]:
# TSS based overlap: collect the DE genes whose TSS position intersects an extended
# boundary in each set and store both lists in one JSON file.
a, b, de_genes_in_border_plus = make_a_b(pc_DE_genes_plus, border_plus)
a, b, de_genes_in_border_minus = make_a_b(pc_DE_genes_minus, border_minus)
de_genes_in_border = {
    "plus": de_genes_in_border_plus,
    "minus": de_genes_in_border_minus,
}
with open("/tank/projects/diana_hic/hic_project_notebooks/temp_files/de_genes_in_border.json", 'w') as f:
    json.dump(de_genes_in_border, f)


#### All expressed

In [197]:
def make_all_expressed(border_file_init, rizzardi_all_genes_with_coordinates_expressed_short):
    """List the expressed genes that overlap the 15 kb-extended boundaries.

    Columns 0, 1, 2 and 5 of ``border_file_init`` are extended by 15 kb on
    both sides; the gene table is intersected against them with ``wa=True,
    wb=False``, so only the gene records are written out.

    Parameters
    ----------
    border_file_init : pandas.DataFrame
        Boundary table with at least six columns.
    rizzardi_all_genes_with_coordinates_expressed_short : pandas.DataFrame
        BED-like gene table with a ``gene.name`` column.

    Returns
    -------
    list
        Distinct ``gene.name`` values among the overlapping genes.
    """
    gene_columns = rizzardi_all_genes_with_coordinates_expressed_short.columns.tolist()
    widened_borders = pbt.from_dataframe(border_file_init.iloc[:, [0, 1, 2, 5]]).slop(
        b=15000, g="/tank/projects/diana_hic/chrom.sizes"
    )
    gene_bed = pbt.from_dataframe(rizzardi_all_genes_with_coordinates_expressed_short)
    overlapping = gene_bed.intersect(widened_borders, wa=True, wb=False)
    hits = pd.read_table(overlapping.fn, header=None, names=gene_columns)
    return hits["gene.name"].unique().tolist()

In [245]:
# All expressed genes overlapping an extended boundary, TSS based.
# NOTE(review): this cell previously passed `rizzardi_all_genes_with_coordinates_expressed_short`,
# a name that no visible cell assigns at notebook level. `rizzardi_expressed` (the TSS table) is
# used instead, by analogy with the TSS / `_basedOnGene` naming of the other output files -- confirm.
all_expressed_minus = make_all_expressed(border_minus, rizzardi_expressed)
all_expressed_plus = make_all_expressed(border_plus, rizzardi_expressed)
# Report both list sizes as (minus, plus).
print(len(all_expressed_minus), len(all_expressed_plus))

all_genes_in_border = {
    "plus": all_expressed_plus,
    "minus": all_expressed_minus,
}

with open("/tank/projects/diana_hic/hic_project_notebooks/temp_files/all_genes_in_border.json", 'w') as f:
    json.dump(all_genes_in_border, f)

In [246]:
# All expressed genes overlapping an extended boundary, based on the stored gene intervals.
all_expressed_minus = make_all_expressed(border_minus, rizzardi_expressed_genes)
all_expressed_plus = make_all_expressed(border_plus, rizzardi_expressed_genes)
# Report both list sizes as (minus, plus).
print(len(all_expressed_minus), len(all_expressed_plus))

all_genes_in_border = {
    "plus": all_expressed_plus,
    "minus": all_expressed_minus,
}

with open("/tank/projects/diana_hic/hic_project_notebooks/temp_files/all_genes_in_border_basedOnGene.json", 'w') as f:
    json.dump(all_genes_in_border, f)

#### B. Selected groups

In [138]:
# Reload the annotated boundary tables (same feather files as border_plus / border_minus)
# under the names used by the stratified analysis below.
boundaries_plus = pd.read_feather("/tank/projects/diana_hic/tads/boundaries_plus_ctcf_compartments_annotated.feather")
boundaries_minus = pd.read_feather("/tank/projects/diana_hic/tads/boundaries_minus_ctcf_compartments_annotated.feather")

In [148]:
# Distinct values of the in_compartment_border column of the "plus" boundary table.
boundaries_plus.in_compartment_border.unique().tolist()

['not in border', 'in border']

In [149]:
# Distinct values of the ctcf_peak column of the "plus" boundary table.
boundaries_plus.ctcf_peak.unique().tolist()

['has_ctcf_peak', 'no_ctcf_peak']

In [175]:
def make_intersect(expressed_NOT_de_plus,pc_DE_genes_plus, boundaries_plus):
    """Run ``make_analysis`` separately for each boundary class.

    Boundaries are split by every combination of ``in_compartment_border``
    ('not in border', 'in border') and ``ctcf_peak`` ('has_ctcf_peak',
    'no_ctcf_peak'). For each class the class name, the 2x2 table and the
    number of DE genes at those boundaries are printed.

    Parameters
    ----------
    expressed_NOT_de_plus : pandas.DataFrame
        Non-DE background genes.
    pc_DE_genes_plus : pandas.DataFrame
        DE genes.
    boundaries_plus : pandas.DataFrame
        Boundary table with ``in_compartment_border`` and ``ctcf_peak`` columns.

    Returns
    -------
    dict
        Maps ``"<compartment state> <ctcf state>"`` to the list of DE genes
        overlapping the boundaries of that class.
    """
    compartment_states = ['not in border', 'in border']
    ctcf_states = ['has_ctcf_peak', 'no_ctcf_peak']

    genes_by_group = {}
    for compartment_state in compartment_states:
        in_state = boundaries_plus.in_compartment_border == compartment_state
        for ctcf_state in ctcf_states:
            group_label = " ".join([compartment_state, ctcf_state])
            print(group_label)

            has_ctcf_state = boundaries_plus.ctcf_peak == ctcf_state
            subset = (
                boundaries_plus.loc[in_state & has_ctcf_state]
                .reset_index(drop=True)
                .copy()
            )
            de_genes_in_group, table = make_analysis(expressed_NOT_de_plus, pc_DE_genes_plus, subset)

            print(table)
            print("Length", len(de_genes_in_group))
            print("___")
            genes_by_group[group_label] = de_genes_in_group
    return genes_by_group
            

#### Full genes

In [234]:
# Stratified enrichment for the "plus" set, using the stored gene intervals.
list_up_border_plus_genes = make_intersect(expressed_NOT_de_plus_genes, pc_DE_genes_plus_genes, boundaries_plus)

not in border has_ctcf_peak
OddsR:  1.3868601776605263 p-Value: 2.0965679208305556e-14
[[10459  2733]
 [ 2856  1035]]
Length 1035
___
not in border no_ctcf_peak
OddsR:  1.1473087171678722 p-Value: 0.1110296981170362
[[12624   568]
 [ 3700   191]]
Length 191
___
in border has_ctcf_peak
OddsR:  1.1407484753338362 p-Value: 0.14968846615009157
[[12678   514]
 [ 3719   172]]
Length 172
___
in border no_ctcf_peak
OddsR:  0.9132394683487272 p-Value: 0.7661617259089786
[[13077   115]
 [ 3860    31]]
Length 31
___


In [236]:
# Stratified enrichment for the "minus" set, using the stored gene intervals.
list_up_border_minus_genes = make_intersect(expressed_NOT_de_minus_genes,pc_DE_genes_minus_genes, boundaries_minus)

not in border has_ctcf_peak
OddsR:  1.0014939665056533 p-Value: 0.977705068930901
[[11604  1551]
 [ 3541   474]]
Length 474
___
not in border no_ctcf_peak
OddsR:  0.8943158099277122 p-Value: 0.09144747528266259
[[11972  1183]
 [ 3689   326]]
Length 326
___
in border has_ctcf_peak
OddsR:  0.9743751092466352 p-Value: 0.7743731335213555
[[12480   675]
 [ 3814   201]]
Length 201
___
in border no_ctcf_peak
OddsR:  0.826741255144033 p-Value: 0.06085308818598399
[[12655   500]
 [ 3888   127]]
Length 127
___


In [237]:
# Save the gene-interval based per-class DE gene lists, one JSON file per set.
with open("/tank/projects/diana_hic/hic_project_notebooks/temp_files/de_genes_up_border_plus_basedOnGene.json", 'w') as f:
    json.dump(list_up_border_plus_genes, f)

with open("/tank/projects/diana_hic/hic_project_notebooks/temp_files/de_genes_up_border_minus_basedOnGene.json", 'w') as f:
    json.dump(list_up_border_minus_genes, f)

#### TSS

In [241]:
# Print the number of DE genes stored for each boundary class of the TSS-based "minus" result.
# NOTE(review): `list_up_border_minus` is assigned in a cell further down this notebook, so this
# cell fails on a top-to-bottom run from a fresh kernel -- consider moving it below that cell.
for i in list_up_border_minus.keys():
    print(len(list_up_border_minus[i]))

219
128
92
50


In [176]:
# Stratified enrichment for the "plus" set, using TSS positions.
list_up_border_plus = make_intersect(expressed_NOT_de_plus,pc_DE_genes_plus, boundaries_plus)

not in border has_ctcf_peak
OddsR:  1.1472664070627836 p-Value: 0.011725873965054721
[[11604  1588]
 [ 3363   528]]
Length 528
___
not in border no_ctcf_peak
OddsR:  0.8898000387200266 p-Value: 0.3702740338310667
[[12869   323]
 [ 3806    85]]
Length 85
___
in border has_ctcf_peak
OddsR:  1.0414464256561882 p-Value: 0.7568010182810392
[[12902   290]
 [ 3802    89]]
Length 89
___
in border no_ctcf_peak
OddsR:  0.644692421954603 p-Value: 0.2130441446625559
[[13129    63]
 [ 3879    12]]
Length 12
___


In [179]:
# Stratified enrichment for the "minus" set, using TSS positions.
list_up_border_minus = make_intersect(expressed_NOT_de_minus,pc_DE_genes_minus, boundaries_minus)

not in border has_ctcf_peak
OddsR:  0.8735252477583766 p-Value: 0.08809145713633293
[[12340   815]
 [ 3796   219]]
Length 219
___
not in border no_ctcf_peak
OddsR:  0.7590218931618967 p-Value: 0.005338995355280991
[[12608   547]
 [ 3887   128]]
Length 128
___
in border has_ctcf_peak
OddsR:  0.9057765602302148 p-Value: 0.4500553558520558
[[12823   332]
 [ 3923    92]]
Length 92
___
in border no_ctcf_peak
OddsR:  0.8127082117784344 p-Value: 0.2021717823568288
[[12954   201]
 [ 3965    50]]
Length 50
___


In [180]:
# Save the TSS-based per-class DE gene lists, one JSON file per set.
import json
with open("/tank/projects/diana_hic/hic_project_notebooks/temp_files/de_genes_up_border_plus.json", 'w') as f:
    json.dump(list_up_border_plus, f)

with open("/tank/projects/diana_hic/hic_project_notebooks/temp_files/de_genes_up_border_minus.json", 'w') as f:
    json.dump(list_up_border_minus, f)