In [1]:
import numpy as np
import sys
# Add './' (the current working directory) to the module search path;
# presumably this is where the helper module imported below lives.
sys.path.append('./')
import auxiliary_bed_functions as abf
# Chromosome list provided by the helper module.
# NOTE(review): this module-level copy is not referenced again in the visible
# cells; getAllOverlappingGenes calls abf.getChrList() itself.
chr_list = abf.getChrList()

In [2]:
def getAllOverlappingGenes(region_array, gene_array):
    """Collect the names of all genes whose interval overlaps any region.

    Both inputs are indexed by chromosome position in ``abf.getChrList()``:
    ``region_array[c]`` and ``gene_array[c]`` hold the entries for the c-th
    chromosome. Within an entry, index 1 and index 2 are compared as the
    interval start and end, and index 4 of a gene entry is the value that is
    collected (the gene name).

    Intervals are treated as closed: two intervals that only share an
    endpoint still count as overlapping.

    Parameters
    ----------
    region_array : sequence of sequences
        Per-chromosome lists of region entries.
    gene_array : sequence of sequences
        Per-chromosome lists of gene entries.

    Returns
    -------
    list
        Gene names in order of first overlap, without duplicates.
    """
    chr_list = abf.getChrList()
    overlap_genes = []
    # Mirror of overlap_genes used only for constant-time membership checks;
    # the list itself keeps the first-seen order.
    seen = set()
    for c in range(len(chr_list)):
        for region in region_array[c]:
            for gene in gene_array[c]:
                # No overlap when the region starts after the gene ends or
                # ends before the gene starts.
                if region[1] > gene[2] or region[2] < gene[1]:
                    continue
                name = gene[4]
                if name not in seen:
                    seen.add(name)
                    overlap_genes.append(name)
    return overlap_genes

# Read a DESeq2 output file
def readDEFile(DE_file):
    """Read a comma-separated DESeq2 results file into a list of rows.

    Lines starting with '#' and blank lines are skipped. Every other line is
    split on ','. Only the first field (the gene identifier) has its
    surrounding double quotes removed; all remaining fields are returned as
    the raw strings found in the file. No header detection is performed, so a
    header line comes back as an ordinary row.

    Parameters
    ----------
    DE_file : str
        Path to the results file.

    Returns
    -------
    list of list of str
        One list of fields per retained line, in file order.
    """
    DE_array = []
    # The context manager closes the file even if parsing raises.
    with open(DE_file, 'r') as DEf:
        for line in DEf:
            if line.startswith('#'):
                continue
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            fields = stripped.split(',')
            first = fields[0].strip()
            pieces = first.split('"')
            # A quoted identifier splits into ['', name, '']; an unquoted one
            # yields a single piece and is kept unchanged.
            fields[0] = pieces[1] if len(pieces) > 1 else first
            DE_array.append(fields)
    return DE_array

# Given a gene list and a differential expression table,
# select only the genes passing the provided thresholds
def intersectwDE(gene_list, DE_list, padj, log2FC_cut):
    """Keep the genes of ``gene_list`` that pass the DE thresholds.

    For each gene, the first row of ``DE_list`` with that name whose adjusted
    p-value is not 'NA', is below ``padj``, and whose absolute log2 fold
    change exceeds ``log2FC_cut`` is used. Rows for the same gene that fail
    the thresholds are skipped and later rows are still considered.

    Parameters
    ----------
    gene_list : iterable
        Gene names to look up. Duplicates are reported once per occurrence.
    DE_list : list of list of str
        Rows of the DE table; column 0 is the gene name, column 2 the log2
        fold change and column 6 the adjusted p-value.
    padj : float
        Adjusted p-value threshold (rows must be strictly below it).
    log2FC_cut : float
        Threshold on the absolute value of the log2 fold change (rows must
        be strictly above it).

    Returns
    -------
    list of [gene, float]
        ``[gene, log2FC]`` pairs in the order of ``gene_list``.
    """
    # Column positions read from each DE_list row.
    NAME_COL, LOG2FC_COL, PADJ_COL = 0, 2, 6

    gene_list = list(gene_list)
    wanted = set(gene_list)
    if not wanted:
        return []

    # Single pass over the table: remember the first passing row per gene.
    # Only rows for requested genes are parsed, so unrelated rows (such as a
    # header line) are never passed to float().
    first_hit = {}
    for row in DE_list:
        name = row[NAME_COL]
        if name not in wanted or name in first_hit:
            continue
        if row[PADJ_COL] == 'NA':
            continue
        if float(row[PADJ_COL]) < padj and abs(float(row[LOG2FC_COL])) > log2FC_cut:
            first_hit[name] = float(row[LOG2FC_COL])

    return [[gene, first_hit[gene]] for gene in gene_list if gene in first_hit]

In [3]:
# Read in a set of genomic regions with epigenetic alterations/features.
# NOTE(review): the trailing comment on the next line names what looks like
# an alternative input file (leio_TD_final_11_20_19_pos3.txt) - confirm which
# input is intended before re-running.
reg_regions = abf.readBed('./leiomyoma_HOSVD_final_neg_loc4.txt')#leio_TD_final_11_20_19_pos3.txt)

# Read in the regulatory regions for each canonical gene
gene_regions = abf.readBed('./gene_TSS_regulatory_regions10kb.tsv')

# Find all genes overlapping the epigenetic regions. Regions are treated
# as closed intervals here, i.e. intervals that only share an endpoint
# still count as overlapping.
nearby_genes = getAllOverlappingGenes(reg_regions, gene_regions)

In [4]:
# Read in the differential expression results
gene_DE = readDEFile('./leiomyoma_paired_pt_DESeq2_results.csv')

# Read in the set of expressed genes to use for filtering.
# The builtin `str` is used instead of the `np.str` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24 (it was only ever an
# alias for the builtin, so the resulting array is the same).
expressed_genes = np.loadtxt('./expressedGenes_final_any.txt', dtype=str)

# Select the genes that overlap regulatory regions and are
# called as differentially expressed: adjusted p-value < 0.01 and
# |log2 fold change| > 0.5.
reg_genes = intersectwDE(nearby_genes, gene_DE, 0.01, 0.5)

In [5]:
# List the expressed genes that lie near epigenetic alterations and have a
# positive log2 fold change in the DE table, then report how many there are.
N = 0
for gene_name, log2fc in reg_genes:
    is_higher = log2fc > 0.0
    if is_higher and gene_name in expressed_genes:
        print(gene_name)
        N += 1
print(N)

NTNG1
GPSM2
FAM212B
PHGDH
RGS4
LOC100506023
PRDX6
TGFB2
LBR
VSNL1
OSR1
CYP1B1
BCL11A
LRRTM1
RPRM
COL3A1
COL5A2
SATB2
FZD7
FN1
SPEG
IRS1
ITM2C
COL6A3
THRB-AS1
CTNNB1
LRRC2
PHLDB2
POPDC2
SHOX2
SMC4
LINC00578
ZMAT3
PPARGC1A
GABRA2
GABRB1
HOPX
PDGFC
GRIA2
ASB5
SPCS3
SEMA5A
PART1
PDE8B
LHFPL2
SERINC5
VCAN
PRR16
RBM24
SOX4
PRL
CDKN1A
HTR1E
FAM229B
TUBE1
CLVS2
CENPW
EYA4
HOXA10
HOXA13
SFRP4
WBSCR17
MEST
CHRM2
SCARA3
UNC5D
SFRP1
SNAI2
PLAG1
SULF1
MMP16
SLC26A7
TP53INP1
LRP12
TRPS1
AARD
TMEM215
GOLM1
C9orf47
S1PR3
GARNL3
TACR2
KCNMA1
FAS
PSD
PDCD4-AS1
ST5
TEAD1
PDE3B
LUZP2
CCND1
ANO1
PGM2L1
ME3
NCAM1
CACNA1C
PTHLH
FAM60A
HMGA2
TRHDE
TRHDE-AS1
SOCS2-AS1
CCDC60
EFNB2
COL4A1
COL4A2
PRKD1
LRFN5
FRMD6-AS2
FRMD6
RIN3
ASB2
LINC00221
FGF7
ATP8B4
CA12
MEX3B
NKD1
SALL1
CDH8
CDH11
PDP2
CDH3
ZFHX3
KCNAB3
CNTROB
MYOCD
FZD2
FAM20A
CBX8
COLEC12
GATA6
SLC24A3
KCNG1
TFAP2C
PCP4
FAM118A
GRPR
NDP
MUM1L1
COL4A6
COL4A5
CHRDL1
CAPN6
DCX
LINC00890
146


In [7]:
# List the expressed genes that lie near epigenetic alterations and have a
# negative log2 fold change in the DE table, then report how many there are.
N = 0
for gene_name, log2fc in reg_genes:
    if not (log2fc < 0.0):
        continue
    if gene_name in expressed_genes:
        print(gene_name)
        N += 1
print(N)

PIK3R3
RGL1
MBNL1-AS1
PDLIM5
PLN
TEX15
GAS1
WEE1
LOC101054525
DCP1B
FRY
TSC22D1
12


In [8]:
# List every expressed gene that lies near an epigenetic alteration,
# regardless of its differential expression status.
for gene in nearby_genes:
    if gene not in expressed_genes:
        continue
    print(gene)

PUM1
PIK3R3
SLC35D1
SSX2IP
PTBP2
NTNG1
GPSM2
RAP1A
FAM212B
DDX20
OLFML3
CASQ2
PHGDH
CTSK
TPM3
DDR2
RGS4
RGS5
PBX1
ATP1B1
DNM3OS
LOC100506023
PRDX6
IER5
ARPC5
RGL1
CSRP1
LMOD1
TIMM17A
PROX1
TGFB2
LYPLAL1
LBR
ENAH
ITPKB
IRF2BP2
VSNL1
OSR1
CDC42EP3
CYP1B1
BCL11A
EHBP1
MEIS1
ACTG2
LRRTM1
RNF103
HNMT
RPRM
ZAK
COL3A1
COL5A2
SATB2
FZD7
NRP2
FN1
DES
SPEG
IRS1
ITM2C
HTR2B
COPS8
COL6A3
RAMP1
THRB-AS1
THRB
CTNNB1
LRRC2
PHLDB2
CCDC80
POPDC2
DTX3L
PARP9
C3orf58
MBNL1-AS1
SHOX2
RSRC1
IFT80
SMC4
BCHE
LINC00578
ZMAT3
CLDN1
PPARGC1A
RBPJ
TLR1
GABRA2
GABRB1
HOPX
ANTXR2
PDLIM5
SYNPO2
PDGFC
GRIA2
NPY1R
HAND2
HPGD
ASB5
SPCS3
SEMA5A
PRLR
RICTOR
GPX8
PDE4D
PART1
PIK3R1
PDE8B
LHFPL2
SERINC5
VCAN
PRR16
LRRTM2
SGCD
KCNMB1
RBM24
SOX4
PRL
HIST1H2BG
HIST1H2AE
HIST1H3E
CDKN1A
BAG2
PTP4A1
FILIP1
HTR1E
PRDM1
METTL24
FAM229B
TUBE1
PLN
CLVS2
CENPW
RSPO3
EYA4
FBXO30
LOC100507557
ESR1
ZDHHC14
AP5Z1
HOXA10
HOXA11
HOXA13
SFRP4
EPDR1
WBSCR17
SEMA3C
TES
CALU
MESTIT1
MEST
CALD1
CHRM2
PTN
SCARA3
RBPMS
TEX15
UNC5D
SFRP1
SNAI2
P