In [1]:
# To build a negatives BED file (neg_bed) that does not overlap the positives, use bedtools:
#   bedtools subtract -A -a gw_all_chrms.bed -b peaks.bed > neg_bed.bed
# where peaks.bed contains the peaks (positive regions) and
# gw_all_chrms.bed contains candidate regions from the genome (e.g. a complete list of 500bp regions covering the whole genome).
# With -A, any candidate region that overlaps a peak at all is removed entirely rather than trimmed.

In [2]:
!mkdir gr_a549_2hr
!bedtools subtract -A -a ../data/genome/hg38/gw_all_chrms.bed.gz -b /oak/stanford/groups/akundaje/amr1/pho4/data/a549_GR/tseries/2hr/2hr.bed.gz > gr_a549_2hr/negs.bed

In [3]:
# --- reference genome ---------------------------------------------------
# FASTA opened with pysam, and the tab-separated "chrom<TAB>length" table.
chrmsizes_file = '../data/genome/hg38/hg38.chrom.sizes'
genome_file = '../data/genome/hg38/hg38.genome.fa'

# --- region sets --------------------------------------------------------
# Positives (peaks) and the candidate negatives produced by bedtools subtract.
neg_bed_path = 'gr_a549_2hr/negs.bed'
bed_path = '/oak/stanford/groups/akundaje/amr1/pho4/data/a549_GR/tseries/2hr/2hr.bed.gz'

# --- output -------------------------------------------------------------
# Combined BED: positives + GC-matched negatives + jittered positives.
output_path = 'gr_a549_2hr/matched.bed'

# Number of jittered copies generated for each positive region.
num_jitters = 9

In [4]:
import pysam
import pandas as pd
import numpy as np


# GC fraction of every positive region.
# `ref` stays open on purpose: the negative-set cell fetches from it as well.
ref = pysam.FastaFile(genome_file)

# Columns 0-2 (chrom, start, end) become the index, so each index entry is one interval tuple.
data = pd.read_csv(bed_path, header=None, sep='\t', index_col=[0, 1, 2])
bed_entries = list(data.index)

gc_fracts = []
for chrom, start, end in bed_entries:
    # Upper-case first so soft-masked (lower-case) bases are counted too.
    bases = ref.fetch(chrom, start, end).upper()
    n_gc = bases.count('G') + bases.count('C')
    gc_fracts.append(n_gc / float(len(bases)))

In [5]:
# search in negative set to get GC matched negatives
def _match_gc_negatives(pos_gc, neg_gc, seed=1234):
    """Pick one negative per positive with the closest GC fraction.

    Negatives are shuffled (seeded) and then stably sorted by GC, so exact GC
    ties are broken at random instead of always by file order. Each negative
    is used at most once; reuse only starts if there are more positives than
    negatives.

    Parameters
    ----------
    pos_gc : sequence of float
        GC fraction of each positive region.
    neg_gc : sequence of float
        GC fraction of each candidate negative region.
    seed : int
        Seed for the tie-breaking shuffle.

    Returns
    -------
    list of int
        For each positive, the index into ``neg_gc`` of its matched negative.

    Raises
    ------
    ValueError
        If there are positives to match but no candidate negatives.
    """
    neg_gc = np.asarray(neg_gc, dtype=float)
    n_neg = len(neg_gc)
    if n_neg == 0:
        if len(pos_gc) == 0:
            return []
        raise ValueError("no candidate negatives to match against")

    # Shuffle first, then stable-sort: equal-GC negatives end up in random order.
    order = np.random.RandomState(seed).permutation(n_neg)
    order = order[np.argsort(neg_gc[order], kind="mergesort")]
    sorted_gc = neg_gc[order]

    used = np.zeros(n_neg, dtype=bool)
    n_used = 0
    chosen = []
    for target in pos_gc:
        if n_used == n_neg:
            # Every negative has been consumed once; allow another round of reuse.
            used[:] = False
            n_used = 0
        # Nearest unused neighbours on either side of the insertion point.
        right = int(np.searchsorted(sorted_gc, target))
        left = right - 1
        while left >= 0 and used[left]:
            left -= 1
        while right < n_neg and used[right]:
            right += 1
        # At least one side is valid here because n_used < n_neg.
        if right >= n_neg:
            pick = left
        elif left < 0:
            pick = right
        elif target - sorted_gc[left] <= sorted_gc[right] - target:
            pick = left
        else:
            pick = right
        used[pick] = True
        n_used += 1
        chosen.append(int(order[pick]))
    return chosen


# GC fraction of every candidate negative region.
neg_data = pd.read_csv(neg_bed_path, header=None, sep='\t', index_col=[0, 1, 2])
neg_bed_entries = [i for i in neg_data.index]
neg_gc_fracts = []
for entry in neg_bed_entries:
    seq = ref.fetch(entry[0], entry[1], entry[2]).upper()
    gc_fract = (seq.count('G') + seq.count('C')) / float(len(seq))
    neg_gc_fracts.append(gc_fract)
neg_gc_fracts = np.array(neg_gc_fracts)

# One distinct GC-matched negative per positive, in the same order as bed_entries.
neg_set = [neg_bed_entries[idx] for idx in _match_gc_negatives(gc_fracts, neg_gc_fracts)]

  mask |= (ar1 == a)


In [6]:
# augment with additional positives by jittering until desired pos_to_neg ratio is achieved
def _read_chromsizes(path=None):
    """Load a tab-separated ``chrom<TAB>length`` table into a dict.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the notebook-level ``chrmsizes_file``.

    Returns
    -------
    dict
        Maps chromosome name (str) to chromosome length (int).
    """
    if path is None:
        path = chrmsizes_file
    chrom_to_size = {}
    # Context manager so the handle is closed even if a line fails to parse.
    with open(path) as handle:
        for row in handle:
            row = row.rstrip()
            if not row:
                continue  # tolerate blank lines, e.g. a trailing newline
            chrom, chromlen = row.split("\t")
            chrom_to_size[chrom] = int(chromlen)
    return chrom_to_size

rng = np.random.RandomState(1234)  # fixed seed so the jittered set is reproducible
maxshift = 100                     # largest shift (bp) applied in either direction
chromsizes = _read_chromsizes()

# num_jitters shifted copies of every positive region.
a_list = []
for chrom, start, end in bed_entries:
    for _ in range(num_jitters):
        # Uniform integer in [-maxshift, maxshift]; randint's upper bound is exclusive.
        shift_size = int(rng.randint(-maxshift, maxshift + 1))
        # Clamp so the shifted interval stays within [0, chromosome length].
        shift_size = max(-start, shift_size)
        shift_size = min(chromsizes[chrom] - end, shift_size)
        # Shift relative to the ORIGINAL interval: start/end are not overwritten,
        # so every copy stays within maxshift of the peak instead of drifting.
        a_list.append((chrom, start + shift_size, end + shift_size))

In [7]:
# Write positives, GC-matched negatives and jittered positives, in that order,
# as 4-column BED lines (chrom, start, end, "."). The context manager guarantees
# the file is flushed and closed even if a write fails part-way.
with open(output_path, "w") as out_bed:
    for regions in (bed_entries, neg_set, a_list):
        for chrom, start, end in regions:
            out_bed.write(chrom+"\t"+str(start)+"\t"+str(end)+"\t.\n")

In [8]:
# cat pho4.pbexo.matched.bed | egrep -w 'chrX|chrXI' | gzip -c > test_1000_around_summits.bed.gz
# cat pho4.pbexo.matched.bed | egrep -w 'chrXIV|chrVIII' | gzip -c > valid_1000_around_summits.bed.gz
# cat pho4.pbexo.matched.bed | egrep -w -v 'chrX|chrXI|chrXIV|chrVIII' | gzip -c > train_1000_around_summits.bed.gz

In [9]:
!cat gr_a549_2hr/matched.bed | egrep -w 'chr1|chr8|chr21' | gzip -c > gr_a549_2hr/test_1000_around_summits.bed.gz
!cat gr_a549_2hr/matched.bed | egrep -w 'chr22' | gzip -c > gr_a549_2hr/valid_1000_around_summits.bed.gz
!cat gr_a549_2hr/matched.bed | egrep -w -v 'chr1|chr8|chr21|chr22' | gzip -c > gr_a549_2hr/train_1000_around_summits.bed.gz