# Random Background Generation

In [4]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"

### imports ###
import sys
import os
import pandas as pd
import numpy as np
import matplotlib
import itertools
import scipy
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
from sklearn import preprocessing
import sklearn
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
import matplotlib_venn
from sklearn.cross_validation import train_test_split
from random import shuffle
import threading
import time
from collections import Counter
### notebook specific configuration ###
%matplotlib inline
matplotlib.pylab.rcParams['savefig.dpi'] = 200
sys.setrecursionlimit(5000)
os.chdir('/gpfs/data01/glasslab/home/jtao/analysis/random_background_analysis/')
sns.set_context('notebook')
%load_ext autoreload
%autoreload 2
%env PATH=/gpfs/data01/glasslab/home/jtao/perl5/bin:/gpfs/data01/glasslab/home/jtao/software/anaconda3/bin:/home/jtao/software/bin:/usr/local/sbin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/bin:/gpfs/data01/glasslab/home/jtao/software/homer/bin:/gpfs/data01/glasslab/home/jtao/software/weblogo:/home/jtao/code/seq_merge_pipe:/home/vlink/mouse_strains/marge/shifting:/bioinformatics/glassutils/scripts:/bioinformatics/software/meme/bin:/home/jtao/software/lsgkm/bin

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: PATH=/gpfs/data01/glasslab/home/jtao/perl5/bin:/gpfs/data01/glasslab/home/jtao/software/anaconda3/bin:/home/jtao/software/bin:/usr/local/sbin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/bin:/gpfs/data01/glasslab/home/jtao/software/homer/bin:/gpfs/data01/glasslab/home/jtao/software/weblogo:/home/jtao/code/seq_merge_pipe:/home/vlink/mouse_strains/marge/shifting:/bioinformatics/glassutils/scripts:/bioinformatics/software/meme/bin:/home/jtao/software/lsgkm/bin


## Copy in Data

In [None]:
%%bash
# Copy the C57BL6J motif frames (score, sequence, strand, start, end, count,
# summedScore), the summary/annotation frames and the C57BL6J peak fasta from
# the cobinding_motif_analysis directory into the current working directory.
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_score_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_sequence_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_strand_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_start_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_end_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/summary_frame.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/annotation_frame.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/peak_sequences/C57BL6J.fa ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_count_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_summedScore_frame_C57BL6J.pickle ./

In [7]:
%%bash
# Copy the BALBCJ motif frames (score, sequence, strand, start, end, count,
# summedScore) and the BALBCJ peak fasta from the cobinding_motif_analysis
# directory into the current working directory.
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_score_frame_BALBCJ.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_sequence_frame_BALBCJ.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_strand_frame_BALBCJ.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_start_frame_BALBCJ.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_end_frame_BALBCJ.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/peak_sequences/BALBCJ.fa ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_count_frame_BALBCJ.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_summedScore_frame_BALBCJ.pickle ./

## Read in Score Files

### C57Bl6 Data

In [2]:
# Load the peak summary frame copied in above; later cells read its
# 'chr', 'start', 'end' columns and the '<strain>_<monomer>_<treatment>' columns.
summary_frame = pd.read_pickle('summary_frame.pickle')


## Create background peaks from genomic sequences from each chromosome

In [3]:
def getRandomBackground(target_positions, 
                        size_ratio = 1.0, 
                        tolerance = 0.01, 
                        N_threshold = 0.5,
                        genome_dir = './mm10_genome/'):
    '''
    Sample random genomic windows that do not overlap any target region and whose
    GC content is close to the pooled GC content of the target sequences.

    target_positions: 2D numpy array or list of lists, genomic coordinates of the
                      target sequences [[chr,start,end],...]
    size_ratio: float, number of background sequences relative to target sequences
    tolerance: float, max absolute difference between the pooled target GC fraction
               and the GC fraction of each accepted background window
    N_threshold: float, max number of N bases allowed in a background window,
                 expressed as a fraction of the mean target length
    genome_dir: str, directory holding one '<chrom>.fa' file per chromosome
                (chr1-chr19 and chrX)
    returns: list of [chrom, start, end, sequence]; sequence covers start..end
             inclusive (0-based string indices), i.e. mean target length + 1 bases.
             An empty list is returned when target_positions is empty.
    raises: ValueError if the targets contain no sequence or every chromosome is
            shorter than one background window.
    NOTE: sampling is rejection based and loops until enough windows are accepted;
          an unsatisfiable tolerance therefore never terminates.
    *** Uses mm10 genome taken from Homer ***
    '''
    # nothing to match against - the mean length below would divide by zero
    if len(target_positions) == 0:
        return []

    ### load genome into memory
    _chromosomes = ['chr1' , 'chr2' , 'chr3' , 'chr4' , 'chr5' , 
                    'chr6' , 'chr7' , 'chr8' , 'chr9' , 'chr10', 
                    'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 
                    'chr16', 'chr17', 'chr18', 'chr19', 'chrX']
    _chrom_size_dict = {}
    _chrom_seq_dict = {}
    for chrom in _chromosomes:
        with open(os.path.join(genome_dir, chrom + '.fa')) as f:
            f.readline() # first line is the fasta header
            seq = ''.join(line.strip().upper() for line in f)
        _chrom_size_dict[chrom] = len(seq)
        _chrom_seq_dict[chrom] = seq

    # {chr: boolean array}, True wherever a target sequence lies.
    # One byte per base and sized to the chromosome, instead of a 200 million
    # element float64 array per chromosome.
    target_chr_position_dict = {x: np.zeros(_chrom_size_dict[x], dtype=bool) for x in _chromosomes}

    ### mark target positions and retrieve target sequences
    target_sequences = []
    for pos in target_positions:
        chrom = pos[0]
        start = int(pos[1])
        end = int(pos[2])
        # use 0 indexing of position, versus 1 indexing used in fasta;
        # clamp so that start == 0 does not turn into the slice [-1:end]
        target_chr_position_dict[chrom][max(start - 1, 0):end] = True
        target_sequences.append(_chrom_seq_dict[chrom][start:end])

    ### calculate GC content and average length of the target sequences
    target_gc_count = sum(s.count('G') + s.count('C') for s in target_sequences)
    target_length_count = sum(len(s) for s in target_sequences)
    target_gc_content = target_gc_count/(target_length_count+0.0000001) # GC content of target sequences
    mean_target_length = int(np.floor(target_length_count/len(target_sequences))) # average length of target sequences
    if mean_target_length < 1:
        raise ValueError('target positions do not cover any genomic sequence')

    # background windows span randStart..randEnd inclusive
    window_length = mean_target_length + 1
    # only chromosomes that can hold a full window are eligible, otherwise the
    # rejection loop below could never leave that chromosome
    eligible_chromosomes = [x for x in _chromosomes if _chrom_size_dict[x] >= window_length]
    if not eligible_chromosomes:
        raise ValueError('no chromosome is long enough for a window of ' + str(window_length) + ' bp')

    ### select random genomic loci such that they do not overlap target sequences
    numSelected = 0
    numToSelect = len(target_positions) * size_ratio # candidate pool of background seqs is size_ratio X larger
    candidate_positions = []
    numNallowed = int(N_threshold * mean_target_length) # number of allowable Ns
    counter = 0
    while numSelected < numToSelect:
        if counter % 100000 == 0:
            print(counter, numSelected)
        # select random chromosome
        randChrom = eligible_chromosomes[np.random.randint(len(eligible_chromosomes))]
        randChromSize = _chrom_size_dict[randChrom]
        randChromSeq = _chrom_seq_dict[randChrom]
        randChromMask = target_chr_position_dict[randChrom]
        # must find non overlapping segment on this chromosome before moving on
        selectedSequence = False
        while not selectedSequence:
            counter += 1
            # draw the start so that the whole window fits on the chromosome;
            # windows are never truncated at the chromosome end
            randStart = int(np.random.randint(randChromSize - window_length + 1))
            randEnd = randStart + mean_target_length
            if randChromMask[randStart:(randEnd + 1)].any():
                continue
            randSeq = randChromSeq[randStart:(randEnd + 1)]
            if randSeq.count('N') > numNallowed:
                continue
            # normalise by the number of bases actually inspected
            rand_gc = (randSeq.count('G') + randSeq.count('C'))/len(randSeq)
            if abs(target_gc_content - rand_gc) <= tolerance:
                selectedSequence = True
                numSelected += 1
                candidate_positions.append([randChrom, randStart, randEnd, randSeq])

    # calculate GC content of background samples
    background_gc_count = sum(cp[3].count('G') + cp[3].count('C') for cp in candidate_positions)
    background_length = sum(len(cp[3]) for cp in candidate_positions)
    background_gc_content = background_gc_count/(background_length+0.0000001)
    print('target GC:', target_gc_content, 
          'background GC:', background_gc_content, 
          'target length:', mean_target_length,
          'numTargetPositions',len(target_positions),
          'backgroundPositions', len(candidate_positions))
    return candidate_positions

In [12]:
# generate random genomic background for all monomers
# For every treatment/monomer pair: take the peaks with a positive value in the
# '<strain>_<monomer>_<treatment>' column of summary_frame, draw 5x as many
# background windows and pickle them.
strain = 'c57bl6'
ap1_members = ['atf3','cjun', 'fos', 'junb','jund']    
# create the output directory up front so the dump below cannot fail on a fresh checkout
os.makedirs('./background_pickles/', exist_ok=True)
for treatment in ['veh', 'kla']:
    for monomer in ap1_members:
        target_indices = summary_frame[summary_frame[strain + '_' + monomer + '_' + treatment] > 0.0].index.values
        target_positions = summary_frame[summary_frame.index.isin(target_indices)][['chr', 'start', 'end']].values
        start = time.time()
        backgroundPositions = getRandomBackground(target_positions, 
                                                  N_threshold =1.0, 
                                                  tolerance=0.05, 
                                                  size_ratio=5)
        end = time.time()
        print(monomer, treatment, end - start)
        # context manager guarantees the pickle is flushed and the handle closed
        with open('./background_pickles/' + monomer + '_' + treatment + '_background.pickle', 'wb') as pickleFile:
            pickle.dump(backgroundPositions, pickleFile)

0 0
target GC: 0.4925952895419081 background GC: 0.4817145044010695 target length: 200 numTargetPositions 23140 backgroundPositions 115700
atf3 veh 35.86300587654114
0 0
100000 31875
target GC: 0.4936398738657412 background GC: 0.4817101185622151 target length: 200 numTargetPositions 15539 backgroundPositions 77695
cjun veh 30.41100239753723
0 0
target GC: 0.5973989569748387 background GC: 0.573518327527866 target length: 200 numTargetPositions 767 backgroundPositions 3835
fos veh 27.855791807174683
0 0
target GC: 0.49717002237080854 background GC: 0.48624439324618823 target length: 200 numTargetPositions 447 backgroundPositions 2235
junb veh 27.6730477809906
0 0
target GC: 0.5091324503311129 background GC: 0.4939045475627441 target length: 200 numTargetPositions 19630 backgroundPositions 98150
jund veh 32.09514307975769
0 0
100000 34722
200000 69353
target GC: 0.4801782310331616 background GC: 0.4734733283097056 target length: 200 numTargetPositions 36722 backgroundPositions 183610
at

### Use Background positions to create peak files and Merge Peak Files

In [13]:
# Convert each pickled background set into a tab separated peak file
# (#PeakID, chr, start, end, strand, idrScore, count).
strain = 'c57bl6'
os.makedirs('./background_peak_files', exist_ok=True)
ap1_members = ['atf3','cjun', 'fos', 'junb','jund']    

for treatment in ['veh', 'kla']:
    for monomer in ap1_members:
        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # load pickles produced by this notebook.
        with open('./background_pickles/' + monomer + '_' + treatment + '_background.pickle', 'rb') as pickleFile:
            backgroundPositions = pickle.load(pickleFile)
        peakPath = './background_peak_files/' + strain + '_' + monomer + '_' + treatment + '-background_peaks.tsv'
        # context manager closes the peak file even if a row fails to format
        with open(peakPath, 'w') as outFile:
            outFile.write('\t'.join(['#PeakID','chr','start','end','strand','idrScore', 'count','\n']))
            for counter, pos in enumerate(backgroundPositions):
                chrom = pos[0]
                start = str(pos[1])
                end = str(pos[2])
                strand = '+' # arbitrary - for compatibility with downstream scripts
                score = '1' # arbitrary - for compatibility with downstream scripts
                # the running counter keeps IDs unique within a file; the random part is decorative
                randID = 'background_' + str(np.random.randint(10000)) + '_' + str(counter)
                outFile.write('\t'.join([randID, chrom, start, end, strand, score, score, '\n']))
        
    

In [14]:
%%capture
# Run the external mergePeaks tool over every background peak file and collect
# its stdout in ./background_merged_peaks.tsv; %%capture hides the cell output.
! mergePeaks -d 0 ./background_peak_files/*tsv > ./background_merged_peaks.tsv


In [None]:
# Nothing is actually merged, so every row of the merged file carries the peak
# center as both its start and its end coordinate. Restore the original width
# by padding half of the mean background peak size on either side.
# NOTE: backgroundPositions is whatever the peak-file cell loaded last.
sizes = [pos[2] - pos[1] for pos in backgroundPositions]
mean_peak_size = np.mean(sizes)
adj_distance = int(mean_peak_size/2)
merged_frame = pd.read_csv('./background_merged_peaks.tsv', sep='\t', low_memory=False)
merged_frame = merged_frame.assign(start=merged_frame['start'] - adj_distance,
                                   end=merged_frame['end'] + adj_distance)
merged_frame.to_csv('./background_merged_fixed_peaks.tsv', sep='\t', index=False)

In [None]:
# Call the external makeSummaryFile.py script with the width-corrected merged
# peaks, the summary path (./background_group_summary.tsv is read by the next
# cell) and all individual background peak files.
! makeSummaryFile.py ./background_merged_fixed_peaks.tsv ./background_group_summary.tsv ./background_peak_files/*

In [5]:
# read in peak data
# Missing entries become the string '0'. From the sixth column onwards every
# cell is turned into a float; a comma separated cell collapses to the mean of
# its entries.
summary_background_frame = pd.read_csv('./background_group_summary.tsv' , sep='\t').fillna('0')
for col in summary_background_frame.columns[5:]:
    summary_background_frame[col] = [
        np.mean([float(part) for part in cell.split(',')]) if ',' in cell else float(cell)
        for cell in summary_background_frame[col].values.astype(str)
    ]
# index the frame by peak ID
summary_background_frame.index = summary_background_frame['ID'].values

  interactivity=interactivity, compiler=compiler, result=result)


### Extract peak sequences and create a script to scan for motifs using FIMO



In [23]:
%%bash
# Start from an empty ./peak_sequences directory.
# mkdir -p is a no-op when the directory exists; rm -f does not fail when the
# directory is already empty (plain rm exits non-zero on the unmatched glob,
# which aborts the %%bash cell).
mkdir -p ./peak_sequences
rm -f ./peak_sequences/*

In [24]:
%%bash
# Run the external extract_seq_from_peakfiles.pl script on the width-corrected
# merged peaks for strain C57BL6J; the fasta is written to
# ./peak_sequences/C57BL6J_marge.fa and re-labelled by the cells below.
perl /home/vlink/mouse_strains/marge/analysis/extract_seq_from_peakfiles.pl -strains C57BL6J -file ./background_merged_fixed_peaks.tsv -output ./peak_sequences/C57BL6J_marge.fa





Saving peaks
Loading shift vectors


In [25]:
# reformat fasta files to use homer peak IDs
# Build the lookup from 'chr_start_end' to the peak ID in the first column of
# the width-corrected merged peak file.

coordinate_peakID_dict = {} # {chr_start_end:homerID}
with open('./background_merged_fixed_peaks.tsv') as f:
    next(f, None) # skip the header line
    for line in f:
        # drop the newline first so a 4-column line does not leave '\n' in the key
        tokens = line.rstrip('\n').split('\t')
        if len(tokens) < 4:
            continue # blank or malformed line
        coordinate = '_'.join(tokens[1:4])
        peakID = tokens[0].strip()
        coordinate_peakID_dict[coordinate] = peakID



In [26]:
# Rewrite every '*_marge*' fasta in ./peak_sequences so that its headers carry
# the peak ID from coordinate_peakID_dict instead of '<coordinate>_<strain>'.
# The result is written next to the input, with '_marge' removed from the name.
for fastaFile in os.listdir('./peak_sequences/'):
    if 'marge' in fastaFile:
        strain = fastaFile.split('_')[0]
        print(fastaFile)
        inPath = './peak_sequences/' + fastaFile
        outPath = './peak_sequences/' + fastaFile.replace('_marge','')
        # both handles are closed even if a header has no matching peak ID (KeyError)
        with open(inPath) as f, open(outPath, 'w') as outFile:
            for line in f:
                if line.startswith('>'):
                    # header: strip the leading '>' and the '_<strain>' suffix
                    coordinate = line[1:].replace('_'+strain,'').strip()
                    peakID = coordinate_peakID_dict[coordinate]
                    outFile.write('>' + peakID + '\n')
                else:
                    # sequence lines are copied unchanged
                    outFile.write(line)

C57BL6J_marge.fa


In [27]:
# Keep a copy of the re-labelled background fasta in the working directory.
!cp ./peak_sequences/C57BL6J.fa ./C57BL6J_background.fa

In [28]:
# create a script to scan for motifs using FIMO
! if [ ! -d /home/jtao/analysis/random_background_analysis/fimo_results/ ]; then mkdir /home/jtao/analysis/random_background_analysis/fimo_results/; fi
! if [ ! -d /home/jtao/analysis/random_background_analysis/fimo_out/ ]; then mkdir /home/jtao/analysis/random_background_analysis/fimo_out/; fi
! rm -rf ./fimo_out/*
! rm -rf ./fimo_result/*


pthresh = 0.01
motif_dir = '/home/jtao/analysis/cobinding_motif_analysis/fimo_motifs/'

fimo_results_dir = './fimo_results'

for fastaFile in os.listdir('./peak_sequences/'):
    if not 'marge' in fastaFile:
        print(fastaFile)
        strain = fastaFile.split('.')[0]
        count = 0
        scriptFile = open('scanMotifs_background_'+ strain +'.sh','w')
        for m in os.listdir(motif_dir):
            if 'fimo' in m:
                fimo_out_dir = './fimo_out/' + strain + '_' +m.replace('.fimo','')
                outPath = fimo_results_dir + '/' +strain + '_' + m.replace('.fimo','') +'.txt'
                scriptFile.write(
                    '(sleep ' + str(15 * count) + 
                    's; fimo --text --max-stored-scores 2000000 --output-pthresh ' + 
                    str(pthresh) +' --oc ' + fimo_out_dir + ' ' +
                    motif_dir + '/' + m + ' ./peak_sequences/' + fastaFile +
                    '> ' + outPath + ' ) & \n')
                count+=1
        scriptFile.close()




C57BL6J.fa


In [None]:
%%bash
# Make the generated scanMotifs*.sh scripts executable and run them in turn.
# The generated script lines end in '&', so each script presumably returns
# right away while its fimo jobs keep running in the background.
# The 'sleeping...' message is printed although the sleep itself is commented out.
chmod a+x ./scanMotifs*.sh
for i in ./scanMotifs*sh; 
    do echo 'sleeping...';
    echo $i;
    $i;
#     sleep 5m;
done



### Read in Motif Scores

In [6]:
def read_fimo_file(thread_lock, motif_results_path, 
                   all_peak_ids,
                   motif_score_dict,
                   motif_sequence_dict,
                   motif_strand_dict,
                   motif_start_dict,
                   motif_end_dict,
                   motif_count_dict,
                  ):
    '''
    Read one tab separated FIMO result file and store, for its motif, one value
    per peak in each of the shared dictionaries.

    thread_lock: threading.Lock guarding the shared dictionaries
    motif_results_path: path of the FIMO result file; the first line is skipped
                        and the columns are read as motif_name, peak_id, start,
                        stop, strand, score, pvalue, sequence
    all_peak_ids: iterable of peak IDs fixing the order of the stored lists
    motif_*_dict: dictionaries {motif_name: list}, updated in place. For each peak
                  the best scoring motif instance is kept (negative scores are
                  stored as 0.0); peaks without an instance get score 0,
                  sequence '', strand '?', start -1 and end -1.
                  motif_count_dict receives the number of instances per peak.
    returns: None. A file without any motif instance is reported and skipped.
    '''
    # read in fimo result as data frame
    fimo_result_frame=pd.read_csv(motif_results_path, 
                                  skiprows=1,
                                  names=['motif_name', 
                                         'peak_id', 
                                         'start', 
                                         'stop', 
                                         'strand', 
                                         'score', 
                                         'pvalue', 
                                         'sequence'],
                                  sep='\t')
    if fimo_result_frame.empty:
        # without a single row there is no motif name to file the results under
        print('no motif instances found in', motif_results_path)
        return None
    motif_name = fimo_result_frame['motif_name'].values[0]
    print('reading', motif_name)
    
    # keep only the highest scoring motif instance of every peak
    top_fimo_result_frame = (fimo_result_frame
                             .sort_values(by='score', ascending=False)
                             .drop_duplicates(subset='peak_id'))
    
    # {PeakID:(motifScore, motifSequence, motifStrand, motifStart, motifEnd)}
    id_values_dict = {}
    for peak_id, score, sequence, strand, motif_start, motif_end in zip(
            top_fimo_result_frame['peak_id'].values,
            top_fimo_result_frame['score'].values,
            top_fimo_result_frame['sequence'].values,
            top_fimo_result_frame['strand'].values,
            top_fimo_result_frame['start'].values,
            top_fimo_result_frame['stop'].values):
        score = float(score)
        if score < 0.0:
            score = 0.0
        id_values_dict[peak_id] = (score, sequence, strand, int(motif_start), int(motif_end))
    
    # sort values according to all peak IDs
    missing_values = (0,'','?',-1,-1)
    sorted_values = [id_values_dict.get(x, missing_values) for x in all_peak_ids]
    sorted_scores = [x[0] for x in sorted_values]
    sorted_sequences = [x[1] for x in sorted_values]
    sorted_strands = [x[2] for x in sorted_values]
    sorted_starts = [x[3] for x in sorted_values]
    sorted_ends = [x[4] for x in sorted_values]
    
    # count the number of motif instances
    id_count_dict = Counter(fimo_result_frame['peak_id'].values)
    sorted_counts = [id_count_dict.get(x, 0) for x in all_peak_ids]
    
    # acquire() blocks until the lock is free, so no polling loop is needed
    thread_lock.acquire()
    try:
        motif_score_dict[motif_name] = sorted_scores
        motif_sequence_dict[motif_name] = sorted_sequences
        motif_strand_dict[motif_name] = sorted_strands
        motif_start_dict[motif_name] = sorted_starts
        motif_end_dict[motif_name] = sorted_ends
        motif_count_dict[motif_name] = sorted_counts
        print('finished reading', motif_name )
    finally:
        # always release, also when one of the updates raised
        try:
            thread_lock.release()
        except RuntimeError:
            # the lock was already force-released by another thread; nothing to undo
            pass
    
    return None

In [None]:
# Read all FIMO result files of each strain in parallel (one thread per file)
# and store the per-peak motif scores, sequences, strands, starts, ends and
# counts as pickled data frames.
start = time.time()
fimo_result_path = './fimo_results/'
for fastaFile in os.listdir('./peak_sequences/'):
    if not 'marge' in fastaFile:
        print(fastaFile)
        strain = fastaFile.split('.')[0]

        motif_score_dict = {}
        motif_sequence_dict ={}
        motif_strand_dict = {}
        motif_start_dict = {}
        motif_end_dict = {}
        motif_count_dict = {}
        thread_list = []
        all_peak_ids = summary_background_frame.index.values
        lock = threading.Lock()
        for m in sorted(os.listdir(fimo_result_path)):
            # result files are named '<strain>_<motif>.txt'; only read this strain's
            if not m.startswith(strain + '_'):
                continue
            # The workers acquire and release the lock themselves. It must not be
            # released from here: that could free it while a worker holds it.
            current_thread = threading.Thread(target = read_fimo_file,
                             args =(lock,
                                    os.path.join(fimo_result_path, m), 
                                    all_peak_ids,
                                    motif_score_dict,
                                    motif_sequence_dict,
                                    motif_strand_dict,
                                    motif_start_dict,
                                    motif_end_dict,
                                    motif_count_dict,
                                    ))
            thread_list.append(current_thread)
            current_thread.start()
        for current_thread in thread_list:
            current_thread.join()

        # convert dictionaries to data frames
        frame_index = summary_background_frame['ID'].values
        motif_score_background_frame = pd.DataFrame(motif_score_dict , index = frame_index)
        motif_sequence_background_frame = pd.DataFrame(motif_sequence_dict , index = frame_index)
        motif_strand_background_frame = pd.DataFrame(motif_strand_dict , index = frame_index)
        motif_start_background_frame = pd.DataFrame(motif_start_dict , index = frame_index)
        motif_end_background_frame = pd.DataFrame(motif_end_dict , index = frame_index)
        motif_count_background_frame = pd.DataFrame(motif_count_dict , index = frame_index)

        # annotate every frame with the peak metadata and write it to disk
        for frame_kind, frame in [('score', motif_score_background_frame), 
                                  ('sequence', motif_sequence_background_frame), 
                                  ('strand', motif_strand_background_frame), 
                                  ('start', motif_start_background_frame), 
                                  ('end', motif_end_background_frame), 
                                  ('count', motif_count_background_frame)]:
            frame['ID'] = summary_background_frame['ID'].values
            frame['Factors'] = summary_background_frame['Factors'].values
            frame['chr'] = summary_background_frame['chr'].values
            frame.to_pickle('motif_' + frame_kind + '_background_frame_'+  strain + '.pickle3')

end = time.time()
print('total time', end - start)


C57BL6J.fa
reading gmeb2
reading cux
reading arid5a
reading hnf1
reading homeobox-4
reading e2f2
reading crem
reading homeobox-2
reading hoxa11
reading gsc
reading dbp_hlf_tef
reading creb3-l1
reading duxa
reading alx1_alx4_arx
reading arid3a
reading arid3b
reading mef2a-b-d
reading hoxc13
reading lhx3
reading pax3_pax7
reading homeobox-3
