# Random Background Analysis

In [4]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"

### imports ###
import sys
import os
import random
import time
import pandas as pd
import numpy as np
import argparse
import matplotlib
import itertools
import scipy
import matplotlib.pyplot as plt 
import seaborn as sns
import scipy
import pickle
from sklearn import preprocessing
import sklearn
from sklearn import decomposition
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
from sklearn import ensemble
from sklearn import neighbors
import matplotlib_venn
from sklearn.cross_validation import train_test_split
from random import shuffle

### notebook specific configuration ###
# draw matplotlib figures inline in the notebook output
%matplotlib inline
# resolution (dots per inch) used when figures are saved
matplotlib.pylab.rcParams['savefig.dpi'] = 200
# raise the interpreter's recursion limit to 5000 frames
sys.setrecursionlimit(5000)
# every relative path used in this notebook is resolved against this directory
os.chdir('/gpfs/data01/glasslab/home/jtao/analysis/random_background_analysis/')
sns.set_context('notebook')
# reload imported modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Functions

In [12]:
# split data into GC content matched training and test data
def get_GC_matched_split(features, labels, test_size, tolerance = 0.01):
    '''
    Match the GC content of the False labelled samples to the True labelled samples,
    then split the remaining samples into stratified training and test sets.

    False samples are removed at random (True samples are never removed) until the
    pooled GC content of the two classes differs by at most tolerance. The sequence
    of each sample is looked up by its index value in the global _id_sequence_dict.

    features: pandas object (samples x features), indexed by sample ID
    labels: pandas boolean Series (samples x), indexed by sample ID
    test_size: fraction of data to test on; one fold of a shuffled StratifiedKFold is
               used, so the realised fraction is 1/int(1/test_size) when
               test_size <= 0.5 and 1 - 1/int(1/(1 - test_size)) otherwise
    tolerance: max difference in GC content between True and False labelled samples

    returns: training_features, test_features, training_labels, test_labels
    raises: ValueError when the False samples run out before the tolerance is met

    NOTE: removal uses rejection sampling; if every remaining False sequence has the
    same GC fraction no sequence can be selected and the loop does not terminate.
    '''
    global _id_sequence_dict

    ### collect IDs and GC statistics per class in a single pass over the labels
    true_ids = []
    false_ids = []
    false_stats = [] # (GC count, length) for the sequence of each entry of false_ids
    true_gc_count = 0
    true_length = 0
    false_gc_count = 0
    false_length = 0
    for sample_id, is_true in zip(labels.index.values, labels.values):
        sequence = _id_sequence_dict[sample_id]
        gc_count = sequence.count('G') + sequence.count('C')
        if is_true:
            true_ids.append(sample_id)
            true_gc_count += gc_count
            true_length += len(sequence)
        else:
            false_ids.append(sample_id)
            false_stats.append((gc_count, len(sequence)))
            false_gc_count += gc_count
            false_length += len(sequence)
    # the small constant avoids a division by zero when a class is empty
    true_gc_content = true_gc_count/(true_length+0.0000001)
    false_gc_content = false_gc_count/(false_length+0.0000001)

    ### match GC content of samples labelled True with those labelled False by throwing out False samples
    while abs(true_gc_content - false_gc_content) > tolerance:
        if false_length == 0:
            raise ValueError('ran out of False samples before the GC content matched within tolerance')
        raise_false_gc = true_gc_content > false_gc_content

        # draw random False sequences until one is found whose removal moves the
        # GC content of the False samples towards that of the True samples
        while True:
            rand_index = np.random.randint(len(false_ids))
            current_gc_count, current_length = false_stats[rand_index]
            if current_length == 0:
                # an empty sequence has no GC fraction and removing it changes nothing
                continue
            current_gc = current_gc_count/current_length
            if raise_false_gc:
                # removing a sequence below the pooled value increases the pooled value
                if current_gc < false_gc_content:
                    break
            elif current_gc > false_gc_content:
                # removing a sequence above the pooled value decreases the pooled value
                break

        false_gc_count -= current_gc_count
        false_length -= current_length
        false_stats.pop(rand_index)
        false_ids.pop(rand_index)
        if false_length > 0:
            false_gc_content = false_gc_count/false_length
        else:
            false_gc_content = 0.0

    filtered_ids = true_ids + false_ids
    filtered_features = features[features.index.isin(filtered_ids)]
    filtered_labels = labels[labels.index.isin(filtered_ids)]

    if test_size <= 0.5:
        training_indices, test_indices = next(iter(
                sklearn.cross_validation.StratifiedKFold(filtered_labels, int(1/test_size), shuffle=True)))
    else:
        # swap the roles of the two parts of the fold so the larger part is the test set
        test_indices, training_indices = next(
            iter(sklearn.cross_validation.StratifiedKFold(filtered_labels, int(1/(1-test_size)), shuffle=True)))

    # the fold indices are positions within filtered_labels, so they have to be mapped
    # through the index of filtered_labels (filtered_ids is ordered True first and
    # would assign the positions to the wrong samples)
    ordered_ids = filtered_labels.index.values
    training_ids = ordered_ids[training_indices]
    test_ids = ordered_ids[test_indices]

    training_features = filtered_features[filtered_features.index.isin(training_ids)]
    test_features = filtered_features[filtered_features.index.isin(test_ids)]
    training_labels = filtered_labels[filtered_labels.index.isin(training_ids)]
    test_labels = filtered_labels[filtered_labels.index.isin(test_ids)]

    return training_features, test_features, training_labels, test_labels
    

## Copy Score Files

In [6]:
%%bash
# copy the motif frames, the summary and annotation frames and the peak sequences
# from the cobinding motif analysis into the current working directory
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_score_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_sequence_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_strand_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_start_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/motif_end_frame_C57BL6J.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/summary_frame.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/annotation_frame.pickle ./
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/peak_sequences/C57BL6J.fa ./

## Read in Score Files

In [8]:
### read in the pickled frames copied into the working directory
motif_score_frame=pd.read_pickle('motif_score_frame_C57BL6J.pickle')
motif_sequence_frame = pd.read_pickle('motif_sequence_frame_C57BL6J.pickle')
motif_strand_frame = pd.read_pickle('motif_strand_frame_C57BL6J.pickle')
motif_start_frame = pd.read_pickle('motif_start_frame_C57BL6J.pickle')
motif_end_frame = pd.read_pickle('motif_end_frame_C57BL6J.pickle')
summary_frame = pd.read_pickle('summary_frame.pickle')
annotation_frame = pd.read_pickle('annotation_frame.pickle')

### scale the motif scores
# only the columns from position 3 onwards are scaled; .iloc selects them by position,
# which matches the positional slice used for the column names
# NOTE(review): the first three columns are presumably peak metadata - confirm
_scaled_frames = []
for scaler in (preprocessing.MinMaxScaler(), preprocessing.StandardScaler()):
    _scaled_frames.append(pd.DataFrame(scaler.fit_transform(motif_score_frame.iloc[:, 3:]),
                                       index = motif_score_frame.index.values,
                                       columns = motif_score_frame.columns.values[3:]))
# scores rescaled per column by MinMaxScaler and by StandardScaler respectively
normed_motif_frame, standardized_motif_frame = _scaled_frames

# factor names are the second '_' separated field of the summary frame columns
_factors = sorted(list(set([x.split('_')[1] for x in summary_frame.columns if '_' in x])))
_factors.remove('atac')

### read in sequences as dictionary {peakID: sequence}
with open('./C57BL6J.fa') as f:
    data = f.readlines()

_id_sequence_dict = {}
sequenceName = None
for line in data:
    line = line.strip()
    if not line:
        # blank lines carry no sequence and must not overwrite a record
        continue
    if line[0] == '>':
        sequenceName = line[1:]
        # when an ID occurs more than once the last record wins
        _id_sequence_dict.pop(sequenceName, None)
    else:
        if sequenceName is None:
            raise ValueError('./C57BL6J.fa contains sequence data before the first header line')
        # a record may span several lines, so append instead of overwriting
        _id_sequence_dict[sequenceName] = _id_sequence_dict.get(sequenceName, '') + line.upper()

## Run Classifier using Open Chromatin Background

In [10]:
# number of random training/test splits evaluated per monomer and treatment
numIterations = 5
# monomers for which a classifier is trained
ap1_members = ['atf3','cjun', 'fos', 'junb','jund']    
# fraction of the GC matched samples held out for testing
test_size = 0.5
# factors whose summary frame columns are summed to select the rows used for a treatment
factors = ['atf3','cjun', 'fos', 'junb','jund', 'atac', 'cebpa', 'pu1', 'p65']
# c57bl6_indices = summary_frame[summary_frame['Factors'].str.contains('c57bl6')].index.values  



In [13]:

# for monomers using all motifs
# results are keyed by '<monomer>_<treatment>'
strain = 'c57bl6'
factor_auc_dict = {}           # ROC AUC of every iteration
factor_precision_dict = {}     # average precision of every iteration
factor_coeff_dict = {}         # coefficient vector of every iteration
factor_prob_dict = {}          # predicted probabilities for all rows, summed over iterations
factor_meanCoeff_dict = {}     # coefficients averaged over iterations
factor_intercept_dict = {}     # intercept of every iteration
factor_meanIntercept_dict = {} # intercept averaged over iterations
for treatment in ['veh', 'kla']:
    # rows whose values summed over the factor columns of this treatment are positive;
    # the selection does not depend on the monomer, so it is computed once per treatment
    c57bl6_indices = summary_frame[summary_frame[[strain + '_' + x + '_' + treatment for x in factors]].sum(axis=1) > 0].index.values
    features = standardized_motif_frame[standardized_motif_frame.index.isin(c57bl6_indices)]
    for monomer in ap1_members:
        result_key = monomer + '_' + treatment
        labels = summary_frame[summary_frame.index.isin(c57bl6_indices)][strain + '_' + monomer + '_' + treatment] > 0.0
        # skip monomers with fewer than 100 positive rows
        if np.sum(labels) >= 100:
            all_aucs = []
            all_coefficients = []
            all_probs = None
            all_precisions = []
            all_intercepts = []
            for i in range(numIterations):

                # split data into training and test sets
                training_features, test_features, training_labels, test_labels = get_GC_matched_split(
                    features, labels, test_size = test_size, tolerance = 0.01)

                #  Run classifier
                lr_classifier = sklearn.linear_model.LogisticRegression(penalty='l1', n_jobs=-1)

                lr_classifier.fit(training_features, training_labels)
                # retrieve probabilities
                probas_lr = lr_classifier.predict_proba(test_features)

                # score predictions
                current_roc_auc = sklearn.metrics.roc_auc_score(test_labels, probas_lr[:, 1], average = None)
                current_precision = sklearn.metrics.average_precision_score(test_labels, probas_lr[:, 1], average = None)

                all_aucs.append(current_roc_auc)
                all_precisions.append(current_precision)

                # score all sequences
                probs = lr_classifier.predict_proba(features)[:, 1]

                all_coefficients.append(lr_classifier.coef_.flatten())
                all_intercepts.append(lr_classifier.intercept_[0])

                # identity test: '== None' compares element-wise once all_probs is an array
                if all_probs is None:
                    all_probs = probs
                else:
                    all_probs = all_probs + probs
            mean_coefficients = np.mean(all_coefficients, axis=0)

            factor_auc_dict[result_key]= all_aucs
            factor_precision_dict[result_key] = all_precisions
            factor_coeff_dict[result_key] = all_coefficients
            factor_prob_dict[result_key] = all_probs
            factor_meanCoeff_dict[result_key] = mean_coefficients
            factor_intercept_dict[result_key] = all_intercepts
            factor_meanIntercept_dict[result_key] = np.mean(all_intercepts)
            # numTestPositives is taken from the test set of the last iteration
            print(result_key,
                  'roc:', np.mean(all_aucs), np.var(all_aucs),
                  'precision:', np.mean(all_precisions), np.var(all_precisions),
                  'numTestPositives:', np.sum(test_labels)
                 )



atf3_veh roc: 0.836974709714 5.61270208208e-06 precision: 0.711601594732 3.53410618648e-06 numTestPositives: 10991
cjun_veh roc: 0.812956501449 3.90929117893e-06 precision: 0.470326834035 6.15164296518e-06 numTestPositives: 6346
fos_veh roc: 0.85820935437 9.32261322104e-06 precision: 0.354988988947 4.44600984708e-05 numTestPositives: 971
junb_veh roc: 0.685413905353 8.2720549774e-05 precision: 0.0214656931746 2.62037146045e-07 numTestPositives: 244
jund_veh roc: 0.806430962408 3.85320637865e-06 precision: 0.56506012027 2.15782634439e-05 numTestPositives: 9146
atf3_kla roc: 0.831525390212 4.86761024132e-07 precision: 0.802433251593 3.69917466276e-06 numTestPositives: 17245
cjun_kla roc: 0.807262952194 1.08763770612e-06 precision: 0.480549590428 4.3931060878e-06 numTestPositives: 8022
fos_kla roc: 0.832284162804 1.86405547115e-06 precision: 0.661452757926 1.67635553822e-05 numTestPositives: 10670
junb_kla roc: 0.829456310449 2.86007019466e-06 precision: 0.480705564122 1.51660288731e-05 n

## Create background peaks from genomic sequences from each chromosome

In [210]:
def getRandomBackgroundSwitch(target_positions, 
                        size_ratio = 1.0, 
                        tolerance = 0.05, 
                        N_threshold = 0.5 ):
    '''
    Select random genomic windows that do not overlap the target positions, then swap
    windows in and out of the selection until its pooled GC content matches the targets.

    target_positions: 2D numpy array, list of genomic coordinates for target sequences
                      [[chr,start,end],...], 1-indexed as in the fasta files
    size_ratio: float, number of background sequences relative to target sequences
    tolerance: float, max difference in GC content between target and background sequences
    N_threshold: float, max fraction of the mean target length that may be N in a window
    *** Uses mm10 genome taken from Homer, read from ./mm10_genome/<chromosome>.fa ***

    returns: list of (chr, start, end, sequence) tuples where start and end are 0-indexed,
             inclusive offsets into the chromosome sequence, or None when too few
             candidate windows pass the N filter
    raises: ValueError when target_positions is empty, when size_ratio selects no
            windows, when no chromosome is longer than the mean target length or when
            no spare windows are left for swapping

    NOTE: the swap loop uses rejection sampling and has no iteration limit; it does not
    terminate when the tolerance cannot be reached with the candidate pool.
    '''
    if len(target_positions) == 0:
        raise ValueError('target_positions is empty')
    numToReturn = int(len(target_positions) * size_ratio)
    if numToReturn < 1:
        raise ValueError('size_ratio selects no background sequences')

    def gc_count(sequence):
        # number of G and C bases in a sequence
        return sequence.count('G') + sequence.count('C')

    ### load mm10 genome into memory
    _chromosomes = ['chr1' , 'chr2' , 'chr3' , 'chr4' , 'chr5' , 
                    'chr6' , 'chr7' , 'chr8' , 'chr9' , 'chr10', 
                    'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 
                    'chr16', 'chr17', 'chr18', 'chr19', 'chrX']
    _chrom_size_dict = {}
    _chrom_seq_dict = {}
    for chrom in _chromosomes:
        with open('./mm10_genome/' + chrom + '.fa') as f:
            f.readline() # skip the fasta header line
            seq = ''.join(x.upper().strip() for x in f)
        _chrom_size_dict[chrom] = len(seq)
        _chrom_seq_dict[chrom] = seq

    ### index target positions
    # {chr: mask}, value is a chromosome length boolean array (one byte per position
    # instead of a fixed 200 million float64 entries per chromosome)
    target_chr_position_dict = {x:np.zeros(_chrom_size_dict[x], dtype=bool) for x in _chromosomes}
    ### retrieve target sequences
    target_sequences = []
    for pos in target_positions:
        chrom = pos[0]
        # use 0 indexing of position, versus 1 indexing used in fasta; the same slice is
        # used for the mask and for the sequence so that both cover the same bases
        first = max(int(pos[1]) - 1, 0)
        last = int(pos[2])
        target_chr_position_dict[chrom][first:last] = True
        target_sequences.append(_chrom_seq_dict[chrom][first:last])

    ### calculate GC content and average length of the target sequences
    target_gc_count = sum(gc_count(s) for s in target_sequences)
    target_length_count = sum(len(s) for s in target_sequences)
    target_gc_content = target_gc_count/(target_length_count+0.0000001) # GC content of target sequences
    mean_target_length = int(target_length_count/len(target_sequences)) # average length of target sequences

    ### select random genomic loci such that they do not overlap target sequences
    numToSelect = numToReturn * 2 # candidate pool of background seqs is 2X larger
    # only chromosomes long enough to hold a complete window can be sampled
    eligible_chromosomes = [x for x in _chromosomes if _chrom_size_dict[x] > mean_target_length]
    if not eligible_chromosomes:
        raise ValueError('no chromosome is longer than the mean target length')
    candidate_positions = []
    while len(candidate_positions) < numToSelect:
        # select random chromosome
        randChrom = eligible_chromosomes[np.random.randint(len(eligible_chromosomes))]
        position_mask = target_chr_position_dict[randChrom]
        # largest start (exclusive) for which the whole window lies on the chromosome
        maxStart = _chrom_size_dict[randChrom] - mean_target_length
        # must find non overlapping segment on this chromosome before moving on
        while True:
            randStart = np.random.randint(maxStart)
            randEnd = randStart + mean_target_length
            if not position_mask[randStart:(randEnd + 1)].any():
                candidate_positions.append([randChrom, randStart, randEnd])
                break

    ### retrieve sequences of random genomic loci
    filtered_candidate_positions = []
    numNallowed = int(N_threshold * mean_target_length)
    for chrom, start, end in candidate_positions:
        candidate_seq = _chrom_seq_dict[chrom][start:(end+1)]
        # throw away background peaks containing more than the allowed number of N
        if candidate_seq.count('N') <= numNallowed:
            filtered_candidate_positions.append((chrom, start, end, candidate_seq))

    if len(filtered_candidate_positions) < numToReturn:
        print('The genome is vast and empty and filled with Ns')
        return None

    ### select random set of candidate background loci
    random.shuffle(filtered_candidate_positions)

    toReturn_positions = filtered_candidate_positions[:numToReturn]
    remaining_positions = filtered_candidate_positions[numToReturn:]
    # calculate GC content of background samples
    background_gc_count = sum(gc_count(trp[3]) for trp in toReturn_positions)
    background_length = sum(len(trp[3]) for trp in toReturn_positions)
    background_gc_content = background_gc_count/(background_length+0.0000001)

    numRemaining = len(remaining_positions)
    counter = 0
    while abs(target_gc_content - background_gc_content) > tolerance:
        if numRemaining == 0:
            raise ValueError('no spare background candidates left to switch in')
        raise_background_gc = target_gc_content > background_gc_content
        # switch background GC sequences until GC content matches tolerance
        while True:
            # sequence to be switched out
            rand_index = np.random.randint(numToReturn)
            current_seq = toReturn_positions[rand_index][3]
            current_gc_count = gc_count(current_seq)
            current_length = len(current_seq)
            current_gc = current_gc_count/current_length

            # sequence to be switched in
            switch_index = np.random.randint(numRemaining)
            switch_seq = remaining_positions[switch_index][3]
            switch_gc_count = gc_count(switch_seq)
            switch_length = len(switch_seq)
            switch_gc = switch_gc_count/switch_length

            if raise_background_gc:
                # switch sequences that would increase overall GC content of background sequences
                if switch_gc > current_gc:
                    break
            elif switch_gc < current_gc:
                # switch sequences that would decrease overall GC content of background sequences
                break
        counter +=1
        if counter % 1000 == 0:
            print(background_gc_content, target_gc_content, tolerance)
        # switch sequences
        toReturn_positions[rand_index], remaining_positions[switch_index] = (
            remaining_positions[switch_index], toReturn_positions[rand_index])

        # update background GC content
        background_gc_count += switch_gc_count - current_gc_count
        background_length += switch_length - current_length
        background_gc_content = background_gc_count/background_length

    print(target_gc_content, background_gc_content)
    # hand the matched background back to the caller instead of discarding it
    return toReturn_positions
    

In [225]:
def getRandomBackground(target_positions, 
                        size_ratio = 1.0, 
                        tolerance = 0.01, 
                        N_threshold = 0.5 ):
    '''
    Select random genomic windows that do not overlap the target positions and whose
    individual GC content lies within tolerance of the pooled target GC content.

    target_positions: 2D numpy array, list of genomic coordinates for target sequences
                      [[chr,start,end],...], 1-indexed as in the fasta files
    size_ratio: float, scales the number of background sequences relative to target
                sequences; 2 * size_ratio * len(target_positions) windows are returned
    tolerance: float, max difference between the GC content of a window and the pooled
               GC content of the target sequences
    N_threshold: float, max fraction of the mean target length that may be N in a window
    *** Uses mm10 genome taken from Homer, read from ./mm10_genome/<chromosome>.fa ***

    returns: list of [chr, start, end, sequence] where start and end are 0-indexed,
             inclusive offsets into the chromosome sequence
    raises: ValueError when target_positions is empty or when no chromosome is longer
            than the mean target length

    NOTE: windows are drawn by rejection sampling on one chromosome at a time without an
    attempt limit; the search does not terminate when the chosen chromosome has no
    window that passes the overlap, N and GC filters.
    '''
    if len(target_positions) == 0:
        raise ValueError('target_positions is empty')

    ### load mm10 genome into memory
    _chromosomes = ['chr1' , 'chr2' , 'chr3' , 'chr4' , 'chr5' , 
                    'chr6' , 'chr7' , 'chr8' , 'chr9' , 'chr10', 
                    'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 
                    'chr16', 'chr17', 'chr18', 'chr19', 'chrX']
    _chrom_size_dict = {}
    _chrom_seq_dict = {}
    for chrom in _chromosomes:
        with open('./mm10_genome/' + chrom + '.fa') as f:
            f.readline() # skip the fasta header line
            seq = ''.join(x.upper().strip() for x in f)
        _chrom_size_dict[chrom] = len(seq)
        _chrom_seq_dict[chrom] = seq

    ### index target positions
    # {chr: mask}, value is a chromosome length boolean array (one byte per position
    # instead of a fixed 200 million float64 entries per chromosome)
    target_chr_position_dict = {x:np.zeros(_chrom_size_dict[x], dtype=bool) for x in _chromosomes}
    ### retrieve target sequences
    target_sequences = []
    for pos in target_positions:
        chrom = pos[0]
        # use 0 indexing of position, versus 1 indexing used in fasta; the same slice is
        # used for the mask and for the sequence so that both cover the same bases
        first = max(int(pos[1]) - 1, 0)
        last = int(pos[2])
        target_chr_position_dict[chrom][first:last] = True
        target_sequences.append(_chrom_seq_dict[chrom][first:last])

    ### calculate GC content and average length of the target sequences
    target_gc_count = sum(s.count('G') + s.count('C') for s in target_sequences)
    target_length_count = sum(len(s) for s in target_sequences)
    target_gc_content = target_gc_count/(target_length_count+0.0000001) # GC content of target sequences
    mean_target_length = int(target_length_count/len(target_sequences)) # average length of target sequences

    ### select random genomic loci such that they do not overlap target sequences
    numSelected = 0
    numToSelect = len(target_positions) * size_ratio * 2
    candidate_positions = []
    numNallowed = int(N_threshold * mean_target_length) # number of allowable Ns
    # only chromosomes long enough to hold a complete window can be sampled
    eligible_chromosomes = [x for x in _chromosomes if _chrom_size_dict[x] > mean_target_length]
    if not eligible_chromosomes:
        raise ValueError('no chromosome is longer than the mean target length')
    counter = 0
    while numSelected < numToSelect:
        # select random chromosome
        randChrom = eligible_chromosomes[np.random.randint(len(eligible_chromosomes))]
        chrom_seq = _chrom_seq_dict[randChrom]
        position_mask = target_chr_position_dict[randChrom]
        # largest start (exclusive) for which the whole window lies on the chromosome
        maxStart = _chrom_size_dict[randChrom] - mean_target_length
        # must find non overlapping segment on this chromosome before moving on
        selectedSequence = False
        while not selectedSequence:
            # report progress every 100000 attempts
            if counter % 100000 == 0:
                print(counter, numSelected)
            counter += 1
            randStart = np.random.randint(maxStart)
            randEnd = randStart + mean_target_length
            if position_mask[randStart:(randEnd + 1)].any():
                continue
            randSeq = chrom_seq[randStart:(randEnd+1)]
            if randSeq.count('N') > numNallowed:
                continue
            # divide by the actual window length, which is one longer than mean_target_length
            rand_gc = (randSeq.count('G') + randSeq.count('C'))/len(randSeq)
            if abs(target_gc_content - rand_gc) <= tolerance:
                selectedSequence = True
                numSelected+=1
                candidate_positions.append([randChrom, randStart, randEnd, randSeq])

    # calculate GC content of background samples
    background_gc_count = sum(cp[3].count('G') + cp[3].count('C') for cp in candidate_positions)
    background_length = sum(len(cp[3]) for cp in candidate_positions)
    background_gc_content = background_gc_count/(background_length+0.0000001)
    print(target_gc_content,background_gc_content)
    return candidate_positions

In [234]:
# generate random genomic background for all monomers
strain = 'c57bl6'
background_pickle_dir = './background_pickles/'
# the pickles are written into this directory, so make sure it exists
if not os.path.isdir(background_pickle_dir):
    os.makedirs(background_pickle_dir)
for treatment in ['veh', 'kla']:
    for monomer in ap1_members:
        # rows with a positive value for this monomer and treatment are the targets
        target_indices = summary_frame[summary_frame[strain + '_' + monomer + '_' + treatment] > 0.0].index.values
        target_positions = summary_frame[summary_frame.index.isin(target_indices)][['chr', 'start', 'end']].values
        # time the background selection
        start = time.time()
        backgroundPositions = getRandomBackground(target_positions, N_threshold =1.0, tolerance=0.05, size_ratio=5)
        end = time.time()
        print(monomer, treatment, end - start)
        # the context manager closes the file even when pickling fails
        with open(background_pickle_dir + monomer + '_' + treatment + '_background.pickle', 'wb') as pickle_file:
            pickle.dump(backgroundPositions, pickle_file)

### Create peak files from background positions


In [236]:
strain = 'c57bl6'
! if [ ! -d ./background_peak_files ]; mkdir ./background_peak_files
for treatment in ['veh', 'kla']:
    for monomer in ap1_members:
        backgroundPositions = pickle.load(open('./background_pickles/' + monomer + '_' + treatment + '_background.pickle', 'rb'))
        
    
    

### create a script to scan for motifs using FIMO



In [None]:
! if [ ! -d ./fimo_results/ ]; then mkdir ./fimo_results/; fi
! if [ ! -d ./fimo_out/ ]; then mkdir ./fimo_out/; fi


pthresh = 0.01
motif_dir = '/home/jenhan/analysis/cobinding_motif_analysis/fimo_motifs/'
fimo_results_dir = './fimo_results'


peakDirectory =  './group_by_chromosome/'
for chrom in os.listdir(peakDirectory):
    scriptFile = open('scanMotifs_' + chrom + '.sh','w')
    for m in os.listdir(motif_dir):
        fimo_out_dir = './fimo_out/' + chrom + '_' + m.replace('.fimo','')

        if 'fimo' in m:
            outPath = fimo_results_dir + '/' +chrom + '_'+ m.replace('.fimo','') +'.txt'
            scriptFile.write(
                'fimo --text --max-stored-scores 2000000 --output-pthresh ' + 
                str(pthresh)  + ' ' +
                motif_dir + '/' + m + ' ./group_by_chromosome/' + chrom + '/' + chrom + '_tile.fa ' +
                '> ' + outPath + ' & \n')
    scriptFile.close()






In [None]:
%%bash
# make the generated scan scripts executable
chmod a+x ./scanMotifs*.sh
# run the scripts one after another, pausing five minutes after each one
for i in ./scanMotifs*sh; 
    do echo 'sleeping...';
    echo $i;
    $i;
    sleep 5m;
done



### Read in Motif Scores

In [None]:
# TODO: create the background summary frame