# Unclustered Foreground Generation

In [1]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"

### imports ###
import sys
import os
import pandas as pd
import numpy as np
import matplotlib
import itertools
import scipy
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
from sklearn import preprocessing
import sklearn
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
import matplotlib_venn
from sklearn.cross_validation import train_test_split
from random import shuffle
import threading
import time
from collections import Counter
### notebook specific configuration ###
%matplotlib inline
matplotlib.pylab.rcParams['savefig.dpi'] = 200
sys.setrecursionlimit(5000)
working_dir = '/gpfs/data01/glasslab/home/jtao/analysis/ap1_analysis_unclustered_features/'
if not os.path.isdir(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)
sns.set_context('notebook')
%load_ext autoreload
%autoreload 2
%env PATH=/gpfs/data01/glasslab/home/jtao/perl5/bin:/gpfs/data01/glasslab/home/jtao/software/anaconda3/bin:/home/jtao/software/bin:/usr/local/sbin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/bin:/gpfs/data01/glasslab/home/jtao/software/homer/bin:/gpfs/data01/glasslab/home/jtao/software/weblogo:/home/jtao/code/seq_merge_pipe:/home/vlink/mouse_strains/marge/shifting:/bioinformatics/glassutils/scripts:/bioinformatics/software/meme/bin:/home/jtao/software/lsgkm/bin

env: PATH=/gpfs/data01/glasslab/home/jtao/perl5/bin:/gpfs/data01/glasslab/home/jtao/software/anaconda3/bin:/home/jtao/software/bin:/usr/local/sbin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/bin:/gpfs/data01/glasslab/home/jtao/software/homer/bin:/gpfs/data01/glasslab/home/jtao/software/weblogo:/home/jtao/code/seq_merge_pipe:/home/vlink/mouse_strains/marge/shifting:/bioinformatics/glassutils/scripts:/bioinformatics/software/meme/bin:/home/jtao/software/lsgkm/bin


## Copy Motif Files

In [45]:
%%bash
# Convert every HOMER motif file into a FIMO motif file under ./fimo_motifs,
# starting from a directory that contains no files from a previous run.
mkdir -p ./fimo_motifs
# -f: an already-empty directory must not produce an error.
rm -f ./fimo_motifs/*
for f in /gpfs/data01/glasslab/home/jtao/analysis/ap1_motif_merging/homer_motifs/*; do
    # An unmatched glob is left as a literal pattern; skip it instead of converting it.
    [ -e "$f" ] || continue
    motif=${f##*/}          # drop the directory part
    motif=${motif%.homer}   # drop the .homer extension
    /gpfs/data01/glasslab/home/jtao/code/tba/homer2fimo.py "$f" "./fimo_motifs/${motif}.fimo"
done

In [3]:
%%bash 
# Copy the foreground and background peak summary tables into the working directory.
# NOTE(review): the recorded output shows the first cp failing because the source file
# does not exist, and the cell that loads the foreground table reads ./group_summary.tsv,
# which is not copied here -- confirm which file is the intended foreground summary.
cp /gpfs/data01/glasslab/home/jtao/analysis/ap1_analysis_features/summary_frame.tsv ./
cp /gpfs/data01/glasslab/home/jtao/analysis/ap1_analysis_background_features/background_group_summary.tsv ./



cp: cannot stat ‘/gpfs/data01/glasslab/home/jtao/analysis/ap1_analysis_features/summary_frame.tsv’: No such file or directory


In [4]:
# read in peak data
# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, so a column mixing plain numbers and comma-separated lists is parsed uniformly.
summary_frame = pd.read_csv('./group_summary.tsv', sep='\t', low_memory=False)
summary_frame = summary_frame.fillna('0')

# Every column after the first five is converted to float. A cell may hold several
# comma-separated values; such a cell is replaced by the mean of its values
# (a single value is its own mean).
for col in summary_frame.columns[5:]:
    summary_frame[col] = [
        np.mean([float(x) for x in val.split(',')])
        for val in summary_frame[col].values.astype(str)
    ]
summary_frame.index = summary_frame['ID'].values

# remove peaks in unknown/random chromosomes
on_unplaced_chr = (summary_frame['chr'].str.contains('random')
                   | summary_frame['chr'].str.contains('Un'))
summary_frame = summary_frame[~on_unplaced_chr]

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# read in background peak data
# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, so a column mixing plain numbers and comma-separated lists is parsed uniformly.
summary_background_frame = pd.read_csv('./background_group_summary.tsv', sep='\t', low_memory=False)
summary_background_frame = summary_background_frame.fillna('0')

# Every column after the first five is converted to float. A cell may hold several
# comma-separated values; such a cell is replaced by the mean of its values
# (a single value is its own mean).
for col in summary_background_frame.columns[5:]:
    summary_background_frame[col] = [
        np.mean([float(x) for x in val.split(',')])
        for val in summary_background_frame[col].values.astype(str)
    ]
summary_background_frame.index = summary_background_frame['ID'].values

  interactivity=interactivity, compiler=compiler, result=result)


### Retrieve Sequences Under Peaks

In [31]:
%%bash
# Make sure ./peak_sequences exists and contains no files from a previous run.
mkdir -p ./peak_sequences
# -f: when the directory is already empty the glob matches nothing; without -f rm
# would fail and, being the last command, make the whole cell exit non-zero.
rm -f ./peak_sequences/*

In [6]:
%%bash 
# Copy the foreground and background peak FASTA files into ./peak_sequences.
# Both source files are named C57BL6J.fa, so the background copy is renamed to
# C57BL6J-background.fa to keep the two apart.
cp /gpfs/data01/glasslab/home/jtao/analysis/ap1_analysis_features/peak_sequences/C57BL6J.fa ./peak_sequences/
cp /gpfs/data01/glasslab/home/jtao/analysis/ap1_analysis_background_features/peak_sequences/C57BL6J.fa ./peak_sequences/C57BL6J-background.fa

In [19]:
# Sanity check: line counts of the files in ./peak_sequences.
!wc -l peak_sequences/*

   799784 peak_sequences/C57BL6J-background.fa
   234978 peak_sequences/C57BL6J.fa
   103986 peak_sequences/C57BL6J_marge.fa
  1138748 total


### Create a Script to Scan for Motifs Using FIMO



In [7]:
# create a script to scan for motifs using FIMO
! if [ ! -d /home/jtao/analysis/cobinding_motif_analysis/fimo_results/ ]; then mkdir /home/jtao/analysis/cobinding_motif_analysis/fimo_results/; fi
! rm -rf fimo_out/*
! rm -rf fimo_result/*


pthresh = 0.01
motif_dir = './fimo_motifs/'

fimo_results_dir = './fimo_results'

for fastaFile in os.listdir('./peak_sequences/'):
    if not 'marge' in fastaFile:
        print(fastaFile)
        strain = fastaFile.split('.')[0]
        count = 0
        scriptFile = open('scanMotifs_'+ strain +'.sh','w')
        for m in sorted(os.listdir(motif_dir)):
            if '.fimo' in m:
                fimo_out_dir = './fimo_out/' + strain + '_' +m.replace('.fimo','')
                outPath = fimo_results_dir + '/' +strain + '_' + m.replace('.fimo','') +'.txt'
                scriptFile.write(
        #             '(sleep ' + str(15 * count) + 
                    '(sleep ' + str(0 * count) + 
                    's; fimo --text --max-stored-scores 2000000 --output-pthresh ' + 
                    str(pthresh) +' --oc ' + fimo_out_dir + ' ' +
                    motif_dir + '/' + m + ' ./peak_sequences/' + fastaFile +
                    '> ' + outPath + ' ) & \n')
                count+=1
        scriptFile.close()

C57BL6J.fa
C57BL6J-background.fa


In [8]:
%%bash
# Reset the results directory, then run the generated scan script of each sequence set.
mkdir -p ./fimo_results
rm -rf ./fimo_results/*
chmod a+x ./scanMotifs*
for strain in C57BL6J C57BL6J-background; do
    "./scanMotifs_${strain}.sh"
done

Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Read 1 motifs.
Using motif +ARNT::HIF1A of width 8.
Using motif -ARNT::HIF1A of width 8.
Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Read 1 motifs.
Using motif +ALX3 of width 10.
Using motif -ALX3 of width 10.
Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Read 1 motifs.
Using motif +ATF4 of width 13.
Using motif -ATF4 of width 13.
Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Read 1 motifs.
Using motif +Ar of width 17.
Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequen

## Read in Motif Scores

In [9]:
def read_fimo_file(thread_lock, motif_results_path, 
                   all_peak_ids,
                   motif_score_dict,
                   motif_sequence_dict,
                   motif_strand_dict,
                   motif_start_dict,
                   motif_end_dict,
                   motif_count_dict,
                  ):
    """Summarise one FIMO result file per peak and store the result in shared dicts.

    The file is read as tab-separated text whose first line is skipped; the columns
    are taken to be motif name, peak id, start, stop, strand, score, p-value and
    matched sequence. For every peak the highest-scoring motif instance is kept
    (negative scores are clamped to 0.0) together with the total number of
    instances. Values are ordered like ``all_peak_ids``; peaks without any instance
    get ``(0, '', '?', -1, -1)`` and a count of 0.

    Parameters
    ----------
    thread_lock : lock usable as a context manager (e.g. ``threading.Lock()``)
        Held while the shared dictionaries are updated.
    motif_results_path : str
        Path of the FIMO result file.
    all_peak_ids : sequence
        Peak ids defining the order (and length) of every stored list.
    motif_score_dict, motif_sequence_dict, motif_strand_dict,
    motif_start_dict, motif_end_dict, motif_count_dict : dict
        Output dictionaries; each receives one list under the motif's name.

    Returns
    -------
    None
        Results are delivered through the dictionaries. A file without any motif
        instance is reported and leaves the dictionaries untouched.
    """
    # read in fimo result as data frame
    # NOTE(review): assumes the result file has exactly these eight columns -- confirm
    # against the output of the installed FIMO version.
    fimo_result_frame = pd.read_csv(motif_results_path,
                                    skiprows=1,
                                    names=['motif_name',
                                           'peak_id',
                                           'start',
                                           'stop',
                                           'strand',
                                           'score',
                                           'pvalue',
                                           'sequence'],
                                    sep='\t')
    # Without any row there is no motif name to file the results under.
    if fimo_result_frame.empty:
        print('no motif instances in', motif_results_path)
        return None

    motif_name = fimo_result_frame['motif_name'].values[0]
    print('reading', motif_name)

    # keep only the highest scoring instance of the motif for each peak
    top_fimo_result_frame = (fimo_result_frame
                             .sort_values(by='score', ascending=False)
                             .drop_duplicates(subset='peak_id'))

    # {peak id: (score, sequence, strand, start, end)}
    id_values_dict = {}
    for peak_id, score, sequence, strand, motif_start, motif_end in zip(
            top_fimo_result_frame['peak_id'].values,
            top_fimo_result_frame['score'].values,
            top_fimo_result_frame['sequence'].values,
            top_fimo_result_frame['strand'].values,
            top_fimo_result_frame['start'].values,
            top_fimo_result_frame['stop'].values):
        id_values_dict[peak_id] = (max(float(score), 0.0),  # clamp negative scores to zero
                                   sequence,
                                   strand,
                                   int(motif_start),
                                   int(motif_end))

    # sort values according to all peak IDs
    no_instance = (0, '', '?', -1, -1)
    sorted_values = [id_values_dict.get(x, no_instance) for x in all_peak_ids]
    sorted_scores = [x[0] for x in sorted_values]
    sorted_sequences = [x[1] for x in sorted_values]
    sorted_strands = [x[2] for x in sorted_values]
    sorted_starts = [x[3] for x in sorted_values]
    sorted_ends = [x[4] for x in sorted_values]

    # count the number of motif instances; a Counter yields 0 for unseen peak ids
    id_count_dict = Counter(fimo_result_frame['peak_id'].values)
    sorted_counts = [id_count_dict[x] for x in all_peak_ids]
    del fimo_result_frame # release result frame from memory hopefully

    # The context manager blocks until the lock is free and always releases it.
    # (threading.Lock has no locked_lock() method, and polling before acquire() is racy.)
    with thread_lock:
        motif_score_dict[motif_name] = sorted_scores
        motif_sequence_dict[motif_name] = sorted_sequences
        motif_strand_dict[motif_name] = sorted_strands
        motif_start_dict[motif_name] = sorted_starts
        motif_end_dict[motif_name] = sorted_ends
        motif_count_dict[motif_name] = sorted_counts
    print('finished reading', motif_name )

    return None

In [None]:
# For every sequence set, read all of its FIMO result files (one thread per file),
# assemble one peaks x motifs table per quantity and pickle each table.
start = time.time()
fimo_result_path = './fimo_results/'
for fastaFile in os.listdir('./peak_sequences/'):
    if 'marge' in fastaFile:
        continue
    print('*****',fastaFile,'*****')
    strain = fastaFile.split('.')[0]

    # Align the motif tables to the peak table that belongs to this sequence set.
    # NOTE(review): the background sequences are assumed to carry the IDs of
    # summary_background_frame, and that frame is assumed to have 'Factors' and 'chr'
    # columns like summary_frame -- confirm. Previously every sequence set was aligned
    # to summary_frame's IDs.
    peak_frame = summary_background_frame if 'background' in strain else summary_frame
    all_peak_ids = peak_frame.index.values

    motif_score_dict = {}
    motif_sequence_dict = {}
    motif_strand_dict = {}
    motif_start_dict = {}
    motif_end_dict = {}
    motif_count_dict = {}
    thread_list = []
    lock = threading.Lock()
    for m in sorted(os.listdir(fimo_result_path)):
        # Result files are named <strain>_<motif>.txt; a prefix test cannot be fooled
        # by a motif name that merely contains '<strain>_'.
        if not m.startswith(strain + '_'):
            continue
        # The worker acquires and releases the lock itself, so nothing is unlocked here.
        current_thread = threading.Thread(target = read_fimo_file,
                         args =(lock,
                                os.path.join(fimo_result_path, m),
                                all_peak_ids,
                                motif_score_dict,
                                motif_sequence_dict,
                                motif_strand_dict,
                                motif_start_dict,
                                motif_end_dict,
                                motif_count_dict,
                                ))
        thread_list.append(current_thread)
        current_thread.start()
    for current_thread in thread_list:
        current_thread.join()

    # convert dictionaries to data frames
    peak_id_values = peak_frame['ID'].values
    motif_score_frame = pd.DataFrame(motif_score_dict, index = peak_id_values)
    motif_sequence_frame = pd.DataFrame(motif_sequence_dict, index = peak_id_values)
    motif_strand_frame = pd.DataFrame(motif_strand_dict, index = peak_id_values)
    motif_start_frame = pd.DataFrame(motif_start_dict, index = peak_id_values)
    motif_end_frame = pd.DataFrame(motif_end_dict, index = peak_id_values)
    motif_count_frame = pd.DataFrame(motif_count_dict, index = peak_id_values)

    # annotate every table with the peak metadata, then write it out
    for kind, frame in [('score', motif_score_frame),
                        ('sequence', motif_sequence_frame),
                        ('strand', motif_strand_frame),
                        ('start', motif_start_frame),
                        ('end', motif_end_frame),
                        ('count', motif_count_frame)]:
        frame['ID'] = peak_frame['ID'].values
        frame['Factors'] = peak_frame['Factors'].values
        frame['chr'] = peak_frame['chr'].values
        frame.to_pickle('motif_' + kind + '_unclustered_frame_' + strain + '.pickle')

end = time.time()
print('total time', end - start)






***** C57BL6J-background.fa *****
