# Strains Analysis

In [1]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"

### imports ###
import sys
import os
import pandas as pd
import numpy as np
import argparse
import matplotlib
import itertools
import scipy
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
import scipy
import pickle
from sklearn import preprocessing
import sklearn
from sklearn import decomposition
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
from sklearn import ensemble
from sklearn import neighbors
import networkx as nx
import matplotlib_venn
from sklearn.cross_validation import train_test_split
from random import shuffle

### notebook specific configuration ###
%matplotlib inline
sys.path.append("/home/jenhan/code/seq_merge_pipe/")
matplotlib.pylab.rcParams['savefig.dpi'] = 200
sys.setrecursionlimit(3000)
os.chdir('/home/jenhan/analysis/strains_analysis/')
sns.set_context('notebook')
%load_ext autoreload
%autoreload 2



## Generate Score Files

In [3]:
%%bash
# Make sure the output directory for the merged peak file exists.
mkdir -p ./group
# Merge every peak file; "-d given" requires direct overlap between peaks.
mergePeaks -d given -venn venn.txt ./peak_files/* > ./group/merged_peaks.tsv
# Integrate the per-sample scores for the merged peaks into ./group_summary.tsv.
makeSummaryFile.py ./group/merged_peaks.tsv ./group_summary.tsv ./peak_files/*



Finished reading merged peak file...
Integrating scores for balb_cebpa_kla
Integrating scores for balb_cebpa_veh
Integrating scores for balb_pu1_kla
Integrating scores for balb_pu1_veh
Integrating scores for c57bl6_cebpa_kla
Integrating scores for c57bl6_cebpa_veh
Integrating scores for c57bl6_pu1_kla
Integrating scores for c57bl6_pu1_veh


	Max distance to merge: direct overlap required (-d given)
	Merging peaks... 
	Comparing ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total) and ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total)
	Comparing ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total) and ./peak_files/balb_cebpa_veh_peaks.tsv (23349 total)
	Comparing ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total) and ./peak_files/balb_pu1_kla_peaks.tsv (60743 total)
	Comparing ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total) and ./peak_files/balb_pu1_veh_peaks.tsv (58546 total)
	Comparing ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total) and ./peak_files/c57bl6_cebpa_kla_peaks.tsv (23276 total)
	Comparing ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total) and ./peak_files/c57bl6_cebpa_veh_peaks.tsv (19599 total)
	Comparing ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total) and ./peak_files/c57bl6_pu1_kla_peaks.tsv (59749 total)
	Comparing ./peak_files/balb_cebpa_kla_peaks.tsv (24836 total) and ./peak_files/c57bl6_pu

## Annotate Peak File

In [None]:
%%bash
# Annotate the merged peak file for genome mm10; stdout is redirected to
# merged_annotated_peaks.tsv in the working directory.
# NOTE(review): the trailing "&" starts the command in the background, so completion of
# this cell may not guarantee that the output file is fully written. Verify before use.
annotatePeaks.pl ./group/merged_peaks.tsv mm10 > merged_annotated_peaks.tsv &



## Retrieve Peak Sequences

In [6]:
%%bash
# Run the sequence-retrieval wrapper on ./group/. According to the recorded cell output it
# calls extendPeaks.py on merged_peaks.tsv and then "homerTools extract" against mm10,
# writing the sequences to ./group/merged.fa.
# NOTE(review): the trailing 0 is forwarded to extendPeaks.py; presumably an extension
# size, but its meaning is not visible here. Confirm against the wrapper script.
/home/jenhan/code/motif_tools/getTargetSequencesWrapper.sh ./group/ ./group 0

python extendPeaks.py ./group//merged_peaks.tsv ./group//merged_extPeaks.tsv 0
homerTools extract ./group//merged_extPeaks.tsv /bioinformatics/homer/data/genomes/mm10 -fa > ./group/merged.fa



	Extracting sequences from directory: /bioinformatics/homer/data/genomes/mm10
	Extracting 6640 sequences from chr1
	Extracting 7677 sequences from chr2
	Extracting 4963 sequences from chr3
	Extracting 5370 sequences from chr4
	Extracting 5534 sequences from chr5
	Extracting 5464 sequences from chr6
	Extracting 5283 sequences from chr7
	Extracting 4314 sequences from chr8
	Extracting 4878 sequences from chr9
	Extracting 5302 sequences from chr10
	Extracting 6527 sequences from chr11
	Extracting 3727 sequences from chr12
	Extracting 4540 sequences from chr13
	Extracting 3674 sequences from chr14
	Extracting 3910 sequences from chr15
	Extracting 3459 sequences from chr16
	Extracting 4051 sequences from chr17
	Extracting 2779 sequences from chr18
	Extracting 3011 sequences from chr19
	Extracting 2156 sequences from chrX
	Extracting 2 sequences from chrY
	Extracting 5 sequences from chr4_GL456216_random
	Extracting 1 sequences from chr4_JH584295_random
	Extracting 1 sequences from chrUn_GL

In [7]:
### read in sequences as dictionary {peakID: sequence}
def _parse_fasta_lines(lines):
    """Build {sequence name: upper-cased sequence} from the lines of a FASTA file.

    Header lines start with '>' and the remainder of the line is the sequence name.
    Sequence lines that follow a header are concatenated, so wrapped (multi-line)
    records are kept whole. Blank lines are ignored so they cannot overwrite a
    sequence with an empty string. A name that appears twice keeps its last record.

    Raises:
        ValueError: if sequence data appears before the first header line.
    """
    sequences = {}
    current_name = None
    starts_new_record = False
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        if line[0] == '>':
            current_name = line[1:]
            starts_new_record = True
        elif current_name is None:
            raise ValueError('FASTA sequence data found before any ">" header line')
        elif starts_new_record:
            # First sequence line of a record replaces any earlier record of that name.
            sequences[current_name] = line.upper()
            starts_new_record = False
        else:
            sequences[current_name] += line.upper()
    return sequences


with open('./group/merged.fa') as f:
    data = f.readlines()

_id_sequence_dict = _parse_fasta_lines(data)
        

## Scan for Motifs using Fimo

In [12]:
# create a script to scan for motifs using FIMO
# Output directories for FIMO (relative to the current working directory).
for out_dir in ('./fimo_out', './fimo_results'):
    os.makedirs(out_dir, exist_ok=True)

pthresh = 0.01
motif_dir = '/home/jenhan/analysis/cobinding_motif_analysis/fimo_motifs/'
analysis_dir = '/home/jenhan/analysis/strains_analysis/'
fimo_results_dir = './fimo_results'

# Seconds by which each successive job is delayed. 0 starts every job immediately;
# raise it to stagger the start of the background jobs.
stagger_seconds = 0

count = 0
# The context manager guarantees the script is flushed and closed even if a write fails.
with open('scanMotifs.sh', 'w') as scriptFile:
    # Sorted so the generated script is identical from run to run.
    for m in sorted(os.listdir(motif_dir)):
        # Same filter as the cell that later reads the results back in.
        if '.fimo' not in m:
            continue
        motif_stem = m.replace('.fimo', '')
        fimo_out_dir = analysis_dir + 'fimo_out/' + motif_stem
        outPath = fimo_results_dir + '/merged_' + motif_stem + '.txt'
        # One backgrounded subshell per motif; FIMO's text output goes to outPath.
        scriptFile.write(
            '(sleep ' + str(stagger_seconds * count) +
            's; fimo --text --max-stored-scores 2000000 --output-pthresh ' +
            str(pthresh) + ' --oc ' + fimo_out_dir + ' ' +
            os.path.join(motif_dir, m) + ' ' + analysis_dir + 'group/merged.fa ' +
            '> ' + outPath + ' ) & \n')
        count += 1
    # Block until every background scan has finished so the script only returns
    # once all result files are complete.
    scriptFile.write('wait\n')




In [11]:
%%bash
# Make the generated FIMO script executable for all users, then run it from the
# current working directory.
chmod a+x ./scanMotifs.sh
./scanMotifs.sh

Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Read 1 motifs.
Using motif +spib of width 7.
Using motif -spib of width 7.
Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Read 1 motifs.
Using motif +duxa of width 13.
Using motif -duxa of width 13.
Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Read 1 motifs.
Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Read 1 motifs.
Reading txt file vmajor: 4, vminor: 4, vbug: 0
Using nucleotide alphabet (ACGTURYKMSWBDHVN).
Using background frequencies from NR sequence database.
Using motif +dux of width 8.
Read 1 motifs.
Reading txt file vmajor: 4, v

## Read in Score Files

### Read in peak scores

In [13]:
# read in peak data
def _max_score(value):
    """Return the largest number in a score cell.

    A cell holds either a single number or several comma-separated numbers;
    in the latter case the maximum is used.
    """
    return np.max([float(x) for x in value.split(',')])


summary_frame = pd.read_csv('./group_summary.tsv', sep='\t')
# Missing values are filled with the string '0' so they parse as a score of 0.
summary_frame = summary_frame.fillna('0')
# Columns from position 5 onwards are converted to one float per peak.
for col in summary_frame.columns[5:]:
    summary_frame[col] = [_max_score(val) for val in summary_frame[col].values.astype(str)]
summary_frame.index = summary_frame['ID'].values

# remove peaks in unknown/random chromosomes
summary_frame = summary_frame[~summary_frame['chr'].str.contains('random')]
summary_frame = summary_frame[~summary_frame['chr'].str.contains('Un')]

# Second '_'-separated token of every column name that contains an underscore.
_factors = sorted({x.split('_')[1] for x in summary_frame.columns if '_' in x})
summary_frame.to_pickle('summary_frame.pickle')

# Min-max scale each score column. Positional selection uses .iloc because the
# .ix indexer was removed in pandas 1.0.
scaler = sklearn.preprocessing.MinMaxScaler()
normed_summary_frame = pd.DataFrame(scaler.fit_transform(summary_frame.iloc[:, 5:]),
                                    columns=summary_frame.columns.values[5:],
                                    index=summary_frame.index.values)

### Read in motif scores

In [None]:

motif_dir = '/home/jenhan/analysis/cobinding_motif_analysis/fimo_motifs/'

# Column names applied to the FIMO text output (its own header row is skipped).
FIMO_COLUMNS = ['motif_name', 'peak_id', 'start', 'stop',
                'strand', 'score', 'pvalue', 'sequence']


def _new_motif_frame():
    """Return an independent ID/Factors/chr frame indexed by peak ID.

    A copy is taken so that adding motif columns never writes into a slice of
    summary_frame (which triggers SettingWithCopyWarning).
    """
    frame = summary_frame[['ID', 'Factors', 'chr']].copy()
    frame.index = frame['ID'].values
    return frame


def _summarize_fimo_hits(fimo_result_frame):
    """Collapse the FIMO hits of one motif to per-peak summaries.

    Returns:
        best_hit: {peak_id: (score, sequence, strand, start, end)} for the
            highest-scoring hit of each peak. Negative scores are clamped to 0
            and the first hit wins ties.
        hit_count: {peak_id: number of hit rows reported for that peak}.
    """
    best_hit = {}
    hit_count = {}
    hits = zip(fimo_result_frame['peak_id'].values,
               fimo_result_frame['score'].values,
               fimo_result_frame['sequence'].values,
               fimo_result_frame['strand'].values,
               fimo_result_frame['start'].values,
               fimo_result_frame['stop'].values)
    for peak_id, score, sequence, strand, start, end in hits:
        score = float(score)
        if score < 0.0:
            score = 0.0
        # Every row is one motif instance. Previously the counter started at 0 and
        # only advanced when a better score was seen, so it did not count hits.
        hit_count[peak_id] = hit_count.get(peak_id, 0) + 1
        if peak_id not in best_hit or score > best_hit[peak_id][0]:
            best_hit[peak_id] = (score, sequence, strand, int(start), int(end))
    return best_hit, hit_count


peak_start_dict = dict(zip(summary_frame['ID'].values, summary_frame['start'].values))

motif_score_frame = _new_motif_frame()
motif_sequence_frame = _new_motif_frame()
motif_strand_frame = _new_motif_frame()
motif_start_frame = _new_motif_frame()
motif_end_frame = _new_motif_frame()
motif_count_frame = _new_motif_frame()

# All six frames share the same peak IDs in the same order.
peak_ids = motif_score_frame['ID'].values

counter = 0
for m in sorted(os.listdir(motif_dir)):
    counter += 1
    if '.fimo' not in m:
        continue
    print(counter, m)
    motif_name = m.replace('.fimo', '')
    motif_results = './fimo_results//merged_' + motif_name + '.txt'
    fimo_result_frame = pd.read_csv(motif_results,
                                    skiprows=1,
                                    names=FIMO_COLUMNS,
                                    sep='\t')
    best_hit, hit_count = _summarize_fimo_hits(fimo_result_frame)

    # Peaks without any hit get a placeholder: 0 score/count, '?' sequence,
    # '' strand and -1 for the coordinates.
    motif_score_frame[motif_name] = [best_hit[x][0] if x in best_hit else 0 for x in peak_ids]
    motif_sequence_frame[motif_name] = [best_hit[x][1] if x in best_hit else '?' for x in peak_ids]
    motif_strand_frame[motif_name] = [best_hit[x][2] if x in best_hit else '' for x in peak_ids]
    # Hit coordinates are offset by the peak start taken from summary_frame.
    motif_start_frame[motif_name] = [best_hit[x][3] + peak_start_dict[x] if x in best_hit else -1 for x in peak_ids]
    motif_end_frame[motif_name] = [best_hit[x][4] + peak_start_dict[x] if x in best_hit else -1 for x in peak_ids]
    motif_count_frame[motif_name] = [hit_count.get(x, 0) for x in peak_ids]

# Persist every frame both as a pickle and as a tab-separated file.
for frame, pickle_name, tsv_name in [
        (motif_score_frame, 'motif_score_frame.pickle', 'motif_scores.tsv'),
        (motif_sequence_frame, 'motif_sequence_frame.pickle', 'motif_sequence.tsv'),
        (motif_strand_frame, 'motif_strand_frame.pickle', 'motif_strand.tsv'),
        (motif_start_frame, 'motif_start_frame.pickle', 'motif_start_frame.tsv'),
        (motif_end_frame, 'motif_end_frame.pickle', 'motif_end_frame.tsv'),
        (motif_count_frame, 'motif_count_frame.pickle', 'motif_count_frame.tsv')]:
    frame.to_pickle(pickle_name)
    frame.to_csv(tsv_name, sep='\t', index=False)

# Min-max scale each motif column (the first three columns are ID, Factors, chr).
# Positional selection uses .iloc because .ix was removed in pandas 1.0.
scaler = preprocessing.MinMaxScaler()
normed_motif_frame = pd.DataFrame(scaler.fit_transform(motif_score_frame.iloc[:, 3:]),
                                  columns=motif_score_frame.columns.values[3:],
                                  index=motif_score_frame.index.values)

# Divide each motif's scores by twice the longest matched-sequence length of that motif.
scaled_motif_frame = pd.DataFrame()
for m in motif_score_frame.columns[3:]:
    scaled_motif_frame[m] = motif_score_frame[m]/(2*motif_sequence_frame[m].str.len().max())

1 alx1_alx4_arx.fimo
2 ap-1.fimo
3 ar_nr3c1_nr3c2.fimo
4 arid3a.fimo
5 arid3b.fimo
6 arid5a.fimo
7 arnt_mycn.fimo
8

In [15]:
# Preview only the first rows instead of rendering the whole (very wide) frame.
scaled_motif_frame.head(10)

Unnamed: 0,alx1_alx4_arx,ap-1,ar_nr3c1_nr3c2,arid3a,arid3b,arid5a,arnt_mycn,arntl_mitf,ascl2_nhlh1,atf7_batf3_creb5,...,zbtb7,zeb1,zfx,zic,znf143,znf263,znf354c,znf410,znf423,znf740
Merged-chr17-34627665-3,0.125804,0.000000,0.000000,0.508086,0.174588,0.172317,0.063901,0.376679,0.304938,0.000000,...,0.329533,0.263295,0.327267,0.052839,0.000000,0.182261,0.596505,0.000000,0.714523,0.831465
Merged-chr16-57200743-3,0.053001,0.924300,0.000000,0.508086,0.343869,0.250259,0.057756,0.224335,0.329444,0.000000,...,0.354112,0.285146,0.274590,0.080844,0.000000,0.110720,0.375262,0.000000,0.000000,0.445052
Merged-chr9-70560828-3,0.000000,0.243004,0.027416,0.778647,0.044371,0.338366,0.315874,0.352108,0.314740,0.000000,...,0.000000,0.132187,0.134118,0.014333,0.000000,0.115336,0.678447,0.000000,0.003264,0.267618
Merged-chr5-30893945-3,0.000000,0.000000,0.000000,0.038584,0.165907,0.145207,0.285146,0.302964,0.167705,0.000000,...,0.059167,0.487271,0.102512,0.000000,0.000000,0.182261,0.858717,0.000000,0.000000,0.240017
Merged-chr16-87636040-3,0.000000,0.643352,0.310815,0.500128,0.313485,0.297701,0.000000,0.081818,0.226519,0.574300,...,0.063263,0.312461,0.274590,0.000000,0.000000,0.274571,0.858717,0.000000,0.000000,0.208474
Merged-chr7-98535313-3,0.058601,0.243004,0.000000,0.000000,0.000000,0.101154,0.113067,0.307878,0.270629,0.251354,...,0.231218,0.164964,0.084953,0.000000,0.103829,0.226108,0.678447,0.000000,0.000000,0.255790
Merged-chr13-67484493-3,0.170606,0.243004,0.024524,0.635408,0.382934,0.290924,0.475662,0.317707,0.211815,0.615179,...,0.227122,0.230518,0.264055,0.052839,0.000000,0.046104,0.621088,0.000000,0.000000,0.000000
Merged-chr3-90604533-3,0.000000,0.530974,0.000000,0.834350,0.387275,0.104542,0.113067,0.067075,0.589210,0.000000,...,0.198446,0.208667,0.527439,0.038837,0.000000,0.332264,0.694836,0.000000,0.009819,0.429281
Merged-chr17-26676427-3,0.000000,0.250028,0.000000,0.778647,0.000000,0.000000,0.303583,0.656800,0.285334,0.000000,...,0.292665,0.678472,0.351850,0.038837,0.103829,0.426881,0.858717,0.000000,0.294979,0.342535
Merged-chr1-97769642-3,0.263009,0.000000,0.000000,0.452382,0.000000,0.223149,0.899713,0.524110,0.309839,0.000000,...,0.075553,0.252369,0.334291,0.185860,0.000000,0.156875,0.858717,0.000000,0.039319,0.393794


## Make Truth Table

In [21]:
motif_1 = 'spi1-c'
motif_2 = 'cebp'
factor_1 = 'c57bl6_pu1_veh'
factor_2 = 'c57bl6_cebpa_veh'


def _build_truth_table(summary_frame, motif_score_frame, factor_1, factor_2, motif_1, motif_2):
    """Count peaks for every combination of factor binding and motif presence.

    A peak counts as bound by a factor when the factor name occurs in its
    'Factors' entry in summary_frame, and as having a motif when its score for
    that motif in motif_score_frame is greater than 0.

    Returns a 16-row DataFrame with one boolean column per state (binding of
    factor_1 and factor_2, presence of motif_1 and motif_2) and a 'num peaks'
    column, iterating each state as True then False in that column order.
    """
    bound_1 = summary_frame['Factors'].str.contains(factor_1)
    bound_2 = summary_frame['Factors'].str.contains(factor_2)
    has_motif_1 = motif_score_frame[motif_1] > 0
    has_motif_2 = motif_score_frame[motif_2] > 0

    rows = []
    for peak_state_1 in [True, False]:
        for peak_state_2 in [True, False]:
            # The peak selection depends only on the binding states, so it is
            # computed once per binding combination rather than per motif state.
            peak_indices = summary_frame[(bound_1 == peak_state_1) &
                                         (bound_2 == peak_state_2)].index.values
            in_peak_set = motif_score_frame.index.isin(peak_indices)
            for motif_state_1 in [True, False]:
                for motif_state_2 in [True, False]:
                    matches = ((has_motif_1 == motif_state_1) &
                               (has_motif_2 == motif_state_2) &
                               in_peak_set)
                    rows.append((peak_state_1, peak_state_2,
                                 motif_state_1, motif_state_2,
                                 int(matches.sum())))

    return pd.DataFrame(rows, columns=[factor_1 + ' binding',
                                       factor_2 + ' binding',
                                       motif_1 + ' motif',
                                       motif_2 + ' motif',
                                       'num peaks'])


current_count_frame = _build_truth_table(summary_frame, motif_score_frame,
                                         factor_1, factor_2, motif_1, motif_2)

In [23]:
# Write the truth table to a tab-separated file in the working directory.
# NOTE(review): no index= argument is passed here, unlike the motif frame exports in
# this notebook, which pass index=False; confirm the index column is wanted in this file.
current_count_frame.to_csv('cebpa_pu1_truthTable.tsv', sep='\t')