# Signals Analysis

In [1]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import threading
import matplotlib.pyplot as plt 
import seaborn as sns
from collections import Counter
# Save figures at 200 dpi. rcParams is reached through the top-level
# matplotlib package: `matplotlib.pylab` is only available as an attribute
# when some other import has already loaded that submodule, which
# `import matplotlib` / `import matplotlib.pyplot` do not guarantee.
matplotlib.rcParams['savefig.dpi'] = 200
# Raise the interpreter recursion limit to 3000.
# NOTE(review): no cell visible here needs deep recursion — confirm this is still required.
sys.setrecursionlimit(3000)
import pickle
from sklearn import preprocessing
import time
# Apply seaborn's 'talk' plotting context to figures drawn after this point.
sns.set_context('talk')

In [2]:
# Directory that every relative path in this notebook is resolved against.
# Defaults to the original analysis location; set the SIGNALS_ANALYSIS_DIR
# environment variable to run the notebook from a different checkout/machine.
workingDirectory = os.environ.get(
    'SIGNALS_ANALYSIS_DIR',
    '/home/jtao/analysis/signals_analysis/',
)
os.chdir(workingDirectory)

## Identify Accessible Promoters

In [3]:
# Merge the KLA-1h, IFNg-1h and vehicle peak files into one table, written to
# ./merged_atac_veh_ifng_kla_peaks.tsv via stdout redirection.
# `-d given` requires direct overlap for peaks to be merged (see the tool's
# log output below). `-venn venn.txt` presumably writes overlap counts to
# venn.txt — confirm against the mergePeaks documentation.
! mergePeaks -d given -venn venn.txt ./resized_peak_files/c57bl6_kla-1h_peaks.tsv ./resized_peak_files/c57bl6_ifng-1h_peaks.tsv ./resized_peak_files/c57bl6_veh_peaks.tsv > ./merged_atac_veh_ifng_kla_peaks.tsv

	Max distance to merge: direct overlap required (-d given)
	Merging peaks... 
	Comparing ./resized_peak_files/c57bl6_kla-1h_peaks.tsv (32895 total) and ./resized_peak_files/c57bl6_kla-1h_peaks.tsv (32895 total)
	Comparing ./resized_peak_files/c57bl6_kla-1h_peaks.tsv (32895 total) and ./resized_peak_files/c57bl6_ifng-1h_peaks.tsv (30129 total)
	Comparing ./resized_peak_files/c57bl6_kla-1h_peaks.tsv (32895 total) and ./resized_peak_files/c57bl6_veh_peaks.tsv (29766 total)
	Comparing ./resized_peak_files/c57bl6_ifng-1h_peaks.tsv (30129 total) and ./resized_peak_files/c57bl6_kla-1h_peaks.tsv (32895 total)
	Comparing ./resized_peak_files/c57bl6_ifng-1h_peaks.tsv (30129 total) and ./resized_peak_files/c57bl6_ifng-1h_peaks.tsv (30129 total)
	Comparing ./resized_peak_files/c57bl6_ifng-1h_peaks.tsv (30129 total) and ./resized_peak_files/c57bl6_veh_peaks.tsv (29766 total)
	Comparing ./resized_peak_files/c57bl6_veh_peaks.tsv (29766 total) and ./resized_peak_files/c57bl6_kla-1h_peaks.tsv (32895 to

In [None]:
# Annotate the merged peaks against the mm10 genome; the annotated table
# (read by the next cell) is written to ./annotated_atac_veh_ifng_kla_peaks.tsv.
# NOTE(review): this cell has no execution count, so the annotated file on
# disk may predate the current merged peak file — re-run before trusting it.
! annotatePeaks.pl ./merged_atac_veh_ifng_kla_peaks.tsv mm10 > ./annotated_atac_veh_ifng_kla_peaks.tsv

In [23]:
# Read the annotated peak table and index it by its first column (the peak ID).
# `.iloc` replaces the `.ix` indexer, which was removed in pandas 1.0.
annotated_frame = pd.read_csv('./annotated_atac_veh_ifng_kla_peaks.tsv', sep='\t')
annotated_frame.index = annotated_frame.iloc[:, 0]

# Keep only the first space-delimited token of each annotation string
# (missing annotations become the literal string 'nan' via astype(str)).
annotated_frame['Annotation'] = [x.split(' ')[0] for x in annotated_frame['Annotation'].astype(str)]

# Select promoter-annotated rows once and take both the peak IDs and the gene
# names from that same selection, so the id -> gene pairing stays row-aligned
# even if a peak ID occurs more than once.
is_promoter = annotated_frame['Annotation'].str.contains('promoter')
promoter_rows = annotated_frame[is_promoter]
promoter_ids = promoter_rows.iloc[:, 0].values

# Map each promoter peak ID to the gene name reported for it.
id_gene_dict = dict(zip(promoter_ids, promoter_rows['Gene Name'].values))

In [31]:
# Load the merged peak table and index it by the values of its first column,
# which are matched against promoter_ids below.
# `.iloc` replaces the `.ix` indexer, which was removed in pandas 1.0.
merged_frame = pd.read_csv('./merged_atac_veh_ifng_kla_peaks.tsv', sep='\t')
merged_frame.index = merged_frame.iloc[:, 0].values

# filter away peaks in unused chromosomes: any 'chr' value containing
# chrY, chrM, random or Un is dropped
in_unused_chromosome = merged_frame['chr'].str.contains('chrY|chrM|random|Un')
filtered_frame = merged_frame[~in_unused_chromosome]

# NOTE(review): merged_frame is rebound here to a *different* file than the one
# filtered above; filtered_frame is unaffected. This looks like a leftover from
# an earlier revision — kept because later cells may rely on it; confirm and remove.
merged_frame = pd.read_csv('./merged_atac_peaks.tsv', sep='\t')

# Keep only promoter-associated peaks. The explicit copy makes the column
# assignments below write to an independent frame instead of a slice of
# merged data (avoids pandas' SettingWithCopy hazard).
filtered_frame = filtered_frame[filtered_frame.index.isin(promoter_ids)].copy()

# resize peaks to a fixed width centred on the (rounded-up) midpoint
target_peak_size = 200
half_peak_size = target_peak_size // 2
peakCenters = (filtered_frame['start'] + filtered_frame['end']) / 2
int_peakCenters = np.ceil(peakCenters).astype(int)
new_starts = int_peakCenters - half_peak_size
new_ends = int_peakCenters + half_peak_size
filtered_frame['start'] = new_starts
filtered_frame['end'] = new_ends
filtered_frame.to_csv('./merged_atac_peaks_promoters.tsv', sep='\t', index=False)

In [33]:
# Build the per-condition score summary for the resized promoter peaks.
# Presumably the arguments are: merged peak file, output summary path, then
# one peak file per condition — confirm against makeSummaryFile.py's usage.
# The log below shows scores being integrated for kla-1h, ifng-1h and veh.
! makeSummaryFile.py ./merged_atac_peaks_promoters.tsv ./group_atac_promoter_summary.tsv ./resized_peak_files/c57bl6_kla-1h_peaks.tsv ./resized_peak_files/c57bl6_ifng-1h_peaks.tsv ./resized_peak_files/c57bl6_veh_peaks.tsv

Finished reading merged peak file...
Integrating scores for c57bl6_kla-1h
Integrating scores for c57bl6_ifng-1h
Integrating scores for c57bl6_veh


In [34]:
# Load the promoter peak summary; missing entries become the string '0'.
summary_frame = pd.read_csv('./group_atac_promoter_summary.tsv', sep='\t').fillna('0')

# Every column from the sixth onward is converted to floats. An entry holding
# several comma-separated values collapses to the mean of those values; any
# other entry is parsed as a single float.
for score_column in summary_frame.columns[5:]:
    raw_entries = summary_frame[score_column].values.astype(str)
    summary_frame[score_column] = [
        np.mean([float(piece) for piece in entry.split(',')]) if ',' in entry else float(entry)
        for entry in raw_entries
    ]

# Index the table by peak ID.
summary_frame.index = summary_frame['ID'].values


## Calculate Motif Scores

In [49]:
%%bash
# Convert the promoter peak table to BED for sequence extraction.
# The first line of pos2bed.pl's output is dropped (presumably a header/track
# line — confirm). The conversion is piped straight into tail so no fixed-name
# scratch file (previously ./tmp) is created or deleted in the working
# directory, and the cell stops with an error if either command fails.
set -eo pipefail
pos2bed.pl ./merged_atac_peaks_promoters.tsv | tail -n +2 > ./merged_atac_peaks_promoters.bed


	Converted 11276 peaks total



In [51]:
# Extract mm10 genomic sequence for each promoter peak in the BED file and
# write it to ./merged_atac_peaks_promoters.fasta.
# NOTE(review): the script is referenced by an absolute path in a personal
# home directory; the notebook will not run elsewhere without adjusting it.
!/gpfs/data01/glasslab/home/jtao/code/tba/extract_sequences.py ./merged_atac_peaks_promoters.bed mm10 ./merged_atac_peaks_promoters.fasta

reading genome mm10


In [52]:
# calculate motif scores
!/gpfs/data01/glasslab/home/jtao/code/tba/calculate_all_motif_scores.py -num_procs 50 ./merged_atac_peaks_promoters.fasta ./motif_scores_promoters /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/jaspar_2016_curated_homerFormat/*motif 

arnt_mycn calculation time: 154.56569361686707
alx1_alx4_arx calculation time: 156.1815402507782
ar_nr3c1_nr3c2 calculation time: 156.1735589504242
bcl6b calculation time: 153.7626292705536
ap-1 calculation time: 157.42601561546326
arid3b calculation time: 160.17927384376526
arntl_mitf calculation time: 159.66860842704773
bcl6 calculation time: 158.16161274909973
atf7_batf3_creb5 calculation time: 159.45191478729248
ctcf calculation time: 155.91757535934448
creb3-l1 calculation time: 156.87389707565308
ewsr1-fli1 calculation time: 149.93628191947937
cdx calculation time: 158.00419235229492
bhlhe23 calculation time: 158.63525700569153
esr1 calculation time: 151.51000308990479
elk_etv calculation time: 151.9554738998413
bhlh calculation time: 159.9209144115448
dmbx1 calculation time: 156.35902786254883
arid5a calculation time: 163.18458771705627
cenpb calculation time: 159.456316947937
glis calculation time: 149.40900897979736
ascl2_nhlh1 calculation time: 163.51305603981018
arid3a calcu