# AP-1 FDR Analysis

## Imports

In [1]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
matplotlib.pylab.rcParams['savefig.dpi'] = 200
sys.setrecursionlimit(3000)
sns.set_context('talk')

In [14]:
# Locations used throughout the notebook. All relative paths in later cells
# (e.g. ./peak_files, ./filtered_peak_files) resolve against workingDirectory
# because the notebook changes into it here.
workingDirectory = '/home/jtao/analysis/ap1_fdr_analysis/'
# makedirs also creates missing parent directories and, with exist_ok, does not
# fail when the directory is already there
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)
# where the per-sample peak files are written
peakDirectory = '/home/jtao/analysis/ap1_fdr_analysis/peak_files/'
# directory holding one tag directory per sample
tagDirPath = '/home/jtao/analysis/cobinding_motif_analysis/tag_directories_ap1/'

In [45]:
# Summarize every tag directory under tagDirPath: parse the sample metadata
# encoded in the directory names, read the mapping statistics stored inside
# each directory, and combine both into summary_frame / mapping_summary_frame.

os.chdir(workingDirectory)

### parse metadata from tag directory names ###

# fields encoded, in this order, in the underscore-delimited tag directory names
metadataColumns = ['strain',
                   'cellType',
                   'experiment',
                   'factor',
                   'treatment',
                   'owner',
                   'date']

sampleNames = sorted(os.listdir(tagDirPath))

metadataRows = []
for sample in sampleNames:
    metaDataTokens = sample.strip().split("_")
    if len(metaDataTokens) < len(metadataColumns):
        raise ValueError('cannot parse metadata from tag directory name: ' + sample)
    # tokens after the date field are not used
    metadataRows.append(metaDataTokens[:len(metadataColumns)])

metadata_frame = pd.DataFrame(metadataRows, columns=metadataColumns)
metadata_frame['sampleName'] = sampleNames

# per-field lists, kept under their original names for any cell that still uses them
strains = list(metadata_frame['strain'])
cellTypes = list(metadata_frame['cellType'])
experiments = list(metadata_frame['experiment'])
factors = list(metadata_frame['factor'])
treatments = list(metadata_frame['treatment'])
owners = list(metadata_frame['owner'])
dates = list(metadata_frame['date'])

# the simple treatment is the part of the treatment token before the first '-'
simpleTreatments = [t.split('-')[0] for t in treatments]
metadata_frame['simpleTreatment'] = simpleTreatments

### read in log files ###

# Collect all statistics for a sample in a single pass so that every value in a
# row is guaranteed to belong to the same sample.
mappingStatRows = []
for sample in sampleNames:
    # tag directories whose name contains 'nput', 'p65' or 'cebp' are left out
    if 'nput' in sample or 'p65' in sample or 'cebp' in sample:
        continue
    sampleDir = os.path.join(tagDirPath, sample)

    # find mapping log file (sorted so the choice is deterministic)
    logFiles = sorted(x for x in os.listdir(sampleDir) if '.log' in x)
    if not logFiles:
        raise IOError('no mapping log file found in ' + sampleDir)
    with open(os.path.join(sampleDir, logFiles[0])) as f:
        data = f.readlines()

    # the first token of each of the first five log lines is read as a count
    (totalReads,
     unpairedReads,
     unmappedReads,
     uniquelyMappedReads,
     multiMappedReads) = [float(line.split()[0]) for line in data[:5]]

    # tags per position is the value after '=' on the sixth line of tagInfo.txt
    with open(os.path.join(sampleDir, 'tagInfo.txt')) as f:
        tagInfoLines = f.readlines()
    tpp = float(tagInfoLines[5].strip().split('=')[1])

    mappingStatRows.append([sample,
                            totalReads,
                            unpairedReads,
                            unmappedReads,
                            uniquelyMappedReads,
                            multiMappedReads,
                            tpp])

mappingStats_frame = pd.DataFrame(mappingStatRows,
                                  columns=['sampleName',
                                           'totalReads',
                                           'unpairedReads',
                                           'unmappedReads',
                                           'uniquelyMappedReads',
                                           'multiMappedReads',
                                           'tagsPerPosition'])

# calculate fractions from read counts
mappingStats_frame['uniquelyMappedFraction'] = (mappingStats_frame['uniquelyMappedReads']
                                                / mappingStats_frame['totalReads'])
mappingStats_frame['mappedFraction'] = ((mappingStats_frame['uniquelyMappedReads']
                                         + mappingStats_frame['multiMappedReads'])
                                        / mappingStats_frame['totalReads'])

# inner merge: only samples that have mapping statistics appear in the summary
summary_frame = metadata_frame.merge(mappingStats_frame, on='sampleName')
summary_frame.index = pd.MultiIndex.from_arrays([list(summary_frame['strain'].values),
                                                 list(summary_frame['factor'].values),
                                                 list(summary_frame['simpleTreatment'].values)])
# DataFrame.sort() no longer exists in pandas; sort_index() orders rows by the MultiIndex
mapping_summary_frame = summary_frame.sort_index()



In [26]:
# Drop samples that fail the mapping-quality thresholds, report how many were
# discarded, and give every remaining sample a short unique name for output files.

# minimum (uniquely mapped + multi-mapped) / total reads
mappedFractionThreshold = 0.0
# minimum number of uniquely mapped reads
uniquelyMappedReadThreshold = 1000000

# apply both thresholds with one mask, then copy so that adding columns below
# modifies an independent frame rather than a slice of mapping_summary_frame
passesThresholds = ((mapping_summary_frame['mappedFraction'] >= mappedFractionThreshold) &
                    (mapping_summary_frame['uniquelyMappedReads'] >= uniquelyMappedReadThreshold))
filtered_summary_frame = mapping_summary_frame[passesThresholds].copy()

# get samples that were discarded
wasDiscarded = ~summary_frame['sampleName'].isin(filtered_summary_frame['sampleName'])
discarded_summary_frame = summary_frame[wasDiscarded]
discardedSampleNames = list(discarded_summary_frame['sampleName'].values)
print("Number of Samples:", summary_frame.shape[0])
print("Number of discarded samples:",discarded_summary_frame.shape[0])
print("Number of Samples remaining after filtering:", filtered_summary_frame.shape[0])

# generate simplified name for naming output files:
# <strain>_<factor>_<simpleTreatment>_<date>_<n>, lower-cased, where n counts the
# samples sharing the same root
simpleNameRoots = (filtered_summary_frame['strain']
                   + '_' + filtered_summary_frame['factor']
                   + '_' + filtered_summary_frame["simpleTreatment"]
                   + '_' + filtered_summary_frame['date'])

factorTreatment_count_dict = {} #{lower-cased name root:count}
simplifiedNames = []
# count on the lower-cased root: the final name is lower-cased, so roots that
# differ only in case must share one counter to keep the names unique
for simpleNameRoot in simpleNameRoots.str.lower():
    factorTreatment_count_dict[simpleNameRoot] = factorTreatment_count_dict.get(simpleNameRoot, 0) + 1
    simplifiedNames.append(simpleNameRoot + '_' + str(factorTreatment_count_dict[simpleNameRoot]))

filtered_summary_frame["simplifiedName"] = simplifiedNames

# lookup tables in both directions between tag directory names and simplified names
originalName_simpleName_dict = dict(zip(filtered_summary_frame['sampleName'].values,
                                       filtered_summary_frame['simplifiedName'].values))
simpleName_originalName_dict = dict(zip(filtered_summary_frame['simplifiedName'].values,
                                       filtered_summary_frame['sampleName'].values))

Number of Samples: 36
Number of discarded samples: 0
Number of Samples remaining after filtering: 36


## Peak Calling

In [41]:
### call peaks ###
# Write ./peakCalling_homer.sh, which runs findPeaks once for every sample that
# passed thresholding. The script itself is executed by a later cell.
if not os.path.exists(peakDirectory):
    os.makedirs(peakDirectory)

# input tag directory to use as the findPeaks -i argument, selected by a
# case-insensitive substring match on the sample's treatment token; the pairs
# are checked in this order and the first match wins
treatment_inputDir_pairs = [
    ('veh', '/home/jtao/analysis/ap1_analysis/input_data/C57Bl6_Thiomac_ChIP_Input_Veh_GJF_15-03-20'),
    ('kla', '/home/jtao/analysis/ap1_analysis/input_data/C57Bl6_Thiomac_ChIP_Input_KLA-1h_GJF_15-03-20'),
]

# build every command before touching the script file so that a bad sample name
# cannot leave a half-written script behind
peakCallingCommands = []
for tagDir in filtered_summary_frame['sampleName'].values:
    metaDataTokens = tagDir.split("_")
    treatment = metaDataTokens[4]

    matchingInputDirs = [path for key, path in treatment_inputDir_pairs
                         if key in treatment.lower()]
    if not matchingInputDirs:
        # fail loudly instead of reusing the input chosen for the previous sample
        raise ValueError('no input directory known for treatment "' + treatment
                         + '" (sample ' + tagDir + ')')
    inputDir = matchingInputDirs[0]

    # make peak files with simplified names
    peakFileName = originalName_simpleName_dict[tagDir] + "_default_peaks.tsv"

    # each command ends in '&' so the peak calls run concurrently
    peakCallingCommands.append('findPeaks ' + tagDirPath + '/' + tagDir +
                               ' -i ' + inputDir
                               + ' -style factor -size 200 -norm 1e6 > ' +
                               peakDirectory +'/' + peakFileName + ' &\n')

with open('./peakCalling_homer.sh', 'w') as scriptFile:
    scriptFile.writelines(peakCallingCommands)
    # block until every background findPeaks job has finished, so the peak files
    # are complete when the script returns
    scriptFile.write('wait\n')

In [None]:
%%bash
# Clear out peak files from any previous run, then run the generated script.
# -f keeps rm quiet when ./peak_files is still empty (e.g. on a first run).
rm -f ./peak_files/*
chmod a+x ./*sh
bash ./peakCalling_homer.sh

## Peak Filtering

In [48]:
# Keep only peaks that lie on one of the chromosomes listed in chroms and meet
# the minimum normalized tag count, and write them to ./filtered_peak_files/
# under shorter names: <strain>_<factor>_<treatment>_rep<n>_peaks.tsv

# chr1-chr19 and chrX, in lexicographic order
chroms = sorted('chr' + str(i) for i in range(1, 20)) + ['chrX']

# peaks with a 'Normalized Tag Count' below this value are dropped
# NOTE(review): the previous comment on this cell mentioned 16 normalized tags,
# but the filter in code is >= 1 -- confirm the intended cutoff
min_normalized_tag_count = 1

# number of lines skipped before the column header of each peak file
# (presumably the length of the findPeaks header preamble -- TODO confirm)
peak_file_header_rows = 39

seen_conditions = {} # {condition: number of replicates written so far}
filtered_peak_directory = './filtered_peak_files/'
if not os.path.isdir(filtered_peak_directory):
    os.mkdir(filtered_peak_directory)

# sorted so that replicate numbers are assigned the same way on every run
for f in sorted(os.listdir(peakDirectory)):
    # ignore anything that is not a peak file written by the peak calling step
    if not f.endswith('_default_peaks.tsv'):
        continue
    print(f)
    tokens = f.split('_')
    condition = tokens[0] +'_'+ tokens[1] + '_' + tokens[2]

    # number replicates per condition; a third or later replicate gets its own
    # file instead of overwriting rep2
    seen_conditions[condition] = seen_conditions.get(condition, 0) + 1
    new_name = os.path.join(filtered_peak_directory,
                            condition + '_rep' + str(seen_conditions[condition]) + '_peaks.tsv')

    current_frame = pd.read_csv(os.path.join(peakDirectory, f),
                                sep = '\t',
                                skiprows=peak_file_header_rows)
    filtered_frame = current_frame[(current_frame['chr'].isin(chroms)) &
                                  (current_frame['Normalized Tag Count'] >= min_normalized_tag_count)]
    filtered_frame.to_csv(new_name, sep='\t', index=False)

c57bl6_atf3_kla_16-04-21_1_default_peaks.tsv
c57bl6_jund_kla_16-04-21_1_default_peaks.tsv
c57bl6_cjun_veh_16-04-21_1_default_peaks.tsv
c57bl6_fos_kla_14-03-17_1_default_peaks.tsv
c57bl6_fra2_veh_14-03-17_1_default_peaks.tsv
c57bl6_atf3_kla_16-08-16_1_default_peaks.tsv
c57bl6_jund_veh_16-07-23_1_default_peaks.tsv
c57bl6_fos_kla_15-02-06_1_default_peaks.tsv
c57bl6_jund_kla_15-11-18_1_default_peaks.tsv
c57bl6_pu1_kla_11-05-12_1_default_peaks.tsv
c57bl6_atf3_veh_16-07-23_1_default_peaks.tsv
c57bl6_fos_veh_14-03-17_1_default_peaks.tsv
c57bl6_cjun_veh_16-06-14_1_default_peaks.tsv
c57bl6_jund_veh_16-04-21_1_default_peaks.tsv
c57bl6_cjun_kla_16-04-21_1_default_peaks.tsv
c57bl6_junb_kla_14-03-17_1_default_peaks.tsv
c57bl6_fra2_kla_16-04-21_1_default_peaks.tsv
c57bl6_junb_kla_15-02-06_1_default_peaks.tsv
c57bl6_junb_veh_15-02-06_1_default_peaks.tsv
c57bl6_pu1_veh_11-05-12_1_default_peaks.tsv
c57bl6_fos_veh_15-02-06_1_default_peaks.tsv
c57bl6_pu1_veh_16-06-14_1_default_peaks.tsv
c57bl6_fra2_kla_1

## Copy IDR Peak Files

In [49]:
%%bash
# Rebuild ./idr_peak_files from scratch with the c57bl6 peak files of the
# cobinding motif analysis, leaving out the p65 and cebp files.
# Stop at the first failing command so that a failed copy is not silently ignored.
set -e
mkdir -p ./idr_peak_files
# -f: an empty directory or a pattern that matches nothing is not an error
rm -f ./idr_peak_files/*
cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/peak_files/c57bl6* ./idr_peak_files/
rm -f ./idr_peak_files/*p65* ./idr_peak_files/*cebp*

## Merge Peaks

In [50]:
# list the peak files that were copied into ./idr_peak_files/
!ls ./idr_peak_files/

c57bl6_atf3_kla_peaks.tsv  c57bl6_fos_veh_peaks.tsv   c57bl6_jund_kla_peaks.tsv
c57bl6_atf3_veh_peaks.tsv  c57bl6_fra2_kla_peaks.tsv  c57bl6_jund_veh_peaks.tsv
c57bl6_cjun_kla_peaks.tsv  c57bl6_fra2_veh_peaks.tsv  c57bl6_pu1_kla_peaks.tsv
c57bl6_cjun_veh_peaks.tsv  c57bl6_junb_kla_peaks.tsv  c57bl6_pu1_veh_peaks.tsv
c57bl6_fos_kla_peaks.tsv   c57bl6_junb_veh_peaks.tsv


In [51]:
# list the per-replicate peak files written to ./filtered_peak_files/
!ls ./filtered_peak_files/

c57bl6_atf3_kla_rep1_peaks.tsv	c57bl6_fra2_veh_rep1_peaks.tsv
c57bl6_atf3_kla_rep2_peaks.tsv	c57bl6_fra2_veh_rep2_peaks.tsv
c57bl6_atf3_veh_rep1_peaks.tsv	c57bl6_junb_kla_rep1_peaks.tsv
c57bl6_atf3_veh_rep2_peaks.tsv	c57bl6_junb_kla_rep2_peaks.tsv
c57bl6_cjun_kla_rep1_peaks.tsv	c57bl6_junb_veh_rep1_peaks.tsv
c57bl6_cjun_kla_rep2_peaks.tsv	c57bl6_junb_veh_rep2_peaks.tsv
c57bl6_cjun_veh_rep1_peaks.tsv	c57bl6_jund_kla_rep1_peaks.tsv
c57bl6_cjun_veh_rep2_peaks.tsv	c57bl6_jund_kla_rep2_peaks.tsv
c57bl6_fos_kla_rep1_peaks.tsv	c57bl6_jund_veh_rep1_peaks.tsv
c57bl6_fos_kla_rep2_peaks.tsv	c57bl6_jund_veh_rep2_peaks.tsv
c57bl6_fos_veh_rep1_peaks.tsv	c57bl6_pu1_kla_rep1_peaks.tsv
c57bl6_fos_veh_rep2_peaks.tsv	c57bl6_pu1_kla_rep2_peaks.tsv
c57bl6_fra2_kla_rep1_peaks.tsv	c57bl6_pu1_veh_rep1_peaks.tsv
c57bl6_fra2_kla_rep2_peaks.tsv	c57bl6_pu1_veh_rep2_peaks.tsv


## Generate Backgrounds

## Calculate Motif Scores

## Train Models

## Compare Models

In [None]:
for idr_f in os.listdir('./idr_peak_files/')