# AP-1 FDR Analysis

## Imports

In [None]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
# Notebook-wide plotting / interpreter settings.
# rcParams is read from the top-level matplotlib package: the
# `matplotlib.pylab` attribute only exists when some other code has already
# imported that submodule, which `import matplotlib` / `import
# matplotlib.pyplot` do not guarantee.
matplotlib.rcParams['savefig.dpi'] = 200  # resolution used by savefig
sys.setrecursionlimit(3000)  # raise the interpreter's recursion limit
sns.set_context('talk')  # seaborn scaling preset for figure elements

In [None]:
# Directories used throughout the notebook.
workingDirectory = '/home/jtao/analysis/ap1_fdr_analysis/'
# makedirs(exist_ok=True) creates missing parent directories as well and does
# not fail when the directory already exists (no check-then-create race).
os.makedirs(workingDirectory, exist_ok=True)
# all relative paths used in later cells are resolved against this directory
os.chdir(workingDirectory)
# where the peak files produced by this notebook are written
peakDirectory = '/home/jtao/analysis/ap1_fdr_analysis/peak_files/'
# one sub-directory per sample; directory names carry the sample metadata
tagDirPath = '/home/jtao/analysis/cobinding_motif_analysis/tag_directories_ap1/'

In [None]:
# Build metadata_frame: one row per tag directory, holding the metadata fields
# that are encoded in the underscore-delimited directory name.

os.chdir(workingDirectory)

# tag directory names, sorted so the row order is reproducible
sampleNames = sorted(os.listdir(tagDirPath))
tokenizedNames = [name.strip().split("_") for name in sampleNames]

# positional fields of each name:
# strain_cellType_experiment_factor_treatment_owner_date
strains = [tokens[0] for tokens in tokenizedNames]
cellTypes = [tokens[1] for tokens in tokenizedNames]
experiments = [tokens[2] for tokens in tokenizedNames]
factors = [tokens[3] for tokens in tokenizedNames]
treatments = [tokens[4] for tokens in tokenizedNames]
owners = [tokens[5] for tokens in tokenizedNames]
dates = [tokens[6] for tokens in tokenizedNames]

metadata_frame = pd.DataFrame(
    {
        'strain': strains,
        'cellType': cellTypes,
        'experiment': experiments,
        'factor': factors,
        'treatment': treatments,
        'owner': owners,
        'date': dates,
        'sampleName': sampleNames,
    },
    columns=['strain',
             'cellType',
             'experiment',
             'factor',
             'treatment',
             'owner',
             'date',
             'sampleName'])

# the simple treatment is the part of the treatment token before the first '-'
simpleTreatments = [t.split('-')[0] for t in metadata_frame['treatment'].values]
metadata_frame['simpleTreatment'] = simpleTreatments

### read in log files ###

# Build mappingStats_frame: one row per sample with the read counts taken from
# the sample's mapping log and the tags-per-position value from tagInfo.txt.
#
# Both files are read inside the SAME loop so that every value in a row
# belongs to the same sample. (Collecting tagInfo.txt values in a separate
# pass over the unsorted, unfiltered directory listing pairs them with the
# wrong samples and yields lists of different lengths.)
_mappingRecords = []
for sample in sorted(os.listdir(tagDirPath)):
    # skip input controls and the p65 / cebp experiments
    if 'nput' in sample or 'p65' in sample or 'cebp' in sample:
        continue
    sampleDir = os.path.join(tagDirPath, sample)

    # mapping log: the first whitespace-delimited token of each of the first
    # five lines is read as a count
    # NOTE(review): the line order (total, unpaired, unmapped, unique, multi)
    # is assumed from the variable names - confirm against the aligner log.
    logFile = sorted(x for x in os.listdir(sampleDir) if '.log' in x)[0]
    with open(os.path.join(sampleDir, logFile)) as f:
        logLines = f.readlines()

    # tagInfo.txt: the value after '=' on the sixth line
    # NOTE(review): presumably the tags-per-position entry - confirm.
    with open(os.path.join(sampleDir, 'tagInfo.txt')) as f:
        tagInfoLines = f.readlines()

    _mappingRecords.append({
        'sampleName': sample,
        'totalReads': float(logLines[0].split()[0]),
        'unpairedReads': float(logLines[1].split()[0]),
        'unmappedReads': float(logLines[2].split()[0]),
        'uniquelyMappedReads': float(logLines[3].split()[0]),
        'multiMappedReads': float(logLines[4].split()[0]),
        'tagsPerPosition': float(tagInfoLines[5].strip().split('=')[1]),
    })

# building from records keeps the numeric columns as floats rather than the
# object dtype produced by transposing a list of mixed-type lists
mappingStats_frame = pd.DataFrame(_mappingRecords,
                                  columns=['sampleName',
                                           'totalReads',
                                           'unpairedReads',
                                           'unmappedReads',
                                           'uniquelyMappedReads',
                                           'multiMappedReads',
                                           'tagsPerPosition'])

# calculate fractions from read counts
mappingStats_frame['uniquelyMappedFraction'] = (
    mappingStats_frame['uniquelyMappedReads'] / mappingStats_frame['totalReads'])
mappingStats_frame['mappedFraction'] = (
    (mappingStats_frame['uniquelyMappedReads'] + mappingStats_frame['multiMappedReads'])
    / mappingStats_frame['totalReads'])



# Join the per-sample metadata with the mapping statistics. merge() defaults
# to an inner join, so samples without mapping statistics are dropped.
summary_frame = metadata_frame.merge(mappingStats_frame, on='sampleName')
# index rows by (strain, factor, simpleTreatment)
summary_frame.index = pd.MultiIndex.from_arrays([list(summary_frame['strain'].values),
                                                 list(summary_frame['factor'].values),
                                                 list(summary_frame['simpleTreatment'].values)])
# DataFrame.sort() no longer exists in pandas; sort_index() performs the same
# sort by index labels.
mapping_summary_frame = summary_frame.sort_index()

In [None]:
# Quality thresholds: a sample is kept only if its mapped fraction
# (uniquely + multi mapped reads over total reads) and its number of uniquely
# mapped reads both reach the thresholds below.
mappedFractionThreshold = 0.0
uniquelyMappedReadThreshold = 1000000

passesMappedFraction = mapping_summary_frame['mappedFraction'] >= mappedFractionThreshold
passesUniqueReads = mapping_summary_frame['uniquelyMappedReads'] >= uniquelyMappedReadThreshold
filtered_summary_frame = mapping_summary_frame[passesMappedFraction & passesUniqueReads]

# samples that failed at least one threshold
keptSampleNames = set(filtered_summary_frame['sampleName'].values)
discardedSampleNames = [name for name in summary_frame['sampleName'].values
                        if name not in keptSampleNames]
discarded_summary_frame = summary_frame[summary_frame['sampleName'].isin(discardedSampleNames)]

print("Number of Samples:", len(summary_frame))
print("Number of discarded samples:", len(discarded_summary_frame))
print("Number of Samples remaining after filtering:", len(filtered_summary_frame))

# Generate a simplified, lower-case name for every retained sample, used for
# naming output files: <strain>_<factor>_<simpleTreatment>_<date>_<n>, where
# n counts (from 1) the samples that share the same name root.

# Work on an explicit copy: filtered_summary_frame is the result of boolean
# filtering, and adding a column to such a slice triggers pandas'
# SettingWithCopyWarning.
filtered_summary_frame = filtered_summary_frame.copy()

factorTreatment_count_dict = {}  # {name root: number of samples seen so far}
simplifiedNames = []
simpleNameRoots = (filtered_summary_frame['strain']
                   + '_' + filtered_summary_frame['factor']
                   + '_' + filtered_summary_frame["simpleTreatment"]
                   + '_' + filtered_summary_frame['date'])
for simpleNameRoot in simpleNameRoots.values:
    occurrence = factorTreatment_count_dict.get(simpleNameRoot, 0) + 1
    factorTreatment_count_dict[simpleNameRoot] = occurrence
    simplifiedNames.append((simpleNameRoot + '_' + str(occurrence)).lower())

filtered_summary_frame["simplifiedName"] = simplifiedNames

# lookup tables between the original tag directory name and the simplified name
originalName_simpleName_dict = dict(zip(filtered_summary_frame['sampleName'].values,
                                        filtered_summary_frame['simplifiedName'].values))
simpleName_originalName_dict = dict(zip(filtered_summary_frame['simplifiedName'].values,
                                        filtered_summary_frame['sampleName'].values))

## Peak Calling

In [None]:
# %%capture 
# suppress output - this can be saved to a variable (like a log file)

### call peaks ###
# Write ./peakCalling_homer.sh containing one `findPeaks` command per sample
# that passed filtering. Each command is launched in the background ('&') and
# redirects its output to <peakDirectory>/<simplifiedName>_default_peaks.tsv.
if not os.path.exists(peakDirectory):
    os.makedirs(peakDirectory)

# Input control used for each treatment; the keyword is matched against the
# lower-cased treatment token, in this order.
inputDirByTreatment = [
    ('veh', '/home/jtao/analysis/ap1_analysis/input_data/C57Bl6_Thiomac_ChIP_Input_Veh_GJF_15-03-20'),
    ('kla', '/home/jtao/analysis/ap1_analysis/input_data/C57Bl6_Thiomac_ChIP_Input_KLA-1h_GJF_15-03-20'),
]

peakCallingCommands = []
for tagDir in filtered_summary_frame['sampleName'].values:
    # the treatment is the fifth underscore-delimited token of the name
    treatment = tagDir.split("_")[4].lower()

    inputDir = None
    for keyword, candidateDir in inputDirByTreatment:
        if keyword in treatment:
            inputDir = candidateDir
            break
    if inputDir is None:
        # Fail loudly: otherwise the input directory of the previous sample
        # would be silently reused (or a NameError raised on the first sample).
        raise ValueError('no input control defined for treatment "%s" (sample %s)'
                         % (treatment, tagDir))

    peakFileName = originalName_simpleName_dict[tagDir] + "_default_peaks.tsv"
    peakCallingCommands.append('findPeaks ' + tagDirPath + '/' + tagDir +
                               ' -i ' + inputDir
                               + ' -style factor -size 200 -norm 1e6 > ' +
                               peakDirectory + '/' + peakFileName + ' &\n')

# the context manager guarantees the script is flushed and closed
with open('./peakCalling_homer.sh', 'w') as scriptFile:
    scriptFile.writelines(peakCallingCommands)
    # every job above is backgrounded; without `wait` the script would return
    # before the peak files are complete
    scriptFile.write('wait\n')

In [None]:
%%bash
# Remove peak files from a previous run, then execute the generated
# peak-calling script.
# -f: on a first run ./peak_files is empty and the glob matches nothing,
# which makes a plain `rm` report an error.
rm -f ./peak_files/*
chmod a+x ./*sh
bash ./peakCalling_homer.sh

## Copy IDR Peak files

In [None]:
%%bash
# Stage the c57bl6 peak files from the cobinding analysis in
# ./idr_peak_files, drop the p65 and cebp files, and mark the remaining files
# as IDR peaks by renaming *_peaks.tsv to *_idr_peaks.tsv.

# start from an empty staging directory
mkdir -p ./idr_peak_files
rm -f ./idr_peak_files/*

cp /gpfs/data01/glasslab/home/jtao/analysis/cobinding_motif_analysis/peak_files/c57bl6* ./idr_peak_files/

# -f: do not report an error when no p65 / cebp files were copied
rm -f ./idr_peak_files/*p65* ./idr_peak_files/*cebp*

for i in ./idr_peak_files/*; do
    target=${i/_peaks.tsv/_idr_peaks.tsv}
    # skip names without the _peaks.tsv suffix (mv onto itself would fail)
    if [ "$i" != "$target" ]; then
        mv "$i" "$target"
    fi
done

## Peak Filtering

In [None]:
# Filter the called peaks and write them under shortened names:
#   * keep only peaks on chr1-chr19 and chrX
#   * keep only peaks whose 'Normalized Tag Count' reaches minNormalizedTagCount
#   * name the output <strain>_<factor>_<treatment>_rep<n>_peaks.tsv
# NOTE(review): this cell was previously described as removing peaks with
# fewer than 16 normalized tags, but the threshold in the code is 1 - confirm
# which value is intended.
minNormalizedTagCount = 1
# number of leading lines skipped before the column header line
# NOTE(review): assumed to be the length of the findPeaks header - confirm.
peakFileHeaderRows = 39

# same order as a plain lexicographic sort: chr1, chr10, ..., chr19, chr2, ..., chr9, chrX
chroms = sorted(['chr' + str(n) for n in range(1, 20)] + ['chrX'])

seen_conditions = set()
replicate_counts = {}  # {condition: number of files seen so far}
peakDirectory = './peak_files/'
filtered_peak_directory = './filtered_peak_files/'
os.makedirs(filtered_peak_directory, exist_ok=True)

# sorted() makes the replicate numbering reproducible; os.listdir returns
# entries in arbitrary order
for f in sorted(os.listdir(peakDirectory)):
    print(f)
    tokens = f.split('_')
    condition = tokens[0] + '_' + tokens[1] + '_' + tokens[2]

    # number replicates 1, 2, 3, ... so that a third or later file of the same
    # condition no longer overwrites the rep2 output
    replicate_counts[condition] = replicate_counts.get(condition, 0) + 1
    seen_conditions.add(condition)
    new_name = (filtered_peak_directory + '/' + condition
                + '_rep' + str(replicate_counts[condition]) + '_peaks.tsv')

    current_frame = pd.read_csv(peakDirectory + '/' + f, sep='\t',
                                skiprows=peakFileHeaderRows)
    filtered_frame = current_frame[(current_frame['chr'].isin(chroms)) &
                                   (current_frame['Normalized Tag Count'] >= minNormalizedTagCount)]
    filtered_frame.to_csv(new_name, sep='\t', index=False)

In [None]:
# Restrict every IDR peak file to peaks on chr1-chr19 and chrX and write the
# result as <strain>_<factor>_<treatment>_idr_peaks.tsv.
# No tag-count threshold is applied to the IDR peaks.

# lexicographic order: chr1, chr10, ..., chr19, chr2, ..., chr9, chrX
chroms = sorted(['chr%d' % n for n in range(1, 20)] + ['chrX'])
seen_conditions = set()

peakDirectory = './idr_peak_files/'
filtered_peak_directory = './filtered_idr_peak_files/'
if not os.path.isdir(filtered_peak_directory):
    os.mkdir(filtered_peak_directory)

for f in os.listdir(peakDirectory):
    print(f)
    tokens = f.split('_')
    # the condition is made of the first three underscore-delimited tokens
    condition = '_'.join([tokens[0], tokens[1], tokens[2]])
    new_name = '{}/{}_idr_peaks.tsv'.format(filtered_peak_directory, condition)

    current_frame = pd.read_csv('{}/{}'.format(peakDirectory, f), sep='\t')
    on_listed_chromosome = current_frame['chr'].isin(chroms)
    filtered_frame = current_frame[on_listed_chromosome]
    filtered_frame.to_csv(new_name, sep='\t', index=False)

## Merge Peaks

## Convert to BED Files

In [None]:
%%bash
# Convert every filtered peak file (individual replicates and IDR peaks) to a
# BED file in ./bed_files; *_peaks.tsv becomes *.bed.

# start from an empty output directory
mkdir -p ./bed_files
rm -f ./bed_files/*

for i in ./filtered_peak_files/*tsv ./filtered_idr_peak_files/*tsv; do
    echo "$i"
    outpath=./bed_files/${i##*/}
    outpath=${outpath/_peaks.tsv/.bed}
    echo "$outpath"
    # drop the first line emitted by pos2bed.pl; piping avoids the shared
    # ./tmp scratch file, which would clobber any existing file of that name
    pos2bed.pl "$i" | tail -n +2 > "$outpath"
done



## Extract Sequences

In [None]:
%%bash
# Write one FASTA file per BED file: ./bed_files/<name>.bed is passed to
# extract_sequences.py (genome mm10) and written to ./fasta_files/<name>.fasta.
# Existing FASTA files are removed first.
if [ -d ./fasta_files ]; then rm ./fasta_files/*; else mkdir ./fasta_files/; fi

for bed in ./bed_files/*bed; do
    echo $bed
    fasta=./fasta_files/${bed##*/}
    /gpfs/data01/glasslab/home/jtao/code/tba/extract_sequences.py $bed mm10 ${fasta/.bed/.fasta}
done

## Construct Background

In [None]:
%%bash
# Write ./make_background.sh: for every BED file that does not yet have a
# background FASTA, emit the commands that generate the background and rename
# the generic output files to <name>_background.bed / .fasta. The emitted
# commands are also echoed to stdout.
script_path="./make_background.sh"

# Always start from an empty script. This replaces `rm`, which reports an
# error when the script does not exist, and guarantees the script exists even
# when no background needs to be generated.
: > "$script_path"
mkdir -p ./background/

for i in ./bed_files/*bed; do
    factor=${i##*/}
    factor=${factor%.bed}

    fasta_path="./background/${factor}_background.fasta"
    bed_path="./background/${factor}_background.bed"
    # backgrounds that already exist are reused, not regenerated
    if [ ! -f "$fasta_path" ]; then
        {
            echo "/home/jtao/code/tba/generate_background_coordinates.py $i ./background/ -genome mm10"
            echo "mv ./background/background.bed $bed_path"
            echo "mv ./background/background.fasta $fasta_path"
        } | tee -a "$script_path"
    fi
done

In [None]:
%%bash
# Run the generated background script.
# ./background/ is intentionally NOT emptied here: make_background.sh only
# contains commands for backgrounds that were missing when it was written, so
# deleting the directory first would remove the existing backgrounds without
# regenerating them.
chmod a+x ./*sh
bash ./make_background.sh

## Create Features

In [95]:
%%bash
# Write ./calculate_features.sh: for every file in the positive sequence
# directory, emit a create_features.py call against the matching
# <name>_background.fasta, followed by two mv commands that prefix the generic
# output files (labels.txt, standardized_features.tsv) with <name>.
positive_seq_dir="./fasta_files/"
negative_seq_dir="./background/"
out_dir="./tba_output/"
script_path='./calculate_features.sh'
motif_dir='./jaspar_2016_curated_homerFormat/'

# begin with a fresh, empty script and make sure the output directory exists
rm -f $script_path
touch $script_path
[ -d $out_dir ] || mkdir $out_dir

for positive_seq_path in $positive_seq_dir/*; do
    # file name without directory and without the .fasta suffix
    factor=${positive_seq_path##./*/}
    factor=${factor%.fasta}
    negative_seq_path=${negative_seq_dir}/${factor}_background.fasta

    {
        echo "python /home/jtao/code/tba/create_features.py $positive_seq_path $negative_seq_path $out_dir ${motif_dir}/* -num_proc 28"
        echo "mv $out_dir/labels.txt $out_dir/${factor}_labels.txt"
        echo "mv $out_dir/standardized_features.tsv $out_dir/${factor}_standardized_features.tsv"
    } >> $script_path
done

## Train Models

## Compare Models