In [3]:
import subprocess
import os
import pandas as pd
import numpy as np
import pysam
import matplotlib.pyplot as plt
CTCFbam = '/home/kal/CTCF/ATAC_CTCF/data/GM12878/wgEncodeBroadHistoneGm12878CtcfStdAlnRep2.bam'
DHSbam = 'wgEncodeDnaseKarpas422rep1_ENCFF783ONG.bam'
cellline = 'Karpas'
dirpath = os.path.dirname(CTCFbam)

In [4]:
# process CTCF
subprocess.run(['samtools', 'index', CTCFbam])
print('Indexed BAM')

# get peak calls
narrowpeaks = os.path.join(dirpath, cellline + '_CTCF_narrowpeaks.bed')
subprocess.run(['runMACS2.sh', '-i', CTCFbam, '-o', narrowpeaks])
print('Called Peaks')

# get read counts
countedpeaks = os.path.join(dirpath, cellline + '_CTCF_countedpeaks.bed') 
f = open(countedpeaks, 'w')
subprocess.run(['samtools', 'bedcov', narrowpeaks, CTCFbam], stdout=f)
print('Counted Peaks')

# filter the reads
bed = pd.read_table(countedpeaks, header=None)
bed.columns = 'chr start end name score dot fold_change pvalue qvalue summit sumcounts'.split()
bed['counts'] = bed.sumcounts/(bed.end - bed.start)
finalbed =  bed.filter(['chr', 'start', 'end', 'score', 'counts'])
finalbed = finalbed.loc[(finalbed['counts']>10)]
finalbed.to_csv(os.path.join(dirpath, cellline + '_CTCF_peaks.bed'), sep='\t', index=False, header=False)
print('Wrote File')

# remove the noncounted file
#os.remove(narrowpeaks)

Indexed BAM


KeyboardInterrupt: 

In [None]:
# plot scores and readcounts
plt.hist(bed['end'] - bed['start'])
plt.title('Distribution of Peak Widths')
plt.xlabel('End - Start of Peak')
plt.show()

plt.hist(np.log(bed['counts']))
plt.title('Distribution of Read Counts')
plt.xlabel('log Number of read counts')
plt.show()

plt.hist(np.log(finalbed['counts']))
plt.title('Filtered Distribution of Read Counts')
plt.xlabel('log Number of read counts')
plt.show()

plt.hist(np.log(bed['counts']))
plt.title('Distribution of Scores')
plt.xlabel('log Score assigned by MACS2')
plt.show()

plt.hexbin(np.log(bed['score']), np.log(bed['counts']), bins='log')
plt.title('Distribution of Scores and Counts')
plt.xlabel('log Score assigned by MACS2')
plt.ylabel('log Number of read counts')
plt.show()

In [2]:
# process DHS
subprocess.run(['samtools', 'index', DHSbam])
print('Indexed BAM')

# get peak calls
narrowpeaks = os.path.join(dirpath, cellline + '_DHS_narrowpeaks.bed')
subprocess.run(['runMACS2.sh', '-i', DHSbam, '-o', narrowpeaks])
print('Called Peaks')

# get read counts
countedpeaks = os.path.join(dirpath, cellline + '_DHS_countedpeaks.bed') 
f = open(countedpeaks, 'w')
subprocess.run(['samtools', 'bedcov', narrowpeaks, DHSbam], stdout=f)
print('Counted Peaks')

# filter the reads
bed = pd.read_table(countedpeaks, header=None)
bed.columns = 'chr start end name score dot fold_change pvalue qvalue summit sumcounts'.split()
bed['counts'] = bed.sumcounts/(bed.end - bed.start)
finalbed =  bed.filter(['chr', 'start', 'end', 'score', 'counts'])
finalbed = finalbed.loc[(finalbed['counts']>10)]
finalbed.to_csv(os.path.join(dirpath, cellline + '_DHS_peaks.bed'), sep='\t', index=False, header=False)
print('Wrote File')

# remove the noncounted file
os.remove(narrowpeaks)

Indexed BAM
Called Peaks
Counted Peaks


EmptyDataError: No columns to parse from file

In [None]:
# plot scores and readcounts
plt.hist(bed['end'] - bed['start'])
plt.title('Distribution of Peak Widths')
plt.xlabel('End - Start of Peak')
plt.show()

plt.hist(np.log(bed['counts']))
plt.title('Distribution of Read Counts')
plt.xlabel('log Number of read counts')
plt.show()

plt.hist(np.log(finalbed['counts']))
plt.title('Filtered Distribution of Read Counts')
plt.xlabel('log Number of read counts')
plt.show()

plt.hist(np.log(bed['counts']))
plt.title('Distribution of Scores')
plt.xlabel('log Score assigned by MACS2')
plt.show()

plt.hexbin(np.log(bed['score']), np.log(bed['counts']), bins='log')
plt.title('Distribution of Scores and Counts')
plt.xlabel('log Score assigned by MACS2')
plt.ylabel('log Number of read counts')
plt.show()