In [1]:
import pandas as pd
import numpy as np
import pyBigWig
import glob

## Setup

Cell types of interest:
- CLAGL
- D1MSN
- D2MSN
- ITL6GL
- D1MSN + D2MSN
- NPGL
- PTGL
- PVGA
- SSTGA
- VIPGA

In [3]:
# Cell types whose peak files are collected below. 'D1MSN' and 'D2MSN' appear twice;
# get_beds_path stores its result in a dict keyed by cell type, so the repeats collapse
# into a single entry each.
# NOTE(review): the Setup list mentions a combined "D1MSN + D2MSN" group — the repeated
# names may be a placeholder for it; confirm whether a pooled group was intended.
cell_types = ['CLAGL', 'D1MSN', 'D2MSN', 'ITL6GL', 'D1MSN', 'D2MSN', 'NPGL', 'PTGL', 'PVGA', 'SSTGA', 'VIPGA']

In [4]:
# glob.glob returns matches in arbitrary, filesystem-dependent order. merge_peaks keeps
# the first row it sees for each peak name, so the file order influences the merged
# output; sorting makes repeated runs reproducible.
peaks = sorted(glob.glob('/oak/stanford/groups/akundaje/projects/aav/narrowpeaks/*'))

In [5]:
def get_beds_path(cell_types, peaks):
    """Group peak-file paths by cell type.

    A path is assigned to a cell type when the cell-type string occurs
    anywhere in the path (plain substring test), so a single path can be
    listed under several cell types. Repeated cell types collapse into one key.

    Parameters
    ----------
    cell_types : iterable of str
        Cell-type labels to look for.
    peaks : iterable of str
        Candidate file paths.

    Returns
    -------
    dict
        Maps each cell type to the list of matching paths, in the order they
        appear in ``peaks``.
    """
    return {
        cell_type: [path for path in peaks if cell_type in path]
        for cell_type in cell_types
    }

In [6]:
# Map every cell type to its peak files, then print how many files each one has.
beds = get_beds_path(cell_types, peaks)
print("Cell type       #")
for bed, bed_paths in beds.items():
    print(f"{bed}\t\t{len(bed_paths)}")

Cell type       #
CLAGL		3
D1MSN		5
D2MSN		4
ITL6GL		6
NPGL		5
PTGL		8
PVGA		7
SSTGA		10
VIPGA		4


## Merge Peak files

In [None]:
def merge_peaks(path_list):
    """Combine several 4-column, tab-separated peak files into one table.

    Each file is read without a header as ``chrom, start, end, i``. The
    tables are stacked and grouped on the ``i`` column, keeping the first
    entry per group, and the result is ordered by ``chrom`` then ``start``.

    Parameters
    ----------
    path_list : list of str
        Paths of the files to combine.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``i`` value; ``i`` is the leading column
        because it is the grouping key.
    """
    column_names = ["chrom", "start", "end", "i"]
    frames = [
        pd.read_csv(path, delimiter="\t", names=column_names)
        for path in path_list
    ]
    stacked = pd.concat(frames, axis=0)
    # as_index=False keeps the grouping key as an ordinary column.
    unique_peaks = stacked.groupby(by="i", as_index=False).first()
    return unique_peaks.sort_values(['chrom', 'start'])

In [None]:
# Write one merged 4-column BED file per cell type (no header, no index column).
for bed in beds:
    print(f"-------------Current peakfile: {bed}-------------")
    merged_peaks = merge_peaks(beds[bed])
    merged_bed_path = '/oak/stanford/groups/akundaje/projects/aav/merged_peaks/' + bed + '.bed'
    merged_peaks.to_csv(
        merged_bed_path,
        columns=["chrom", "start", "end", "i"],
        sep='\t',
        header=False,
        index=False,
    )

## Convert bed -> narrowPeak

In [None]:
def bed_to_narrowpeak(df, bw_path=None):
    """Extend a 4-column peak table to a 10-column narrowPeak-style table.

    Columns 4-7 are filled with the placeholder '.', column 8 receives the
    maximum bigWig signal found inside each peak (stored in the slot the
    notebook calls "qValue"), and column 9 receives half the peak width,
    i.e. the offset of the peak centre from its start.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns 0, 1, 2 hold chromosome, start and end.
        The frame is modified in place and also returned.
    bw_path : str, optional
        Path of the bigWig file to read the signal from. When omitted, the
        path is derived from the notebook-level variable ``peakfile`` exactly
        as before, so existing calls ``bed_to_narrowpeak(df)`` keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with columns 4-9 added.
    """
    if bw_path is None:
        # Backward-compatible fallback on the global `peakfile`.
        # NOTE(review): the first 43 characters are presumably the project
        # root directory — confirm if the data ever moves.
        stem = peakfile.split('/')[-1].split('.')[0]
        bw_path = peakfile[:43]+"bigwigs/"+stem+".bw"

    for i in range(4, 8):
        df[i] = '.'

    print(f"----------Doing {bw_path}--------")
    bw = pyBigWig.open(bw_path)
    try:
        qvalues = []
        for chrom, start, stop in zip(df[0], df[1], df[2]):
            signal = np.asarray(bw.values(chrom, int(start), int(stop)), dtype=float)
            # Positions without data can come back as NaN, and the built-in
            # max() gives order-dependent results when NaN is present, so
            # take a NaN-aware maximum. Intervals with no usable values
            # yield NaN instead of raising.
            has_data = signal.size > 0 and not np.isnan(signal).all()
            qvalues.append(float(np.nanmax(signal)) if has_data else float('nan'))
    finally:
        # Release the file handle even if a lookup fails part-way through.
        bw.close()

    df[8] = qvalues
    # Summit placed at the centre of the peak, expressed as an offset from start.
    df[9] = (df[2] - df[1])//2
    return df

In [None]:
# Convert every peak file of every cell type to the 10-column layout produced by
# bed_to_narrowpeak and write the result next to the other project outputs.
for bed in beds.keys():
    print(f"-------------Current cell type: {bed}-------------")
    # The loop variable must stay named `peakfile`: bed_to_narrowpeak reads it as a
    # notebook-level global to locate the matching bigWig file.
    for peakfile in beds[bed]:
        print(f"-------------Current peakfile: {peakfile}  ")
        # Integer column labels 0-3 so bed_to_narrowpeak can append columns 4-9.
        df = pd.read_csv(peakfile, delimiter = "\t", names = [0, 1, 2, 3])
        df = bed_to_narrowpeak(df) 
        # `o_dir` is the output *file* path: first 43 characters of the input path
        # (presumably the project root — TODO confirm) + output folder + file stem.
        # NOTE(review): the folder is spelled 'merged_narropeaks' here but
        # 'merged_narrowpeaks' in the later merge cell — confirm which is intended.
        o_dir = peakfile[:43]+'merged_narropeaks/'+peakfile.split('/')[-1].split('.')[0]+'.bed'
        df.to_csv(o_dir, sep = '\t', header = False, index = False)

## Merge narrowPeak files

In [8]:
def merge_peaks(path_list):
    """Combine several 10-column, tab-separated narrowPeak-style files.

    Each file is read without a header using the column names
    ``chrom, start, end, i, u1, u2, u3, u4, qvalue, summit``. The tables are
    stacked and grouped on ``i``, keeping the first entry per group, and the
    result is ordered by ``chrom`` then ``start``.

    Parameters
    ----------
    path_list : list of str
        Paths of the files to combine.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``i`` value; ``i`` is the leading column
        because it is the grouping key.
    """
    narrowpeak_columns = [
        "chrom", "start", "end", "i",
        "u1", "u2", "u3", "u4",
        "qvalue", "summit",
    ]
    tables = []
    for path in path_list:
        table = pd.read_csv(path, delimiter="\t", names=narrowpeak_columns)
        tables.append(table)
    combined = pd.concat(tables, axis=0)
    # as_index=False keeps the grouping key as an ordinary column.
    per_peak = combined.groupby(by="i", as_index=False).first()
    return per_peak.sort_values(['chrom', 'start'])

In [9]:
# Write one merged 10-column narrowPeak-style file per cell type (no header, no index).
for bed in beds:
    print(f"-------------Current peakfile: {bed}-------------")
    merged_peaks = merge_peaks(beds[bed])
    merged_narrowpeak_path = '/oak/stanford/groups/akundaje/projects/aav/merged_narrowpeaks/' + bed + '.bed'
    merged_peaks.to_csv(
        merged_narrowpeak_path,
        columns=["chrom", "start", "end", "i", "u1", "u2", "u3", "u4", "qvalue", "summit"],
        sep='\t',
        header=False,
        index=False,
    )

-------------Current peakfile: CLAGL-------------
-------------Current peakfile: D1MSN-------------
-------------Current peakfile: D2MSN-------------
-------------Current peakfile: ITL6GL-------------
-------------Current peakfile: NPGL-------------
-------------Current peakfile: PTGL-------------
-------------Current peakfile: PVGA-------------
-------------Current peakfile: SSTGA-------------
-------------Current peakfile: VIPGA-------------
