In [18]:
import os
from os import listdir

import numpy as np
import pandas as pd
import pysam


In [None]:
%%bash
# Generate bam indexes: one `samtools index` job per *.bam file, run through GNU parallel.
# The %%bash cell magic has to be the first line of the cell, so this comment sits below it.
# Stop if the folder is missing instead of indexing whatever *.bam files are in the current directory.
cd 4.q20Files.bacteria/ || exit 1
parallel samtools index ::: *.bam

In [114]:
def getReadsMatchingContaminats(contaminants,sample,folder="4.q20Files.bacteria/"):
    """Collect the names of reads aligned to contaminant references.

    Every file in `folder` whose name contains `sample` and ends in ".bam"
    is opened with pysam. For each reference name present both in
    `contaminants` and in the file's `references`, the `qname` of every read
    returned by `fetch(reference)` is collected.

    Parameters
    ----------
    contaminants : iterable
        Reference names considered contaminants.
    sample : str
        Text that must occur in the BAM file name.
        NOTE(review): this is a substring match, so "Ab1" would also select
        "Ab10.bam" -- confirm the file naming scheme rules that out.
    folder : str, optional
        Directory holding the BAM files (and their indexes). Defaults to the
        location used by the rest of the notebook.

    Returns
    -------
    list
        Read names in the order they were fetched; no de-duplication is done.
        Files and references are visited in sorted order so the result is
        reproducible between runs.
    """
    readsContaminants = []
    wanted = set(contaminants)  # built once instead of once per BAM file
    for bam in sorted(listdir(folder)):
        # ".bam" (with the dot) mirrors the `*.bam` glob used when indexing
        if sample in bam and bam.endswith(".bam"):
            # the context manager closes the file even when fetch() raises
            with pysam.AlignmentFile(os.path.join(folder, bam), "rb") as bamFile:
                for reference in sorted(wanted & set(bamFile.references)):
                    for read in bamFile.fetch(reference):
                        readsContaminants.append(read.qname)
    return readsContaminants

In [115]:
def parseRegionsMatrix(prophages):
    """Turn a reference-by-region matrix into a dict of flagged regions.

    Each row of `prophages` is one reference (row label) and each column one
    region (column label). A region is kept for a reference when its cell is
    greater than zero. References without any such cell are left out.

    Returns a dict mapping row label -> list of column labels.
    """
    regionsByReference = {}
    for reference, counts in prophages.iterrows():
        positive = (counts > 0).to_numpy()
        if positive.any():
            regionsByReference[reference] = list(counts.index[positive])
    return regionsByReference

In [116]:
def getReadsMatchingProphagesRegions(sample,prophagesDict,folder="4.q20Files.bacteria/",regionSize=100000):
    """Collect the names of reads aligned inside prophage regions.

    Every file in `folder` whose name contains `sample` and ends in ".bam"
    is opened with pysam. For each reference that is both a key of
    `prophagesDict` and listed in the file's `references`, each region label
    is converted to the window
    ``[int(region)*regionSize - regionSize, int(region)*regionSize)`` and the
    `qname` of every read returned by `fetch` for that window is collected.

    Parameters
    ----------
    sample : str
        Text that must occur in the BAM file name.
        NOTE(review): this is a substring match, so "Ab1" would also select
        "Ab10.bam" -- confirm the file naming scheme rules that out.
    prophagesDict : dict
        Maps reference name -> list of region labels convertible with int().
        NOTE(review): the window arithmetic only yields a non-negative start
        for labels >= 1, i.e. it presumes 1-based region numbers -- confirm
        against the file that produces the matrix.
    folder : str, optional
        Directory holding the BAM files (and their indexes). Defaults to the
        location used by the rest of the notebook.
    regionSize : int, optional
        Width of one region in bases; defaults to the original 100000.

    Returns
    -------
    list
        Read names in the order they were fetched; no de-duplication is done,
        so a read returned for two windows appears twice. Files and references
        are visited in sorted order so the result is reproducible.
    """
    readsProphages = []
    wanted = set(prophagesDict.keys())  # built once instead of once per BAM file
    for bam in sorted(listdir(folder)):
        # ".bam" (with the dot) mirrors the `*.bam` glob used when indexing
        if sample in bam and bam.endswith(".bam"):
            # the context manager closes the file even when fetch() raises
            with pysam.AlignmentFile(os.path.join(folder, bam), "rb") as bamFile:
                for reference in sorted(wanted & set(bamFile.references)):
                    for region in prophagesDict[reference]:
                        regionEnd = int(region)*regionSize
                        regionStart = regionEnd - regionSize
                        for read in bamFile.fetch(reference,start=regionStart,end=regionEnd):
                            readsProphages.append(read.qname)
    return readsProphages

In [118]:
# For every contaminant-name list found in the contaminants folder, write two
# read-name files into the current directory:
#   <sample>.reads.contaminants.csv  reads on contaminant references, minus prophage reads
#   <sample>.reads.prophages.csv     reads fetched from the prophage regions
folder = "8.contaminants.bacteria/"
for fileName in sorted(listdir(folder)):
    if "contaminants.names.csv" in fileName:
        # sample = text before the first "." once the word "contaminants" is removed
        sample = fileName.replace("contaminants","").split(".")[0]

        # first column of the header-less CSV holds the contaminant reference names
        contaminants = list(pd.read_csv(folder+fileName,header=None)[0])
        readsContaminants = getReadsMatchingContaminats(contaminants,sample)

        # tab-separated matrix: first row = column labels, first column = row labels
        prophages = pd.read_csv(folder+sample+"prophages.csv",header=0,index_col=0,sep='\t')
        prophagesDict = parseRegionsMatrix(prophages)
        readsProphages = getReadsMatchingProphagesRegions(sample,prophagesDict)

        # Reads that fall in prophage regions are not treated as contaminants.
        # sorted(): a bare set iterates in a hash-dependent order that changes
        # between kernel sessions, which made the output file non-reproducible.
        readsContaminants = np.array(sorted(set(readsContaminants) - set(readsProphages)))
        # written as collected: no de-duplication is applied to the prophage reads
        readsProphages = np.array(readsProphages)

        np.savetxt(sample+".reads.contaminants.csv",readsContaminants,delimiter='\n',fmt='%s')
        np.savetxt(sample+".reads.prophages.csv",readsProphages,delimiter='\n',fmt='%s')

In [119]:
%%bash
# Gather the per-sample read lists written to the current directory into one folder.
# -p: no error when the folder already exists, so the cell can be re-run.
mkdir -p 9.readsForDepletion.bacteria
# Trailing slash: mv fails instead of renaming a file if the target is not a directory.
mv *.reads.contaminants.csv 9.readsForDepletion.bacteria/
mv *.reads.prophages.csv 9.readsForDepletion.bacteria/

In [None]:
# Write Ab1.clean.bam from Ab1.bam using the read list in READ_LIST_FILE with FILTER=excludeReadList.
# The leading "!" hands the line to the shell; without it the cell is parsed as Python and fails.
# NOTE(review): the READ_LIST_FILE path does not match any file written by the cells above -- confirm.
!picard-tools FilterSamReads I=Ab1.bam O=Ab1.clean.bam READ_LIST_FILE=../2_readsCleaning/Ab1.reads.test.txt FILTER=excludeReadList