In [2]:
import os
import pandas as pd
import multiprocessing
import parmap
import datetime
from pathonoia import pathonoia as pa

In [30]:
# Root output folder. It is created on demand, with one subfolder per dataset
# and, inside that, one subfolder per sample.
OUTPUT_DIR = "./OUTPUT/"

# The input folder should contain one or more FASTA files (.fa / .fasta).
# All files in this folder are treated as samples of the same dataset.
# Each FASTA file should contain all reads that remained unmapped after
# aligning the original sample to the host's transcriptome.
# The unmapped reads can be extracted from the alignment BAM file with SAMtools:

# samtools view -hb -f 4 host_align.bam > host_unmapped.bam
# samtools view host_unmapped.bam | awk '{OFS="\t"; print ">"$1"\n"$10}' - > host_unmapped.fa

inputfolder = "./dataset/"

# Make sure Kraken 2 is installed on your system, along with an index that
# contains the organisms of your choice and was built with k=31.
# krakenTOOL must be the path of the kraken2 executable itself and krakenDB the
# path of the database directory: checkKrakenInstall() tests them with
# os.path.isfile() and os.path.isdir() respectively, e.g.
# krakenTOOL = "path/to/kraken2"
# krakenDB   = "/path/to/Kraken2/db/kraken2_k31"
# NOTE(review): the setup commands at the end of this notebook build
# db/bacvir_k31, while krakenDB below points to db/bacvir_k32 -- confirm which
# database name is intended.
krakenTOOL="../tools/kraken2/kraken2"
krakenDB="../tools/kraken2/db/bacvir_k32"


In [5]:
samToolsCmdExtractUnaligned("prefix")

'samtools view -hb -f 4 prefix_staralign.bam > prefix_humanUnmapped.bam'

In [31]:
def checkKrakenInstall():
    """Tell whether the configured Kraken 2 executable and database exist.

    Returns:
        True if krakenTOOL is an existing file and krakenDB is an existing
        directory, False otherwise.

    Note: this function is defined a second time in the definitions cell
    further down; whichever cell ran last is the one in effect.
    """
    if not os.path.isfile(krakenTOOL):
        return False
    return os.path.isdir(krakenDB)
    
checkKrakenInstall()

False

In [32]:
processDataset(inputfolder)

./dataset/
dataset
5 samples to process
Problem with Kraken Installation. ABORT


()

In [29]:
# run this cell, these are just definitions / functions

samTOOLs = "samtools "

def processDataset(inputfolder):
    """Run the pipeline on every FASTA file of one dataset folder.

    The dataset name is the last component of ``inputfolder``. Every regular
    file ending in ``.fa`` or ``.fasta`` is one sample; each sample is passed
    to ``runOneSample`` and the per-sample results are merged with
    ``mergeNucTaxDataframes`` and written to
    ``<OUTPUT_DIR>/<dataset>/<dataset>_contamination.csv``.

    Args:
        inputfolder: path of the dataset folder, with or without a trailing
            path separator.

    Returns:
        The merged DataFrame (one column per sample, keyed by the sample's
        input file path), or an empty tuple ``()`` if the run was aborted
        because Kraken 2 is not installed or no FASTA file was found.
    """
    print(inputfolder)
    # normpath drops a trailing separator, so "./dataset/", "./dataset" and
    # "dataset/" all yield the dataset name "dataset".
    ds = os.path.basename(os.path.normpath(inputfolder))
    print(ds)

    fastaSuffixes = (".fasta", ".fa")
    argumentList = []
    # Sorted so that sample order (and thus column order) is reproducible.
    for file in sorted(os.listdir(inputfolder)):
        path = os.path.join(inputfolder, file)
        if not os.path.isfile(path):
            print(file + " is not a file! PASS")
            continue
        suffix = next((s for s in fastaSuffixes if file.endswith(s)), None)
        if suffix is None:
            print(file + " is not a fasta file, PASS")
            continue
        # Strip only the trailing extension; ".fa" elsewhere in the name
        # (e.g. "family.fa") must stay intact.
        sample = file[:-len(suffix)]
        odir = os.path.join(OUTPUT_DIR, ds, sample)
        argumentList.append((ds, odir, sample, path))

    print(str(len(argumentList)) + " samples to process")

    if not checkKrakenInstall():
        print("Problem with Kraken Installation. ABORT")
        return ()

    if not argumentList:
        # Nothing to merge; stop here instead of failing inside the merge step.
        print("No fasta files found in " + inputfolder + ". ABORT")
        return ()

    allNucPerTaxList = parmap.starmap(runOneSample, argumentList, pm_processes=1, pm_pbar=True)

    print("merge and save results")
    # Key every result by the input file path of its sample (tuple item 3).
    allNucPerTax = {args[3]: result for args, result in zip(argumentList, allNucPerTaxList)}

    dsdf = mergeNucTaxDataframes(allNucPerTax)
    datasetDir = os.path.join(OUTPUT_DIR, ds)
    os.makedirs(datasetDir, exist_ok=True)
    dsdf.to_csv(os.path.join(datasetDir, ds + "_contamination.csv"))
    print(ds + " done")
    return dsdf

def runOneSample(ds, outputdir, samplename, sample1, nucCutOff=100):
    """Classify one sample with Kraken 2 and evaluate the result.

    Creates ``outputdir`` if needed, appends to a per-day log file
    ``<outputdir>/<samplename>_pathonoiaLOG_<YYYYMMDD>.log``, runs Kraken 2
    unless ``<outputdir>/<samplename>_krakenalign.txt`` already exists, and
    then calls ``pa.evalKrakenAlign``.

    Args:
        ds: dataset name (currently unused by this function).
        outputdir: folder receiving all output files of this sample.
        samplename: sample name, used as file-name prefix inside outputdir.
        sample1: path of the sample's FASTA file.
        nucCutOff: cut-off value handed to ``pa.evalKrakenAlign``
            (default 100, the value that was previously hard-coded).

    Returns:
        Whatever ``pa.evalKrakenAlign`` returns; ``mergeNucTaxDataframes``
        later treats it as a DataFrame with the columns 'species_name',
        'phylo_level' and 'parent' plus one value column.
    """
    print(outputdir)
    if os.path.isdir(outputdir):
        print(outputdir + " ALREADY EXISTS")
    else:
        # exist_ok guards against another worker creating it in between.
        os.makedirs(outputdir, exist_ok=True)
    outdir_prefix = outputdir + "/" + samplename
    logname = (outdir_prefix + "_pathonoiaLOG_"
               + datetime.datetime.now().strftime("%Y%m%d") + ".log")

    # The with-block closes the log even if Kraken or the evaluation raises.
    with open(logname, "a") as logfile:
        print("++++++++++++++", file=logfile)
        print(samplename + "\n", file=logfile)

        print("Kraken2", file=logfile)
        if os.path.isfile(outdir_prefix + "_krakenalign.txt"):
            print("Kraken results already available, skip", file=logfile)
        else:
            execute(KRAKENcommand(outdir_prefix, sample1), logfile)

        print("evaluate", file=logfile)
        print("nucCutOff = " + str(nucCutOff), file=logfile)
        nucPerTax = pa.evalKrakenAlign(outdir_prefix, sample1, nucCutOff, logfile)
        print("DONE", file=logfile)

    print(samplename + " done")
    return nucPerTax

def execute(cmd, logfile):
    """Run a shell command and record it, with its exit status, in the log.

    Args:
        cmd: command line, run through the system shell via ``os.system``
            (pipes and redirections inside ``cmd`` are therefore honoured).
        logfile: writable text stream that receives the command and its status.

    Returns:
        The value returned by ``os.system(cmd)`` (0 on success), so callers
        can detect a failed command. Earlier versions returned None.
    """
    # Flush first so the command is visible in the log while it is running.
    print(cmd, file=logfile, flush=True)
    answer = os.system(cmd)
    print(answer, file=logfile)
    return answer

def samToolsCmdExtractUnaligned(outdir_prefix):
    """Build the samtools command that extracts the unaligned reads.

    Reads ``<outdir_prefix>_staralign.bam`` and redirects the output of
    ``samtools view -hb -f 4`` to ``<outdir_prefix>_humanUnmapped.bam``.
    The command is only returned as a string, not executed.
    """
    alignedBam = outdir_prefix + "_staralign.bam"
    unmappedBam = outdir_prefix + "_humanUnmapped.bam"
    return samTOOLs + "view -hb -f 4 " + alignedBam + " > " + unmappedBam
    
def samToolsCmdUnalignedToFA(outdir_prefix):
    """Build the command that converts the unmapped-reads BAM file to FASTA.

    Pipes ``samtools view <outdir_prefix>_humanUnmapped.bam`` through awk,
    which prints ">" + field 1 and then field 10 for every record, and
    redirects the result to ``<outdir_prefix>_humanUnmapped.fa``.
    The command is only returned as a string, not executed.
    """
    unmappedBam = outdir_prefix + "_humanUnmapped.bam"
    fastaFile = outdir_prefix + "_humanUnmapped.fa"
    # awk program: tab as output separator, one header line and one sequence line per read
    awkFilter = " | awk '{OFS=\"\\t\"; print \">\"$1\"\\n\"$10}'"
    return samTOOLs + "view " + unmappedBam + awkFilter + " - > " + fastaFile

def KRAKENcommand(outdir_prefix, sample, threads=8):
    """Build the Kraken 2 command line for one sample.

    Args:
        outdir_prefix: prefix of the output files; Kraken writes
            ``<outdir_prefix>_krakenalign.txt`` (--output) and
            ``<outdir_prefix>_krakenreport.txt`` (--report).
        sample: path of the FASTA file to classify.
        threads: value for --threads (default 8, the previous fixed value).

    Returns:
        The command as a single string; it is not executed here.
    """
    parts = [krakenTOOL]
    # The configured database was checked by checkKrakenInstall() but never
    # handed to kraken2. Pass it explicitly, unless krakenTOOL already
    # carries its own --db option.
    if "--db" not in krakenTOOL:
        parts.append("--db " + krakenDB)
    parts.extend([
        "--use-names",
        "--threads " + str(threads),
        "--output " + outdir_prefix + "_krakenalign.txt",
        "--report " + outdir_prefix + "_krakenreport.txt",
        sample,
    ])
    # Joining with single spaces keeps the argument separation correct.
    return " ".join(parts)

def checkKrakenInstall():
    """Check that the Kraken 2 paths from the configuration cell exist.

    Returns:
        True if krakenTOOL is an existing file and krakenDB is an existing
        directory, False otherwise.
    """
    return os.path.isfile(krakenTOOL) and os.path.isdir(krakenDB)
        
def mergeNucTaxDataframes(allNucPerTax):
    """Merge the per-sample result tables into one table for the dataset.

    Args:
        allNucPerTax: dict mapping a sample identifier to a DataFrame that has
            the columns 'species_name', 'phylo_level' and 'parent' plus
            exactly one value column. The input frames are not modified.

    Returns:
        A DataFrame with the three info columns followed by one column per
        sample (named after the dict key), rows aligned on the index and
        ordered by descending row sum over all samples.

    Raises:
        ValueError: if ``allNucPerTax`` is empty.
    """
    infocols = ['species_name', 'phylo_level', 'parent']

    if not allNucPerTax:
        raise ValueError("mergeNucTaxDataframes needs at least one sample")

    # Taxon annotation, collected from all samples and de-duplicated.
    taxInfoDF = pd.concat([df[infocols] for df in allNucPerTax.values()])
    taxInfoDF = taxInfoDF.drop_duplicates()

    # One single-column frame per sample. drop() returns a new frame, so the
    # caller's data stays untouched and the merge can be repeated.
    perSample = []
    for sample, df in allNucPerTax.items():
        counts = df.drop(columns=infocols)
        counts.columns = [sample]
        perSample.append(counts)
    countsDF = pd.concat(perSample, axis=1, sort=False)

    # Temporary helper column, used only for ordering the rows.
    countsDF["rowSum"] = countsDF.sum(axis=1)
    mergedDF = pd.concat([taxInfoDF, countsDF], axis=1, sort=False)
    mergedDF = mergedDF.sort_values(by="rowSum", ascending=False)
    return mergedDF.drop(columns="rowSum")
 

In [17]:
ds_contamination = processDataset(inputfolder)

./dataset/
dataset
5 samples to process
./OUTPUT/dataset/buccal_humanUnmapped
./OUTPUT/dataset/hous1_humanUnmapped
./OUTPUT/dataset/soil_humanUnmapped


FileNotFoundError: [Errno 2] No such file or directory: './OUTPUT/dataset/buccal_humanUnmapped/buccal_humanUnmapped_krakenalign.txt'

In [None]:

ds_contamination.loc[ds_contamination.phylo_level.str.contains("S", na=False),]

In [52]:
# Scratch cell: add the nucleotide counts of higher-level ancestors to each
# species-level taxon.
# NOTE(review): taxMap and taxInfo are not defined in the visible part of this
# notebook; this cell only runs if they already exist in the kernel -- confirm
# where they come from.
nucCutOff = 100
# One row per key of taxMap, with its value in the column 'nucleotides'.
df = pd.DataFrame.from_dict(taxMap, orient='index', columns=['nucleotides'])
df.sort_values(by = "nucleotides", ascending=False, inplace=True)
# Attach the columns of taxInfo (joined on the index).
df_all = df.join(taxInfo)
# Keep only rows whose phylo_level contains "S" (rows without a level are dropped) ...
df = df_all.loc[df_all.phylo_level.str.contains("S", na=False),]
# ... and whose count is above the cut-off.
df = df.loc[df["nucleotides"]>nucCutOff,]

for taxId in df.index:
    species_nucs = df.loc[taxId,"nucleotides"]
    parent = df.loc[taxId,"parent"]
    level = taxInfo.loc[parent,"phylo_level"]
    # Walk up the parent chain for as long as the ancestor's level code contains
    # "S", "G" or "F" (presumably species/genus/family ranks including
    # sub-ranks such as "S1" -- confirm), adding the ancestor's own count
    # whenever the ancestor has one in df_all.
    while("S" in level or "G" in level or "F" in level):
        if(parent in df_all.index):
            species_nucs = species_nucs + df_all.loc[parent,"nucleotides"]
        parent = taxInfo.loc[parent,"parent"]
        level = taxInfo.loc[parent,"phylo_level"]
    # Store the accumulated total back on the species row.
    df.loc[taxId,"nucleotides"] = species_nucs

In [53]:
df

Unnamed: 0,nucleotides,species_name,phylo_level,parent
1969841,57530,Proteus phage VB_PmiS-Isfahan,S,2315205
1747,47554,Cutibacterium acnes,S,1912216
305,26988,Ralstonia solanacearum,S,48736
527028,22983,Bacillus thuringiensis serovar pulsiensis BGSC...,S2,180877
166122,19045,Human endogenous retrovirus K113,S1,45617
...,...,...,...,...
68895,2014,Cupriavidus basilensis,S,106589
72000,389,Kocuria rhizophila,S,57493
69395,1240,Caulobacter henricii,S,75
1017,2093,Capnocytophaga gingivalis,S,1016


In [None]:
# Set up Kraken 2 for Pathonoia.
# These are shell commands: run them in a terminal (or a shell cell), not in
# the Python kernel.
#
# Get the Kraken 2 sources and install the tools into the cloned folder itself.
git clone https://github.com/DerrickWood/kraken2.git
cd kraken2/
./install_kraken2.sh .
mkdir db
# Download the taxonomy and the bacterial and viral libraries into db/bacvir_k31.
./kraken2-build --download-taxonomy --db db/bacvir_k31 #?--skip-maps
./kraken2-build --download-library bacteria --db db/bacvir_k31 --use-ftp
./kraken2-build --download-library viral --db db/bacvir_k31 --use-ftp

# Build the database with k-mer length 31 and minimizer length 31, using 8 threads.
./kraken2-build --build --db db/bacvir_k31 --threads 8 --kmer-len 31 --minimizer-len 31

# Run the --clean step of kraken2-build on the finished database.
./kraken2-build --clean --db db/bacvir_k31