In [None]:
import pandas as pd
import multiprocessing
import parmap
import datetime
import os
from pathonoia import pathonoia

In [None]:
# Root output folder. It is created on demand, with one subfolder per dataset
# and, inside that, one subfolder per sample.
OUTPUT_DIR = "./OUTPUT/"

# The input folder should contain one or more FASTA files (.fa / .fasta).
# All files should be samples of the same dataset.
# If the dataset was originally host RNA-seq, align it to the host's transcriptome first and extract the unmapped reads:
# the FASTA files should contain all (host-)unmapped reads.
# The unmapped reads can be extracted from the alignment BAM file with SAMtools:

# samtools view -hb -f 4 host_align.bam > host_unmapped.bam
# samtools view host_unmapped.bam | awk '{OFS="\t"; print ">"$1"\n"$10}' - > host_unmapped.fa

inputfolder = "./dataset/"

# Make sure Kraken 2 is installed on your system, along with an index built with k=31 AND l=31.
# Instructions for creating this index can be found at the end of this notebook.
krakenTOOL="../tools/kraken2/kraken2"
krakenDB="../tools/kraken2/db/kraken2_k31"
# Sanity-check the configured Kraken 2 executable and database before any sample is processed.
pathonoia.checkKrakenInstall(krakenTOOL,krakenDB)

In [None]:
# Run this cell before the ones below: it only defines functions and does not process any data.

def processDataset(inputfolder, processes=1):
    """Run the pipeline on every FASTA file found directly inside one dataset folder.

    Each file ending in ".fa" or ".fasta" is treated as one sample and handed to
    runOneSample. The per-sample results are merged with
    pathonoia.mergeNucTaxDataframes and written to
    <OUTPUT_DIR>/<dataset>/<dataset>_contamination.csv.

    Parameters
    ----------
    inputfolder : str
        Folder containing the FASTA files of one dataset. A trailing slash is
        accepted but not required.
    processes : int, optional
        Number of worker processes passed to parmap (default 1, as before).
        Note that every Kraken 2 call may itself use several threads.

    Returns
    -------
    The merged table returned by pathonoia.mergeNucTaxDataframes, or an empty
    tuple when the Kraken installation check fails.
    """
    print(inputfolder)
    # Dataset name = last path component, whether or not a trailing slash is given
    # (the previous rsplit-based lookup returned "" for e.g. "dataset/").
    ds = os.path.basename(os.path.normpath(inputfolder))
    print(ds)

    argumentList = []
    # sorted(): os.listdir returns entries in arbitrary order; a fixed order
    # makes the sample order reproducible between runs.
    for file in sorted(os.listdir(inputfolder)):
        s1 = os.path.join(inputfolder, file)
        if not os.path.isfile(s1):
            print(file + " is not a file! PASS")
            continue
        # splitext removes only the trailing extension; the former chained
        # str.replace also deleted ".fa" from the middle of a file name.
        sample, extension = os.path.splitext(file)
        if extension not in (".fa", ".fasta"):
            print(file + " is not a fasta file, PASS")
            continue
        odir = os.path.join(OUTPUT_DIR, ds, sample)
        argumentList.append((ds, odir, sample, s1))

    print(str(len(argumentList)) + " samples to process")

    if not pathonoia.checkKrakenInstall(krakenTOOL, krakenDB):
        print("Problem with Kraken Installation. ABORT")
        # Kept as an empty tuple for backward compatibility with existing callers.
        return ()

    allNucPerTaxList = parmap.starmap(runOneSample, argumentList, pm_processes=processes, pm_pbar=True)

    print("merge and save results")
    # Key every per-sample result by the path of its input FASTA file.
    allNucPerTax = {
        arguments[3]: nucPerTax
        for arguments, nucPerTax in zip(argumentList, allNucPerTaxList)
    }

    dsdf = pathonoia.mergeNucTaxDataframes(allNucPerTax)
    # The dataset folder normally exists once a sample has run, but make sure of
    # it so that writing the summary cannot fail on a missing directory.
    dsOutputDir = os.path.join(OUTPUT_DIR, ds)
    os.makedirs(dsOutputDir, exist_ok=True)
    dsdf.to_csv(os.path.join(dsOutputDir, ds + "_contamination.csv"))
    print(ds + " done")
    return dsdf

def runOneSample(ds, outputdir, samplename, sample1, nucCutOff=100):
    """Classify one sample with Kraken 2 (unless already done) and evaluate the result.

    Progress is appended to <outputdir>/<samplename>_pathonoiaLOG_<YYYYMMDD>.log.
    Kraken 2 is skipped when <outputdir>/<samplename>_krakenalign.txt already
    exists, so an interrupted dataset run can be resumed.

    Parameters
    ----------
    ds : str
        Dataset name. Not used here; kept so the argument tuples built by
        processDataset can be passed through unchanged.
    outputdir : str
        Folder for this sample's output files; created if missing.
    samplename : str
        Prefix for all output files of this sample.
    sample1 : str
        Path to the sample's FASTA file.
    nucCutOff : int, optional
        Cut-off forwarded to pathonoia.evalKrakenAlign (default 100, the value
        that was previously hard-coded).

    Returns
    -------
    Whatever pathonoia.evalKrakenAlign returns for this sample.
    """
    print(outputdir)
    if os.path.isdir(outputdir):
        print(outputdir + " ALREADY EXISTS")
    else:
        # exist_ok: another worker may create the folder between the check and this call
        os.makedirs(outputdir, exist_ok=True)
    outdir_prefix = outputdir + "/" + samplename
    logpath = outdir_prefix + "_pathonoiaLOG_" + datetime.datetime.now().strftime("%Y%m%d") + ".log"

    # The context manager closes the log even when Kraken or the evaluation raises.
    with open(logpath, "a") as logfile:
        print("++++++++++++++", file = logfile)
        print(samplename + "\n", file = logfile)

        print("Kraken2", file = logfile)
        if os.path.isfile(outdir_prefix + "_krakenalign.txt"):
            print("Kraken results already available, skip", file = logfile)
        else:
            execute(KRAKENcommand(outdir_prefix, sample1), logfile)

        print("evaluate", file = logfile)
        print("nucCutOff = " + str(nucCutOff), file = logfile)
        nucPerTax = pathonoia.evalKrakenAlign(outdir_prefix, sample1, nucCutOff, logfile)
        print("DONE", file = logfile)

    print(samplename + " done")
    return nucPerTax

def execute(cmd, logfile):
    """Run a shell command and record both the command and its status in the log.

    Parameters
    ----------
    cmd : str
        Command line, passed to the shell via os.system.
    logfile : file-like
        Open text stream that receives the command and the status value.

    Returns
    -------
    int
        The value returned by os.system (0 when the command succeeded), so that
        callers can detect a failed run. Previously the status was only logged.
    """
    print(cmd, file = logfile)
    # Flush first so the command is on disk even if the external tool runs
    # for a long time or the kernel is interrupted.
    logfile.flush()
    answer = os.system(cmd)
    print(answer, file = logfile)
    return answer

def KRAKENcommand(outdir_prefix, sample, threads=8):
    """Build the Kraken 2 command line for one sample.

    Every path is shell-quoted, so folders or file names containing spaces or
    other shell metacharacters no longer break the command. Paths made only of
    ordinary characters produce exactly the same command string as before.

    Parameters
    ----------
    outdir_prefix : str
        Prefix for the output files; "_krakenalign.txt" and "_krakenreport.txt"
        are appended to it.
    sample : str
        Path to the FASTA file to classify.
    threads : int, optional
        Value for Kraken 2's --threads option (default 8, the value that was
        previously hard-coded).

    Returns
    -------
    str
        The complete command line, ready to be passed to a shell.
    """
    import shlex  # local import: only needed for building this command

    def quotePath(path):
        # Quoting stops the shell from expanding "~" and "$VAR", so expand them here first.
        return shlex.quote(os.path.expandvars(os.path.expanduser(path)))

    # If you change settings here, keep the separating spaces between the parts.
    krakenPARAMETERS = "--db " + quotePath(krakenDB) + " --use-names --threads " + str(threads) + " "
    outputPARAMETER = (
        "--output " + quotePath(outdir_prefix + "_krakenalign.txt")
        + " --report " + quotePath(outdir_prefix + "_krakenreport.txt") + " "
    )
    samplePARAMETER = quotePath(sample)
    cmd = quotePath(krakenTOOL) + " " + krakenPARAMETERS + outputPARAMETER + samplePARAMETER
    return cmd

In [None]:
# Run the whole pipeline on the configured input folder; the return value is shown as the cell output.
# NOTE(review): the next cell repeats this call and keeps the result in a variable, so one of the two looks redundant.
processDataset(inputfolder)

In [None]:
# TODO: document this cell; to be revisited at a later stage.
# Runs the pipeline on the configured input folder and keeps the return value for the cells below.

ds_contamination = processDataset(inputfolder)

In [None]:
# Show only the rows whose phylo_level contains "S"; rows without a phylo_level are left out (na=False).
ds_contamination.loc[
    ds_contamination["phylo_level"].str.contains("S", na=False)
]

In [None]:
# For every taxon whose phylo_level contains "S" and that has more than nucCutOff
# nucleotides, add the nucleotides assigned to its ancestors for as long as the
# ancestor's phylo_level contains "S", "G" or "F".
# NOTE(review): `taxMap` and `taxInfo` are not defined anywhere in the visible part of
# this notebook, so this cell fails on a fresh kernel — define or load them first.
# From their use here, taxMap is presumably {taxId: nucleotides} and taxInfo a table
# indexed by taxId with "parent" and "phylo_level" columns; verify before relying on it.
nucCutOff = 100
df_all = (
    pd.DataFrame.from_dict(taxMap, orient='index', columns=['nucleotides'])
    .sort_values(by="nucleotides", ascending=False)
    .join(taxInfo)
)
# .copy(): df is written to in the loop below; an explicit copy avoids pandas'
# SettingWithCopyWarning and makes clear that df_all keeps the original counts.
df = df_all.loc[df_all.phylo_level.str.contains("S", na=False),]
df = df.loc[df["nucleotides"] > nucCutOff,].copy()

for taxId in df.index:
    species_nucs = df.loc[taxId, "nucleotides"]
    parent = df.loc[taxId, "parent"]
    level = taxInfo.loc[parent, "phylo_level"]
    # Walk up the taxonomy until the first ancestor whose level has none of S / G / F.
    while ("S" in level or "G" in level or "F" in level):
        # Ancestors missing from df_all have no entry in taxMap and contribute nothing.
        if parent in df_all.index:
            species_nucs = species_nucs + df_all.loc[parent, "nucleotides"]
        parent = taxInfo.loc[parent, "parent"]
        level = taxInfo.loc[parent, "phylo_level"]
    df.loc[taxId, "nucleotides"] = species_nucs

In [None]:
# Display the table built in the previous cell: taxa whose phylo_level contains "S",
# with the nucleotide counts of their ancestors added in.
df

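# Set up Kraken 2 for Pathonoia
Run the following commands one by one to build a Kraken 2 index that works with Pathonoia (k-mer length 31 and minimizer length 31). Note that these commands create the database at `db/bacvir_k31`, while the configuration cell above sets `krakenDB` to `../tools/kraken2/db/kraken2_k31`; adjust one of the two so that they point to the same database.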

    git clone https://github.com/DerrickWood/kraken2.git
    cd kraken2/
    ./install_kraken2.sh .
    mkdir db
    ./kraken2-build --download-taxonomy --db db/bacvir_k31 #?--skip-maps
    ./kraken2-build --download-library bacteria --db db/bacvir_k31 --use-ftp
    ./kraken2-build --download-library viral --db db/bacvir_k31 --use-ftp

    ./kraken2-build --build --db db/bacvir_k31 --threads 8 --kmer-len 31 --minimizer-len 31

    ./kraken2-build --clean --db db/bacvir_k31