In [40]:
import pandas as pd
import matplotlib as mp
import numpy as np
import os
# Let long text columns (e.g. Mash query comments) render in full instead of being truncated.
pd.set_option("display.max_colwidth", 999)

In [127]:
# Directories holding one Mash result table per sample, split by replicon type.
chrDir = "./tsv/chromosome/"
plasDir = "./tsv/plasmid/"

# Column layout assigned to the headerless Mash TSV tables read below.
mashColumns = ["identity", "shared-hashes", "median-multiplicity", "p-value", "query-ID", "query-comment"]


def _loadMashTables(directory):
    """Read every Mash TSV in `directory` into a dict keyed by sample index.

    The sample index is the second dot-separated field of the file name.
    Tables that pandas cannot parse (empty files, files containing only
    comment lines, wrong column count) are stored as an empty DataFrame with
    the expected columns, so every value in the returned dict is a DataFrame.

    Parameters
    ----------
    directory : str
        Folder containing the tab-separated result files.

    Returns
    -------
    dict
        Maps sample index (str) to a DataFrame with `mashColumns` as columns.
    """
    tables = {}
    # Sorted so the dict, and everything derived from it, has a reproducible order.
    for file in sorted(os.listdir(directory)):
        path = os.path.join(directory, file)
        parts = file.split('.')
        # Skip sub-directories (e.g. .ipynb_checkpoints) and names that have
        # no second dot-separated field to use as the sample index.
        if len(parts) < 2 or not os.path.isfile(path):
            continue
        index = parts[1]
        try:
            table = pd.read_csv(path, sep='\t', comment='#', header=None)
            table.columns = mashColumns
        except ValueError:
            # pandas' EmptyDataError and ParserError both subclass ValueError,
            # as does the column-count mismatch raised by the assignment above.
            # A sample without hits is kept as an empty table; any other
            # exception (e.g. a permission error) is allowed to propagate.
            table = pd.DataFrame(columns=mashColumns)
        tables[index] = table
    return tables


chromosomes = _loadMashTables(chrDir)
plas = _loadMashTables(plasDir)

# Taxa metadata: one row per accession. The "accesion" spelling is kept as-is
# because it is a runtime column label that shows up in the stored rows.
taxa = pd.read_csv("taxa_metadata.tsv", sep='\t', comment='#', header=None)
taxa.columns = ["spp","bioid","accesion","type"]

# Lookup from accession to its full metadata row (a pandas Series).
taxaMap = {row["accesion"]: row for _, row in taxa.iterrows()}
print(taxaMap)

        

{'NC_009338': spp         Mycobacterium gilvum
bioid               SAMN02598346
accesion               NC_009338
type                  chromosome
Name: 0, dtype: object, 'NC_009339': spp         Mycobacterium gilvum
bioid               SAMN02598346
accesion               NC_009339
type                     plasmid
Name: 1, dtype: object, 'NC_009340': spp         Mycobacterium gilvum
bioid               SAMN02598346
accesion               NC_009340
type                     plasmid
Name: 2, dtype: object, 'NC_009341': spp         Mycobacterium gilvum
bioid               SAMN02598346
accesion               NC_009341
type                     plasmid
Name: 3, dtype: object, 'NC_007606': spp         Shigella dysenteriae
bioid               SAMN02603600
accesion               NC_007606
type                  chromosome
Name: 4, dtype: object, 'NC_009344': spp         Shigella dysenteriae
bioid               SAMN02603600
accesion               NC_009344
type                     plasmid
Name: 5, 

In [151]:
def getReferences(listOfMashHits, taxaMap, minHashes=10, secondaryWindow=100, strongHashes=500):
    """Grade every Mash hit of every sample and format it as a TSV line.

    For each sample the highest shared-hash count is determined first, and
    each hit is then graded against it, so the grade does not depend on the
    order of the rows in the table:

    * "best"      - equals the sample's highest count and exceeds `minHashes`
    * "secondary" - below the best but within `secondaryWindow` of it (and at
                    least `minHashes`), or above `strongHashes` outright
    * "low"       - at least `minHashes`
    * "noise"     - everything else

    Parameters
    ----------
    listOfMashHits : dict
        Maps a sample key to a DataFrame with at least the columns
        "shared-hashes" (text such as "123/1000"), "query-comment" and
        "identity". Values that are not DataFrames, or that are empty, are
        skipped.
    taxaMap : dict
        Maps an accession (without version suffix) to a record that can be
        indexed with "type" and "spp".
    minHashes, secondaryWindow, strongHashes : int, optional
        Grading thresholds; the defaults are the values this function
        previously had hard-coded.

    Returns
    -------
    list of str
        One tab-separated line per hit: key, grade, accession, type, spp,
        shared-hash count, identity. Accessions missing from `taxaMap` get
        "NA" for type and spp.
    """
    bestReference = []

    for key, hits in listOfMashHits.items():
        # Samples whose table could not be loaded may be stored as a
        # placeholder (e.g. the string "NA") or as an empty frame.
        if not isinstance(hits, pd.DataFrame) or hits.empty:
            continue

        # The numerator of "shared/total" is the number of shared hashes.
        hashCounts = [int(str(value).split('/')[0]) for value in hits["shared-hashes"]]
        bestHit = max(hashCounts)

        for curHash, comment, identity in zip(hashCounts, hits["query-comment"], hits["identity"]):
            # First token of the comment is "ACCESSION.VERSION"; drop the version.
            accession = str(comment).split(' ')[0].split('.')[0]
            record = taxaMap.get(accession)
            if record is None:
                accType, spp = "NA", "NA"
            else:
                accType, spp = record["type"], record["spp"]

            withinWindow = bestHit - secondaryWindow <= curHash < bestHit
            if curHash == bestHit and curHash > minHashes:
                grade = "best"
            elif (withinWindow and curHash >= minHashes) or curHash > strongHashes:
                grade = "secondary"
            elif curHash >= minHashes:
                grade = "low"
            else:
                grade = "noise"

            fields = (key, grade, accession, accType, spp, curHash, identity)
            bestReference.append("\t".join(str(field) for field in fields))
    return bestReference

# Grade the chromosome and the plasmid hit tables against the same taxa lookup.
bestChrReference, bestPlasReference = (
    getReferences(hitTables, taxaMap) for hitTables in (chromosomes, plas)
)


In [154]:
# Write each list of graded hits to its own headerless TSV, one record per line.
for outPath, records in (("chrRef.tsv", bestChrReference), ("plasRef.tsv", bestPlasReference)):
    with open(outPath, 'w') as f:
        f.writelines("%s\n" % item for item in records)