In [176]:
import pandas as pd
import numpy as np

#### import contig-host info 

In [177]:
# Both result tables are read as tab-delimited text (despite the .csv suffix),
# with the column names taken from the first line of each file.

# CRISPR results
crispr = pd.read_csv(
    "../../tmp2/10_contaminants/pilecr/crispr.finalOutput.csv",
    sep="\t",
    header=0,
)

# MVP results
mvp = pd.read_csv(
    "../../tmp2/10_contaminants/MVP/mvp.finaloutput.csv",
    sep="\t",
    header=0,
)

In [178]:
# Accession -> taxid lookup for the CRISPR table. According to the original
# note, the mapping file was obtained through Batch Entrez.
taxInfocrispr = pd.read_csv(
    "../../tmp2/10_contaminants/pilecr/taxInfoCrisprTable.csv",
    sep="\t",
    header=0,
)
# Collapse the two columns into a plain dict: key = "acc", value = "taxid".
taxInfocrispr = dict(zip(taxInfocrispr["acc"].values, taxInfocrispr["taxid"]))


def getTax(x):
    """Return the taxid stored for accession ``x``; KeyError if it is absent."""
    return taxInfocrispr[x]


# Annotate every CRISPR row with the taxid of its "qaccver" accession.
crispr["taxid"] = crispr["qaccver"].apply(getTax)

In [179]:
# Bring the CRISPR and MVP tables to one shared five-column layout, then stack them.
columns = ["contig", "sid", "lineage", "hostName", "host_taxon_id"]

# CRISPR: saccver -> contig, qaccver -> sid, name -> hostName, taxid -> host_taxon_id
crispr = crispr.loc[:, ["saccver", "qaccver", "lineage", "name", "taxid"]]
crispr.columns = columns

# MVP: qid -> contig; the remaining columns already carry the shared names
mvp = mvp.loc[:, ["qid", "sid", "lineage", "hostName", "host_taxon_id"]]
mvp.columns = columns

# CRISPR rows come first, MVP rows second; each part keeps its own row index.
output = pd.concat([crispr, mvp])

#### strains tree

In [5]:
from ete3 import NCBITaxa

# Handle on the NCBI taxonomy as exposed by ete3.
ncbi = NCBITaxa()

# Taxonomy tree covering every distinct host taxid in the combined table.
# NOTE(review): intermediate_nodes=True is presumably what keeps the
# single-child lineage nodes that the rank lookups further down walk through
# -- confirm against the ete3 documentation.
tree = ncbi.get_topology(
    output["host_taxon_id"].unique(),
    intermediate_nodes=True,
)

#### Genus tree

In [6]:
def getGenusNode(node):
    """Return the nearest ancestor of ``node`` whose rank is "genus".

    The ancestor list is fetched once and scanned in order, instead of being
    rebuilt at every level of a recursive climb.

    Parameters
    ----------
    node : object exposing ``get_ancestors()``; every ancestor must expose a
        ``rank`` attribute. Assumes ``get_ancestors()`` lists ancestors
        nearest-first (the previous implementation treated element 0 as the
        parent).

    Returns
    -------
    The first ancestor for which ``str(rank) == "genus"``. ``node`` itself is
    never returned, even when it is a genus.

    Raises
    ------
    IndexError
        When no ancestor has rank "genus". This is the same exception type
        the recursive implementation raised once it ran out of ancestors.
    """
    for ancestor in node.get_ancestors():
        if str(ancestor.rank) == "genus":
            return ancestor
    raise IndexError("no ancestor of rank 'genus' found")

In [7]:
# Collect one genus-level node name per leaf of the host tree.
genusTaxIds = []
noGenus = []  # leaves for which no genus-rank ancestor could be found
for leaf in tree.get_leaves():
    if str(leaf.rank) == "genus":
        # The leaf is itself a genus: use it directly.
        genusTaxIds.append(leaf.name)
        continue
    try:
        genusTaxIds.append(getGenusNode(leaf).name)
    except IndexError:
        # getGenusNode signals "ran out of ancestors" with IndexError. Only
        # that case is treated as "no genus"; any other error now surfaces
        # instead of being silently swallowed by a bare except.
        noGenus.append(leaf.name)
# Leaves without a genus are kept under their own name so that they are still
# part of the id list the genus tree is built from.
genusTaxIds += noGenus

In [8]:
# Duplicates are removed before requesting the genus-level topology.
uniqueGenusTaxIds = list(set(genusTaxIds))
genusTree = ncbi.get_topology(uniqueGenusTaxIds, intermediate_nodes=True)

In [22]:
# Save the genus tree in Newick format.
# NOTE(review): format=9 is presumably ete3's "leaf names only" flavour -- confirm.
genusTree.write(
    outfile="../../tmp2/10_contaminants/pilecr/hostsTree.nwk",
    format=9,
)

In [23]:
%%writefile ../../tmp2/10_contaminants/pilecr/changeNames.txt
LABELS
#use this template to change the leaf labels, or define/change the internal node names (displayed in mouseover popups)

#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).

#SEPARATOR TAB
#SEPARATOR SPACE
SEPARATOR COMMA

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA
#NODE_ID,LABEL

#Examples
#define a name for an internal node
#9031|9606,Metazoa

#change the label for leaf node 9606
#9606,Homo sapiens

Overwriting ../../tmp2/10_contaminants/pilecr/changeNames.txt


In [24]:
# Append one "NODE_ID,LABEL" row per genus-tree leaf to the iTOL LABELS file
# created by the %%writefile cell above.
#
# file.write() replaces the Python-2-only ``print >> fa, ...`` statement (on
# Python 3 that expression raises TypeError), and the ``with`` block closes
# the handle even if a leaf is missing one of the attributes.
#
# NOTE(review): the file is opened in append mode, so running this cell twice
# without re-running the %%writefile cell duplicates the rows.
# NOTE(review): a scientific name containing a comma would clash with
# "SEPARATOR COMMA" in the template -- not handled here.
with open("../../tmp2/10_contaminants/pilecr/changeNames.txt", "a") as fa:
    for leaf in genusTree.get_leaves():
        fa.write(leaf.name + "," + leaf.sci_name + "\n")

#### Append species and genus info

In [72]:
def getPhylumNode(node):
    """Return the nearest ancestor of ``node`` whose rank is "species".

    NOTE: despite its name, this function looks for a *species*-rank
    ancestor, not a phylum. The name is kept because other cells call it.

    The ancestor list is fetched once and scanned in order, instead of being
    rebuilt at every level of a recursive climb.

    Parameters
    ----------
    node : object exposing ``get_ancestors()``; every ancestor must expose a
        ``rank`` attribute. Assumes ``get_ancestors()`` lists ancestors
        nearest-first (the previous implementation treated element 0 as the
        parent).

    Returns
    -------
    The first ancestor for which ``str(rank) == "species"``. ``node`` itself
    is never returned.

    Raises
    ------
    IndexError
        When no ancestor has rank "species". This is the same exception type
        the recursive implementation raised once it ran out of ancestors.
    """
    for ancestor in node.get_ancestors():
        if str(ancestor.rank) == "species":
            return ancestor
    raise IndexError("no ancestor of rank 'species' found")

In [90]:
# Map every distinct host taxid (as a string) to the name of its species-rank
# node in ``tree``; taxids for which no such node is found map to None.
tax2species = {}
for taxid in output["host_taxon_id"].unique():
    taxid = str(taxid)  # the tree is searched by string node name
    try:
        node = tree.search_nodes(name=taxid)[0]
        if str(node.rank) == "species":
            tax2species[taxid] = node.name
        else:
            # getPhylumNode actually returns the species-rank ancestor.
            tax2species[taxid] = getPhylumNode(node).name
    except IndexError:
        # IndexError covers both expected misses: the taxid has no node of
        # that name in the tree, or the node has no species-rank ancestor
        # (the taxid is above species level, typically a genus). Other errors
        # are no longer hidden by a bare except.
        tax2species[taxid] = None

In [92]:
def species(x):
    """Look up the species node name recorded for host taxid ``x``."""
    return tax2species[str(x)]


# New column: species-level node name for each row's host taxid (may be None).
output["species"] = output["host_taxon_id"].apply(species)

In [94]:
def getGenusNode(node):
    """Return the nearest ancestor of ``node`` whose rank is "genus".

    NOTE: this cell re-declares ``getGenusNode``, which the "Genus tree"
    section of the notebook also defines.

    The ancestor list is fetched once and scanned in order, instead of being
    rebuilt at every level of a recursive climb.

    Parameters
    ----------
    node : object exposing ``get_ancestors()``; every ancestor must expose a
        ``rank`` attribute. Assumes ``get_ancestors()`` lists ancestors
        nearest-first (the previous implementation treated element 0 as the
        parent).

    Returns
    -------
    The first ancestor for which ``str(rank) == "genus"``. ``node`` itself is
    never returned, even when it is a genus.

    Raises
    ------
    IndexError
        When no ancestor has rank "genus". This is the same exception type
        the recursive implementation raised once it ran out of ancestors.
    """
    for ancestor in node.get_ancestors():
        if str(ancestor.rank) == "genus":
            return ancestor
    raise IndexError("no ancestor of rank 'genus' found")

In [95]:
# Map every distinct host taxid (as a string) to the name of its genus-rank
# node in ``tree``; taxids for which no genus is found map to themselves.
tax2genus = {}
for taxid in output["host_taxon_id"].unique():
    taxid = str(taxid)  # the tree is searched by string node name
    try:
        node = tree.search_nodes(name=taxid)[0]
        if str(node.rank) == "genus":
            tax2genus[taxid] = node.name
        else:
            tax2genus[taxid] = getGenusNode(node).name
    except IndexError:
        # IndexError covers both expected misses: the taxid has no node of
        # that name in the tree, or the node has no genus-rank ancestor.
        # The taxid is then used as its own "genus" key. Other errors are no
        # longer hidden by a bare except.
        tax2genus[taxid] = taxid

In [96]:
def genus(x):
    """Look up the genus node name recorded for host taxid ``x``."""
    return tax2genus[str(x)]


# New column: genus-level node name (or the taxid itself as fallback) per row.
output["genus"] = output["host_taxon_id"].apply(genus)

In [160]:
%%writefile ../../tmp2/10_contaminants/pilecr/species_multibar.txt

DATASET_MULTIBAR
#In multi-value bar charts, each ID is associated to multiple numeric values, which are displayed as a stacked or aligned bar chart
#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
#SEPARATOR SPACE
SEPARATOR COMMA

#label is used in the legend table (can be changed later)
DATASET_LABEL,example multi bar chart

#dataset color (can be changed later)
COLOR,#ff0000

#define colors for each individual field column (use hexadecimal, RGB or RGBA notation; if using RGB/RGBA, COMMA cannot be used as SEPARATOR)
FIELD_COLORS,#ff0000

#field labels
FIELD_LABELS,Species

#=================================================================#
#                    OPTIONAL SETTINGS                            #
#=================================================================#

#dataset scale: you can simply set the values where the scale will be drawn
#DATASET_SCALE,2000,10000,20000
#or you can specify value, label and color for each scale line (dash separated, format: VALUE-LABEL-COLOR) 
#DATASET_SCALE,2000-2k line-#0000ff,10000-line at 10k-#ff0000,20000-3rd line-#00ff00

#each dataset can have a legend, which is defined below
#for each row in the legend, there should be one shape, color and label
#shape should be a number between 1 and 6:
#1: square
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
#6: checkmark

#LEGEND_TITLE,Legend
#LEGEND_SHAPES,1,1
#LEGEND_COLORS,#ff0000,#00ff00
#LEGEND_LABELS,Number of species

#=================================================================#
#     all other optional settings can be set or changed later     #
#           in the web interface (under 'Datasets' tab)           #
#=================================================================#

#maximum width
WIDTH,500

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
MARGIN,100

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
#SHOW_INTERNAL,0

#bar height factor; Default bar height will be slightly less than the available space between leaves, but you can set a multiplication factor here to increase/decrease it (values from 0 to 1 will decrease it, values above 1 will increase it)
#HEIGHT_FACTOR,1

#Bars are aligned to the node lines by default. Using BAR_SHIFT, you can move them all up/down by a fixed amount
#BAR_SHIFT,0

#align individual fields; if set to 1, individual bar charts will not be stacked
#ALIGN_FIELDS,0

#border width; if set above 0, a border of specified width (in pixels) will be drawn around the bars
#BORDER_WIDTH,0

#border color; used when BORDER_WIDTH is above 0
#BORDER_COLOR,#0000ff

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA
#ID1,value1,value2,value3
#ID2,value4,value5,value6
#9606,10000,15000,9000
#LEAF1|LEAF2,11000,9000,120007


Overwriting ../../tmp2/10_contaminants/pilecr/species_multibar.txt


In [161]:
# ring1: for each genus, how many distinct values its rows hold in "species".
# NOTE(review): hosts without a species-level node carry None in "species";
# unique() presumably counts that as one more value -- confirm this is wanted.
ring1Dict = {
    genusName: len(output.loc[output["genus"] == genusName, "species"].unique())
    for genusName in output["genus"].unique()
}

In [162]:
# Append one "genus,count" row per genus to the iTOL multibar file created by
# the %%writefile cell above.
#
# file.write() replaces the Python-2-only ``print >> fw, ...`` statement (on
# Python 3 that expression raises TypeError), and the ``with`` block closes
# the handle even if a row cannot be written.
#
# NOTE(review): the file is opened in append mode, so running this cell twice
# without re-running the %%writefile cell duplicates the rows.
with open("../../tmp2/10_contaminants/pilecr/species_multibar.txt", "a") as fw:
    for genus in ring1Dict:
        fw.write(genus + "," + str(ring1Dict[genus]) + "\n")

In [163]:
%%writefile ../../tmp2/10_contaminants/pilecr/phages_multibar.txt

DATASET_MULTIBAR
#In multi-value bar charts, each ID is associated to multiple numeric values, which are displayed as a stacked or aligned bar chart
#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
#SEPARATOR SPACE
SEPARATOR COMMA

#label is used in the legend table (can be changed later)
DATASET_LABEL,phages

#dataset color (can be changed later)
COLOR,#00ff00

#define colors for each individual field column (use hexadecimal, RGB or RGBA notation; if using RGB/RGBA, COMMA cannot be used as SEPARATOR)
FIELD_COLORS,#00ff00

#field labels
FIELD_LABELS,Phages

#=================================================================#
#                    OPTIONAL SETTINGS                            #
#=================================================================#

#dataset scale: you can simply set the values where the scale will be drawn
#DATASET_SCALE,2000,10000,20000
#or you can specify value, label and color for each scale line (dash separated, format: VALUE-LABEL-COLOR) 
#DATASET_SCALE,2000-2k line-#0000ff,10000-line at 10k-#ff0000,20000-3rd line-#00ff00

#each dataset can have a legend, which is defined below
#for each row in the legend, there should be one shape, color and label
#shape should be a number between 1 and 6:
#1: square
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
#6: checkmark

#LEGEND_TITLE,Legend
#LEGEND_SHAPES,1,1
#LEGEND_COLORS,#ff0000,#00ff00
#LEGEND_LABELS,Number of phages

#=================================================================#
#     all other optional settings can be set or changed later     #
#           in the web interface (under 'Datasets' tab)           #
#=================================================================#

#maximum width
WIDTH,500

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
MARGIN,50

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
#SHOW_INTERNAL,0

#bar height factor; Default bar height will be slightly less than the available space between leaves, but you can set a multiplication factor here to increase/decrease it (values from 0 to 1 will decrease it, values above 1 will increase it)
#HEIGHT_FACTOR,1

#Bars are aligned to the node lines by default. Using BAR_SHIFT, you can move them all up/down by a fixed amount
#BAR_SHIFT,0

#align individual fields; if set to 1, individual bar charts will not be stacked
#ALIGN_FIELDS,0

#border width; if set above 0, a border of specified width (in pixels) will be drawn around the bars
#BORDER_WIDTH,0

#border color; used when BORDER_WIDTH is above 0
#BORDER_COLOR,#0000ff

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA
#ID1,value1,value2,value3
#ID2,value4,value5,value6
#9606,10000,15000,9000
#LEAF1|LEAF2,11000,9000,120007


Overwriting ../../tmp2/10_contaminants/pilecr/phages_multibar.txt


In [164]:
# ring2: for each genus, how many distinct contigs (phages) are linked to it.
ring2Dict = {
    genusName: len(output.loc[output["genus"] == genusName, "contig"].unique())
    for genusName in output["genus"].unique()
}

In [165]:
# Append one "genus,count" row per genus to the iTOL multibar file for phages.
#
# The loop now walks ring2Dict, the dictionary it reads from. The previous
# version iterated ring1Dict (a copy-paste slip that only worked because both
# dictionaries are built over the same genus keys).
#
# file.write() replaces the Python-2-only ``print >> fw, ...`` statement (on
# Python 3 that expression raises TypeError), and the ``with`` block closes
# the handle even if a row cannot be written.
#
# NOTE(review): the file is opened in append mode, so running this cell twice
# without re-running the %%writefile cell duplicates the rows.
with open("../../tmp2/10_contaminants/pilecr/phages_multibar.txt", "a") as fw:
    for genus in ring2Dict:
        fw.write(genus + "," + str(ring2Dict[genus]) + "\n")

In [180]:
# Preview the first rows of the combined contig/host table.
# NOTE(review): the saved output shows no "species"/"genus" columns, apparently
# because this cell was last executed right after the combine cell was re-run
# (execution counts 176-180); a top-to-bottom run would show them here.
output.head()

Unnamed: 0,contig,sid,lineage,hostName,host_taxon_id
0,9_P1_8Lpp_out_L5890,CP001107.1,"['[Eubacterium] rectale ATCC 33656', 'Bacteria...",[Eubacterium] rectale ATCC 33656,515619
1,9_P1_8Lpp_out_L5890,FP929042.1,"['[Eubacterium] rectale DSM 17629', 'Bacteria'...",[Eubacterium] rectale DSM 17629,657318
2,983_Ra2_8Lpp_out_L551,CP002780.1,"['Desulfotomaculum ruminis DSM 2154', 'Bacteri...",Desulfotomaculum ruminis DSM 2154,696281
3,970_B2_8Lpp_out_L3046,CP010432.1,"['Lactobacillus acidophilus', 'Bacteria', 'Fir...",Lactobacillus acidophilus,1579
4,970_B2_8Lpp_out_L3046,CP005926.2,"['Lactobacillus acidophilus La-14', 'Bacteria'...",Lactobacillus acidophilus La-14,1314884


In [173]:
# Number of distinct host names in the combined table (shown as a 1-tuple).
output["hostName"].unique().shape

(249,)

In [174]:
# Number of distinct values in the "species" column (shown as a 1-tuple).
# NOTE(review): None (hosts with no species-level node) presumably counts as
# one of these values -- confirm before quoting this as a species count.
output["species"].unique().shape

(226,)

In [175]:
# Number of distinct values in the "genus" column (shown as a 1-tuple); this
# includes taxids that were kept as their own key when no genus was found.
output["genus"].unique().shape

(94,)