In [15]:
from __future__ import print_function
import glob
import os
import os.path as op
from itertools import groupby
import shutil
from collections import defaultdict

In [35]:
def read_fasta(file_handle):
    '''Iterate over FASTA records, yielding (name, sequence) tuples.

    file_handle: any iterable of text lines (e.g. an open file).

    Lines are grouped into consecutive runs of header lines (first character
    '>') and non-header lines. The record name is the first header line of a
    run with the leading '>' and surrounding whitespace removed; the sequence
    is the following run of lines, each stripped and joined together.
    '''
    def is_header(text):
        return text[0] == '>'

    for starts_record, chunk in groupby(file_handle, is_header):
        if not starts_record:
            # Sequence run: glue the stripped lines into a single string and
            # emit it together with the most recently seen record name.
            seq = ''.join(piece.strip() for piece in chunk)
            yield name, seq
            continue
        # Header run: only the first header line of the run names the record.
        name = next(chunk)[1:].strip()
            
def safe_mkdir(f):
    '''Create directory `f` if nothing exists at that path, and return `f`.

    Missing parent directories are created as well. If another process
    creates the path between the existence check and the creation attempt,
    the resulting OSError is ignored; any other OSError is re-raised.
    '''
    if not op.exists(f):
        try:
            os.makedirs(f)
        except OSError:
            # Only swallow the error when the path appeared in the meantime.
            if not op.exists(f):
                raise
    return f

In [12]:
# FASTA files one directory up. glob returns matches in arbitrary order, so
# sort them to get a reproducible processing (and printing) order.
genomes = sorted(glob.glob("../*.fasta"))

In [13]:
# Identifiers to rename: the i-th whitespace-separated entry of `old` is
# replaced by the i-th entry of `new`.
old = '''1.064.O._10N.261.52.E5
1.087.A._10N.261.45.F7
1.117.O._10N.261.45.E8
1.293.O._10N.261.52.E4
2.117.O._10N.261.45.E8'''

new = '''1.064.O._10N.261.52.E2
1.087.A._10N.261.45.F9
1.117.O._10N.261.45.E9
1.293.O._10N.261.52.E1
2.117.O._10N.261.45.E9'''

old_names, new_names = old.split(), new.split()
# zip() stops at the shorter list without complaint, so make a length
# mismatch between the two blocks fail loudly instead of dropping names.
assert len(old_names) == len(new_names), "old/new name lists differ in length"
namechange = dict(zip(old_names, new_names))

In [28]:
# Rewrite every genome FASTA into ../fasta/ with "Vibrio_virus_" record names
# (applying the `namechange` corrections), then move the original file into
# ../fastas_badnames/.
# Both target directories must exist before the loop: open(..., "w") and
# shutil.move() do not create them.
safe_mkdir("../fasta")
safe_mkdir("../fastas_badnames")

for f in genomes:
    newfile = op.join("../fasta/", op.basename(f))
    with open(f) as infast, open(newfile, "w") as outfast:
        for name, seq in read_fasta(infast):
            name1 = name.replace("Vibriophage_", "")
            if name1 in namechange:
                # Report each corrected identifier: old name, then new name.
                print(name1)
                print(namechange[name1])
                newname = "Vibrio_virus_{nn}".format(nn=namechange[name1])
            else:
                newname = "Vibrio_virus_{name1}".format(name1=name1)

            print(">" + newname, file=outfast)
            # Wrap the sequence at 80 characters per line.
            for i in range(0, len(seq), 80):
                print(seq[i:i+80], file=outfast)
    # The input handle is closed by now, so the original can be moved safely.
    shutil.move(f, "../fastas_badnames/")

1.064.O._10N.261.52.E5
1.064.O._10N.261.52.E2
1.087.A._10N.261.45.F7
1.087.A._10N.261.45.F9
1.117.O._10N.261.45.E8
1.117.O._10N.261.45.E9
1.293.O._10N.261.52.E4
1.293.O._10N.261.52.E1
2.117.O._10N.261.45.E8
2.117.O._10N.261.45.E9


In [56]:
# Rewrite every file in ../gff/ into ../gff_newnames/ (dropping ".sf" from the
# file name), replacing the first tab-separated column of every line with the
# "Vibrio_virus_" name derived from the first line of that file, then move the
# original into ../gff_oldnames/.
safe_mkdir("../gff_oldnames")
safe_mkdir("../gff_newnames")

for f in glob.glob("../gff/*"):
    new_gff = op.join("../gff_newnames/", op.basename(f).replace(".sf", ""))
    with open(f) as ih, open(new_gff, "w") as oh:
        newname = None
        for l in ih:
            vec = l.strip("\n").split("\t")
            if newname is None:
                # The first line's first column decides the name used for the
                # whole file; apply the `namechange` correction if there is one.
                name1 = vec[0].replace("Vibriophage_", "")
                newname = "Vibrio_virus_{nn}".format(nn=namechange.get(name1, name1))
            print("\t".join([newname] + vec[1:]), file=oh)
    shutil.move(f, "../gff_oldnames/")

In [28]:
# dataset totals
tara_clusts = defaultdict(lambda:0)
no_match_tara = 0
match_tara = 0

for i, f in enumerate(glob.glob("../gff_newnames/*")):
    #if i == 10:
    #    break
    with open(f) as ih:
        for j, g in enumerate(ih):
            #if j == 3:
            #    break
            vec = g.strip("\n").split("\t")[-1]
            if "tara" in vec:
                vec2 = vec.split(";")
                for v2 in vec2:
                    if "tara" in v2: 
                        clust = v2.split("=")[1].replace('"','').split(":")[1]
                        tara_clusts[clust] += 1
                        match_tara += 1
                    else:
                        no_match_tara += 1
print("{match_tara} proteins matched {tara_clusts} "
      "different Tara OM-RGC protien clusters".format(match_tara=match_tara,
                                                      
                                                      tara_clusts = len(tara_clusts)))
print(no_match_tara, "proteins had no match to Tara protein clusters")

8539 proteins matched 2454 different Tara OM-RGC protien clusters
34944 proteins had no match to Tara protein clusters


These numbers are somewhat misleading: we did not place these proteins into Tara OM-RGC protein clusters using the same clustering parameters as the original clustering, but instead assigned them with a BLAST-based method.

In [20]:
# individual phages
# NOTE(review): despite the heading, nothing here is kept per phage -- the loop
# adds to the notebook-wide `tara_clusts` / `no_match_tara` tallies, so running
# it after the "dataset totals" cell counts every file a second time. Confirm
# whether per-file tallies were intended.

for f in glob.glob("../gff_newnames/*"):
    with open(f) as ih:
        for g in ih:
            # Only the last tab-separated column is inspected.
            vec = g.strip("\n").split("\t")[-1]
            if "tara" not in vec:
                # One unmatched line, rather than one per attribute field.
                no_match_tara += 1
                continue
            for v2 in vec.split(";"):
                if "tara" in v2:
                    clust = v2.split("=")[1].replace('"', '').split(":")[1]
                    tara_clusts[clust] += 1

defaultdict(<function __main__.<lambda>>,
            {'OM-RGC.v1.023737250': 1,
             'OM-RGC.v1.018565250': 1,
             'OM-RGC.v1.026227680': 2,
             'OM-RGC.v1.029727015': 3,
             'OM-RGC.v1.027364285': 1,
             'OM-RGC.v1.030179339': 2,
             'OM-RGC.v1.001269727': 1,
             'OM-RGC.v1.004240700': 2,
             'OM-RGC.v1.023989711': 2,
             'OM-RGC.v1.012933651': 1,
             'OM-RGC.v1.000007770': 3,
             'OM-RGC.v1.037615457': 3,
             'OM-RGC.v1.019206931': 1,
             'OM-RGC.v1.000292742': 2,
             'OM-RGC.v1.019206935': 1,
             'OM-RGC.v1.000423395': 1,
             'OM-RGC.v1.004396561': 2,
             'OM-RGC.v1.035324058': 1,
             'OM-RGC.v1.013263615': 1,
             'OM-RGC.v1.015585201': 6,
             'OM-RGC.v1.002498149': 1,
             'OM-RGC.v1.022658720': 2,
             'OM-RGC.v1.030257014': 1,
             'OM-RGC.v1.004580008': 1,
             'OM-RGC.v