In [2]:
from __future__ import print_function
import glob
import os
import os.path as op
import pandas as pd

In [99]:
def ips_dict(ips):
    """Build a per-protein annotation lookup from an InterProScan TSV file.

    For every protein the hit(s) with the lowest numeric ``Score`` are kept
    and turned into one annotation entry.

    Parameters
    ----------
    ips : str
        Path to a header-less, tab-separated InterProScan result table with
        the 14 columns listed in ``ips_columns``.

    Returns
    -------
    dict
        Maps the protein accession (with ``"phage"`` replaced by
        ``"_virus"``) to ``[description, "InterPro", interpro_accession]``
        when the hit carries an InterPro description, or to
        ``[description, "InterPro"]`` when it does not.  Hits that have
        neither an InterPro description nor a usable signature description
        are skipped.  If several rows tie for the best score of a protein,
        the last one iterated wins.
    """
    ips_columns = ["ProteinAcc",
                   "SeqMD5",
                   "SeqLen",
                   "Analysis",
                   "SignatureAccession",
                   "SignatureDescription",
                   "StartLoc",
                   "StopLoc",
                   "Score",
                   "Status",
                   "Date",
                   "InterProAnnotationAcc",
                   "InterproAnnotationDesc",
                   "GOAnnotation"]
    df = pd.read_csv(ips, sep="\t", index_col=False, names=ips_columns)

    # Rows without a score carry a non-numeric placeholder such as "-", which
    # makes pandas load the whole column as strings.  A string minimum is
    # lexicographic ("-" sorts before every digit, "1.2E-5" before "3.4E-10"),
    # so compare numerically and rank unscored rows last; they are still used
    # when a protein has no scored hit at all.
    score = pd.to_numeric(df["Score"], errors="coerce").fillna(float("inf"))
    best_score = score.groupby(df["ProteinAcc"]).transform("min")
    best_hits = df[score == best_score].drop_duplicates()

    source = "InterPro"
    ips_anns = {}
    for _, hit in best_hits.iterrows():
        pid = hit.ProteinAcc.replace("phage", "_virus")

        if not pd.isnull(hit.InterproAnnotationDesc):
            # Integrated signature: keep the InterPro description and accession.
            ips_anns[pid] = [hit.InterproAnnotationDesc, source,
                             hit.InterProAnnotationAcc]
            continue

        if hit.Analysis == "TMHMM" or hit.Analysis == "Coils":
            # These analyses are described through their signature accession.
            desc = "{element} containing protein".format(
                element=hit.SignatureAccession)
        elif pd.isnull(hit.SignatureDescription):
            # Nothing usable to describe this hit with.
            continue
        else:
            desc = hit.SignatureDescription
        ips_anns[pid] = [desc, source]

    return ips_anns

In [100]:
# Spot-check the parser on a single InterProScan table; the returned dict is the cell output.
# NOTE(review): `ipslist` is only assigned in a later cell (the glob over ../ips_calls/),
# so on a fresh kernel that cell has to be run before this one.
ips_dict(ipslist[2])

{'Vibrio_virus_1.005.O._10N.286.48.F2_1': ['Bacteriophage T5, Orf172 DNA-binding',
  'IPS Pfam',
  'IPR018306'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_12': ['Tail sheath protein',
  'IPS Pfam',
  'IPR007067'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_19': ['Homeodomain-like',
  'IPS SUPERFAMILY',
  'IPR009057'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_2': ['Coil containing protein',
  'IPS Coils'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_21': ['TMhelix containing protein',
  'IPS TMHMM'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_22': ['Coil containing protein',
  'IPS Coils'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_23': ['Bacteriophage T5, Orf172 DNA-binding',
  'IPS Pfam',
  'IPR018306'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_25': ['DNA circulation, N-terminal',
  'IPS Pfam',
  'IPR009826'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_26': ['Metallo-dependent phosphatase-like',
  'IPS SUPERFAMILY',
  'IPR029052'],
 'Vibrio_virus_1.005.O._10N.286.48.F2_28': ['DNA methylase, C-5 cytosine-specific, activ

In [91]:
class GffLine(object):
    """One tab-separated GFF feature line, split into editable fields.

    The line must have at least nine tab-separated columns.  The ninth
    column is split on ``";"``; the first entry containing ``"ID="`` becomes
    ``pid``, the first entry containing ``"Name="`` becomes ``desc`` and all
    remaining entries are kept, in order, in ``notes``.

    Attributes
    ----------
    name, method, etype, cstart, cstop, dot, strand, something : str
        Columns 1-8 of the line, unchanged (``something`` is column 8,
        presumably the GFF phase column -- confirm).
    pid : str
        The ``ID=...`` entry, including the ``ID=`` prefix.
    desc : str
        The ``Name=...`` entry, including the ``Name=`` prefix.
    notes : str
        The other column-9 entries re-joined with ``";"``.
    key : str
        ``"<name>_<n>"`` where ``n`` is the integer after the last ``"_"``
        of ``pid``.

    Raises
    ------
    IndexError
        If the line has fewer than nine columns or lacks an ``ID=`` or
        ``Name=`` entry.
    ValueError
        If the text after the last ``"_"`` of the ID entry is not an integer.
    """

    def __init__(self, line):
        vec = line.strip().split("\t")
        self.name = vec[0]
        self.method = vec[1]
        self.etype = vec[2]
        self.cstart = vec[3]
        self.cstop = vec[4]
        self.dot = vec[5]
        self.strand = vec[6]
        self.something = vec[7]

        # Split the attribute column once and reuse it for all three fields.
        attributes = vec[8].split(";")
        self.notes = ";".join([a for a in attributes
                               if "ID=" not in a and "Name=" not in a])
        self.pid = [a for a in attributes if "ID=" in a][0]
        self.desc = [a for a in attributes if "Name=" in a][0]
        self.key = "{contig}_{number}".format(
            contig=self.name, number=int(self.pid.split("_")[-1]))

    def construct_note(self):
        """Return column 9 rebuilt from the current ``pid``, ``desc`` and ``notes``."""
        return ";".join([self.pid, self.desc, self.notes])

    def print_line(self):
        """Return the full tab-separated line reflecting any edits made so far."""
        return "\t".join([self.name, self.method, self.etype, self.cstart,
                          self.cstop, self.dot, self.strand, self.something,
                          self.construct_note()])

    def change_id(self, newid):
        """Replace the ID entry with ``ID=<newid>`` (``key`` is left unchanged)."""
        self.pid = "ID={newid}".format(newid=newid)

    def change_desc(self, newdesc):
        """Replace the Name entry with ``Name=<newdesc>``."""
        self.desc = "Name={newdesc}".format(newdesc=newdesc)

In [110]:
def find_file_matches(ipslist, gfflist):
    """Pair every InterProScan file with the first GFF path that mentions it.

    The lookup token is the file name (text after the last ``"/"``) cut off
    at its first ``"f"`` character.  Yields ``(token, ips_path, gff_path)``
    lazily, one tuple per entry of ``ipslist``; an ``IndexError`` is raised
    when no path in ``gfflist`` contains the token.
    """
    for ips_path in ipslist:
        file_name = ips_path.rsplit("/", 1)[-1]
        phage = file_name.partition("f")[0]
        candidates = [gff_path for gff_path in gfflist if phage in gff_path]
        yield phage, ips_path, candidates[0]
    
def combine_ips_gff3(ips, gff3, outfile):
    """Write a copy of a GFF3 file with InterProScan annotations merged in.

    Parameters
    ----------
    ips : str
        Path to the InterProScan TSV table, parsed with ``ips_dict``.
    gff3 : str
        Path to the GFF3 file to annotate.
    outfile : str
        Path of the annotated GFF3 file to write (overwritten if present).

    Lines mentioning ``tRNAScan`` or ``crispr`` (case-insensitive), comment
    lines starting with ``#`` and blank lines are copied through unchanged.
    Every other line is parsed as a ``GffLine``; when its ``key`` has an
    entry in the InterProScan lookup, the ``Name`` is replaced by the
    InterProScan description and, if the entry carries an InterPro
    accession, an ``ontology_term`` note is appended.
    """
    ipsdict = ips_dict(ips)
    with open(gff3) as infile, open(outfile, "w") as oh:
        for l in infile:
            # print() supplies the newline, so drop the one read from the file;
            # otherwise pass-through lines come out double-spaced.
            record = l.rstrip("\n")

            # Comment/blank lines have no columns for GffLine to parse.
            if not record.strip() or record.startswith("#"):
                print(record, file=oh)
                continue
            if "tRNAScan" in l or "crispr" in l.lower():
                print(record, file=oh)
                continue

            line = GffLine(l)
            annotation = ipsdict.get(line.key)
            if annotation is not None:
                line.change_desc(annotation[0])
                # Entries without an InterPro accession are [desc, source];
                # taking [-1] of those would emit the source name as if it
                # were an accession.
                if len(annotation) > 2:
                    term = 'ontology_term="InterPro:{ips}"'.format(ips=annotation[-1])
                    line.notes = line.notes + "; " + term if line.notes else term
            print(line.print_line(), file=oh)
                

In [None]:
# glob returns paths in arbitrary, filesystem-dependent order; sort them so that
# positional look-ups such as ipslist[2] pick the same file on every run and machine.
gfflist = sorted(glob.glob("../gff_newnames/*"))
ipslist = sorted(glob.glob("../ips_calls/*"))

In [105]:
# Create the output folder for the merged GFF3 files unless the path already exists;
# the guard lets this cell be re-run without os.mkdir raising on the existing folder.
if not op.exists("../combined_ips_gff3/"):
    os.mkdir("../combined_ips_gff3/")

In [111]:
# Merge every InterProScan table into its matching GFF3 file, one output file per match.
for phage, ips_path, gff_path in find_file_matches(ipslist, gfflist):
    combine_ips_gff3(
        ips_path,
        gff_path,
        "../combined_ips_gff3/{phage}_ips_blast.gff3".format(phage=phage),
    )