##GOAL: Get the tentative GO annotations for the Ehux transcriptomes from the Pfam and InterPro data, a la JGI

In [155]:
import pandas as pd
import matplotlib.pyplot as mpl 
from Bio import SeqIO
%matplotlib inline

In [3]:
##Load in interpro hash

#Form : InterPro2Go[InterPro accession]=[GO ids]
# The accession is the text after the first ':' of the first space-delimited
# token; the GO id is the text after the first ';' on the line.
# (Presumably the standard interpro2go mapping layout -- verify against the file.)
InterPro2Go={}
with open("Interpro2Go", 'r') as f:
    for line in f:
        # "!" marks header/comment lines; blank lines carry no mapping either.
        if line.startswith("!") or not line.strip():
            continue
        InterPro=line.split(" ")[0].split(":")[1].strip()
        GO=line.split(';')[1].strip()
        # setdefault gives a single hashed lookup instead of scanning .keys()
        InterPro2Go.setdefault(InterPro, []).append(GO)
    
            
##Load in Pfam hash

#Form : Pfam2Go[Pfam accession]=[GO ids]
# The accession is the text after the first ':' of the first space-delimited
# token; the GO id is the text after the first ';' on the line.
# (Presumably the standard pfam2go mapping layout -- verify against the file.)
Pfam2Go={}
with open("Pfam2Go", 'r') as f:
    for line in f:
        # "!" marks header/comment lines; blank lines carry no mapping either.
        if line.startswith("!") or not line.strip():
            continue
        Pfam=line.split(" ")[0].split(":")[1].strip()
        GO=line.split(';')[1].strip()
        # setdefault gives a single hashed lookup instead of scanning .keys()
        Pfam2Go.setdefault(Pfam, []).append(GO)
            

In [130]:
#Emi219|CAMPEP_0187750186
def nameConversion(inPepFile):
    """Build a lookup from the ID embedded in each FASTA description to 'Strain|record id'.

    inPepFile: path to a peptide FASTA file. Every digit character found
        anywhere in this path string is concatenated to form the strain number.

    Returns (NCGRIDConv, StrainName) where
        StrainName is 'Emi' followed by the digits taken from inPepFile, and
        NCGRIDConv maps the value after '=' in the second '/'-separated piece
        of each record description to StrainName + '|' + record id
        (e.g. 'Emi219|CAMPEP_0187750186').

    Raises IndexError if a record description lacks the expected '/' or '='
    separators.
    """
    strainNum=[s for s in inPepFile if s.isdigit()]
    StrainName='Emi'+''.join(strainNum)
    NCGRIDConv={}
    # Context manager so the FASTA handle is closed even if parsing fails.
    with open(inPepFile, 'r') as f:
        for item in SeqIO.parse(f, 'fasta'):
            desc=item.description.strip()
            NCGRID=desc.split('/')[1].split('=')[1].strip()
            NCGRIDConv[NCGRID]=StrainName+'|'+item.id
    return NCGRIDConv, StrainName

def GetInterPro(infile, rename=False, conHash=None):
    """Collect the InterPro accessions reported for each gene in *infile*.

    Lines starting with '#' and lines that do not contain 'InterPro' are
    skipped. For every other line the first tab-separated column is the gene
    name, and the fourth ';'-separated piece of the second column is read as
    'key=Prefix:ID,Prefix:ID,...'; the part after each ':' is kept.

    infile:  path to the tab-separated hit file.
    rename:  when true, each gene name is replaced by conHash[name].
    conHash: mapping used for renaming; required when rename is true
             (a missing name raises KeyError).

    Returns a dict: gene name -> set of InterPro accessions, accumulated over
    every line that mentions the gene.
    """
    Interpro_Hash={}
    with open(infile, 'r') as f:
        for line in f:
            if line.startswith('#') or 'InterPro' not in line:
                continue
            l=line.split('\t')
            name=l[0]
            if rename:
                name=conHash[name]
            Inter_list=l[1].split(';')[3].split('=')[1].split(',')
            # strip() keeps a trailing newline from sticking to the last ID
            # when the accession list is the final field on the line.
            accessions=set(x.split(':')[1].strip() for x in Inter_list)
            if name in Interpro_Hash:
                # update() merges in place; set.union() returns a new set and
                # would silently drop every hit after the gene's first line.
                Interpro_Hash[name].update(accessions)
            else:
                Interpro_Hash[name]=accessions
    return Interpro_Hash

def GetPfam(infile, rename=False, conHash=None):
    """Collect the Pfam accessions reported for each gene in *infile*.

    Lines starting with '#' and lines that do not contain 'Pfam:' are skipped.
    For every other line the first tab-separated column is the gene name, and
    the second ';'-separated piece of the second column is read as
    'key=Prefix:ID.version,Prefix:ID.version,...'; the part after each ':' is
    kept with anything from the first '.' onwards removed.

    infile:  path to the tab-separated hit file.
    rename:  when true, each gene name is replaced by conHash[name].
    conHash: mapping used for renaming; required when rename is true
             (a missing name raises KeyError).

    Returns a dict: gene name -> set of Pfam accessions, accumulated over
    every line that mentions the gene.
    """
    Pfam_Hash={}
    with open(infile, 'r') as f:
        for line in f:
            if line.startswith('#') or 'Pfam:' not in line:
                continue
            l=line.split('\t')
            name=l[0]
            if rename:
                name=conHash[name]
            Pfam_list=l[1].split(';')[1].split('=')[1].split(',')
            # strip() keeps a trailing newline from sticking to an unversioned
            # ID when the accession list is the final field on the line.
            accessions=set(x.split(':')[1].split('.')[0].strip() for x in Pfam_list)
            if name in Pfam_Hash:
                # update() merges in place; set.union() returns a new set and
                # would silently drop every hit after the gene's first line.
                Pfam_Hash[name].update(accessions)
            else:
                Pfam_Hash[name]=accessions
    return Pfam_Hash


# Quick check of the parser on one strain; rename is left at its default
# (False), so the keys are the gene names exactly as they appear in the file.
GetPfam('Emi219/pfam.all')

{'Emiliania-huxleyi-PLYM219-20130905|12013_1': {'PF08755'},
 'Emiliania-huxleyi-PLYM219-20130905|173565_1': {'PF04961'},
 'Emiliania-huxleyi-PLYM219-20130905|172986_1': {'PF00036'},
 'Emiliania-huxleyi-PLYM219-20130905|30516_1': {'PF00069'},
 'Emiliania-huxleyi-PLYM219-20130905|23443_1': {'PF02656'},
 'Emiliania-huxleyi-PLYM219-20130905|159597_1': {'PF03151'},
 'Emiliania-huxleyi-PLYM219-20130905|19149_1': {'PF00439'},
 'Emiliania-huxleyi-PLYM219-20130905|12985_1': {'PF01926'},
 'Emiliania-huxleyi-PLYM219-20130905|13493_1': {'PF01694'},
 'Emiliania-huxleyi-PLYM219-20130905|29872_1': {'PF13921'},
 'Emiliania-huxleyi-PLYM219-20130905|18161_1': {'PF00857'},
 'Emiliania-huxleyi-PLYM219-20130905|8324_1': {'PF05991'},
 'Emiliania-huxleyi-PLYM219-20130905|35777_1': {'PF07653'},
 'Emiliania-huxleyi-PLYM219-20130905|13133_1': {'PF10408'},
 'Emiliania-huxleyi-PLYM219-20130905|61753_1': {'PF00106'},
 'Emiliania-huxleyi-PLYM219-20130905|261_1': {'PF00076'},
 'Emiliania-huxleyi-PLYM219-20130905|213

In [132]:
#Get all the interpro and pfam calls for every strain
# Each hash below is keyed by strain name (e.g. 'Emi219'):
#   NameConHash[strain]       = ID conversion table returned by nameConversion
#   Emi_InterPro_Hash[strain] = GetInterPro result for <strain>/interpro.all
#   Emi_Pfam_Hash[strain]     = GetPfam result for <strain>/pfam.all
import glob
Emi_InterPro_Hash={}
Emi_Pfam_Hash={}
NameConHash={}
StrainNames=[]
for pepFile in glob.glob('*pep*'):
    # nameConversion already derives the strain name ('Emi' + the digits in
    # the file name), so it is taken from there instead of being recomputed.
    conHash, StrainName=nameConversion(pepFile)
    StrainNames.append(StrainName)
    NameConHash[StrainName]=conHash
    Emi_InterPro_Hash[StrainName]=GetInterPro(StrainName+'/interpro.all', rename=True, conHash=conHash)
    Emi_Pfam_Hash[StrainName]=GetPfam(StrainName+'/pfam.all', rename=True, conHash=conHash)


In [135]:
#Convert interpro to GO
# GO_Hash[strain][geneID] = set of GO ids reached through the gene's InterPro
# accessions. Genes whose accessions have no entry in InterPro2Go get no key.
GO_Hash={}
for strain in StrainNames:
    GO_tmp={}
    for entry, accessions in Emi_InterPro_Hash[strain].items():
        for item in accessions:
            # Test membership on the dict itself: on Python 2 .keys() builds
            # a list, making every lookup a linear scan.
            if item in InterPro2Go:
                GO_tmp.setdefault(entry, set()).update(InterPro2Go[item])
    GO_Hash[strain]=GO_tmp

        

In [136]:
# Merge the GO ids reached through each gene's Pfam accessions into the
# existing GO_Hash[strain][geneID] sets, creating the set when the gene has
# no GO ids yet.
for strain in StrainNames:
    strainGO=GO_Hash[strain]
    for entry, families in Emi_Pfam_Hash[strain].items():
        for item in families:
            # Test membership on the dict itself: on Python 2 .keys() builds
            # a list, making every lookup a linear scan.
            if item in Pfam2Go:
                strainGO.setdefault(entry, set()).update(Pfam2Go[item])



In [141]:
# Save the GO annotation table to disk.
try:
    import cPickle as cpk
except ImportError:
    # cPickle only exists on Python 2; pickle offers the same dump() call.
    import pickle as cpk
# Pickle data is a byte stream, so the file is opened in binary mode, and the
# context manager guarantees it is flushed and closed.
with open('GO_Term_Hash.pickle','wb') as pickleFile:
    cpk.dump(GO_Hash, pickleFile)

In [154]:
# Display the combined GO annotations: GO_Hash[strain][geneID] -> set of GO ids.
GO_Hash

{'Emi219': {'Emi219|CAMPEP_0187745180': {'GO:0005515'},
  'Emi219|CAMPEP_0187749126': {'GO:0035091'},
  'Emi219|CAMPEP_0187749128': {'GO:0008152', 'GO:0016491'},
  'Emi219|CAMPEP_0187777444': {'GO:0005515'},
  'Emi219|CAMPEP_0187779602': {'GO:0004842'},
  'Emi219|CAMPEP_0187804586': {'GO:0016021', 'GO:0055085'},
  'Emi219|CAMPEP_0187804580': {'GO:0016307', 'GO:0046488'},
  'Emi219|CAMPEP_0187784104': {'GO:0016051'},
  'Emi219|CAMPEP_0187788484': {'GO:0004672', 'GO:0005524', 'GO:0006468'},
  'Emi219|CAMPEP_0187741312': {'GO:0005247',
   'GO:0006821',
   'GO:0016020',
   'GO:0055085'},
  'Emi219|CAMPEP_0187746470': {'GO:0016020',
   'GO:0030001',
   'GO:0046873',
   'GO:0055085'},
  'Emi219|CAMPEP_0187775014': {'GO:0008033',
   'GO:0017150',
   'GO:0050660',
   'GO:0055114'},
  'Emi219|CAMPEP_0187759506': {'GO:0005515'},
  'Emi219|CAMPEP_0187759012': {'GO:0006508', 'GO:0008236'},
  'Emi219|CAMPEP_0187739840': {'GO:0004672', 'GO:0005524', 'GO:0006468'},
  'Emi219|CAMPEP_0187766002': {'GO:

In [153]:
len(qre


Error while parsing the string.
