##GOAL: Get the tentative GO annotations for the Ehux transcriptomes from the Pfam and InterPro data, a la JGI

In [58]:
import pandas as pd
import matplotlib.pyplot as mpl 
from Bio import SeqIO
import cPickle as cpk
%matplotlib inline

In [2]:
##Load in the InterPro -> GO and Pfam -> GO hashes

def _load_go_mapping(path):
    """Parse an accession-to-GO mapping file into ``{accession: [GO ids]}``.

    Each data line is expected to start with ``<db>:<accession>`` followed by
    a space, and to carry the GO id after the first ';' on the line.
    Lines starting with '!' and blank lines carry no mapping and are skipped.

    path    -- file to read
    returns -- dict mapping accession (text after the first ':' of the first
               space-separated token) to the list of GO ids, in file order
    """
    mapping = {}
    # Context manager so the handle is closed even if a line fails to parse.
    with open(path, 'r') as handle:
        for line in handle:
            if line.startswith("!") or not line.strip():
                continue
            accession = line.split(" ")[0].split(":")[1].strip()
            go_id = line.split(';')[1].strip()
            # setdefault avoids a separate membership test per line.
            mapping.setdefault(accession, []).append(go_id)
    return mapping


#Form : InterPro2Go[InterPro]=[GOterms]
InterPro2Go = _load_go_mapping("Interpro2Go")

#Form : Pfam2Go[Pfam]=[GOterms]
Pfam2Go = _load_go_mapping("Pfam2Go")
            

In [3]:
#Emi219|CAMPEP_0187750186
def nameConversion(inPepFile):
    """Build a lookup from NCGR ids to strain-prefixed record ids.

    The strain name is 'Emi' followed by every digit character found in
    ``inPepFile`` (the whole string as passed in, including any directory
    part). For each FASTA record, the id is taken from the record
    description: the text after the first '=' of the second '/'-separated
    field.

    inPepFile -- path to a peptide FASTA file
    returns   -- (NCGRIDConv, StrainName) where NCGRIDConv maps that id to
                 ``StrainName + '|' + record.id``
    raises    -- IndexError if a record description lacks the expected
                 '/' and '=' separators
    """
    digits = [ch for ch in inPepFile if ch.isdigit()]
    StrainName = 'Emi' + ''.join(digits)
    NCGRIDConv = {}
    # Context manager so the FASTA handle is released after parsing.
    with open(inPepFile, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            description = record.description.strip()
            NCGRID = description.split('/')[1].split('=')[1].strip()
            NCGRIDConv[NCGRID] = StrainName + '|' + record.id
    return NCGRIDConv, StrainName

def GetInterPro(infile, rename=False, conHash=None):
    """Collect the InterPro accessions listed for each gene in ``infile``.

    Lines starting with '#' and lines that do not contain 'InterPro' are
    ignored. On the remaining lines, tab-separated column 0 is the gene name
    and the 4th ';'-separated field of column 1 is expected to look like
    ``key=InterPro:IPR000001,InterPro:IPR000002``.

    infile  -- path to the tab-separated annotation file
    rename  -- when true, gene names are translated through ``conHash``
    conHash -- dict of original gene name -> new name (required if rename)
    returns -- dict mapping gene name to the set of accessions seen for it
    raises  -- KeyError if ``rename`` is set and a gene name is not in
               ``conHash``; IndexError if a matching line lacks the expected
               fields
    """
    Interpro_Hash = {}
    # Context manager so the handle is closed even when a line fails to parse.
    with open(infile, 'r') as handle:
        for line in handle:
            if line.startswith('#') or 'InterPro' not in line:
                continue
            columns = line.split('\t')
            name = columns[0]
            if rename:
                name = conHash[name]
            xref_field = columns[1].split(';')[3].split('=')[1]
            # strip() keeps a trailing newline out of the accession when the
            # annotation column is the last one on the line.
            accessions = set(
                xref.split(':')[1].strip() for xref in xref_field.split(',')
            )
            Interpro_Hash.setdefault(name, set()).update(accessions)
    return Interpro_Hash

def GetPfam(infile, rename=False, conHash=None):
    """Collect the Pfam accessions listed for each gene in ``infile``.

    Lines starting with '#' and lines that do not contain 'Pfam:' are
    ignored. On the remaining lines, tab-separated column 0 is the gene name
    and the 2nd ';'-separated field of column 1 is expected to look like
    ``key=Pfam:PF00001.12,Pfam:PF00002.3``. Anything from the first '.' of an
    accession onwards is dropped.

    infile  -- path to the tab-separated annotation file
    rename  -- when true, gene names are translated through ``conHash``
    conHash -- dict of original gene name -> new name (required if rename)
    returns -- dict mapping gene name to the set of accessions seen for it
    raises  -- KeyError if ``rename`` is set and a gene name is not in
               ``conHash``; IndexError if a matching line lacks the expected
               fields
    """
    Pfam_Hash = {}
    # Context manager so the handle is closed even when a line fails to parse.
    with open(infile, 'r') as handle:
        for line in handle:
            if line.startswith('#') or 'Pfam:' not in line:
                continue
            columns = line.split('\t')
            name = columns[0]
            if rename:
                name = conHash[name]
            xref_field = columns[1].split(';')[1].split('=')[1]
            # strip() keeps a trailing newline out of an unversioned accession
            # when the annotation column is the last one on the line.
            accessions = set(
                xref.split(':')[1].split('.')[0].strip()
                for xref in xref_field.split(',')
            )
            Pfam_Hash.setdefault(name, set()).update(accessions)
    return Pfam_Hash


# Run GetPfam on a single Pfam file with the default rename=False (gene names
# are kept as they appear in the file); the result is kept in `test`.
test=GetPfam('Emi219/pfam.all')

In [4]:
#Get all the InterPro and Pfam calls, one entry per strain
import glob
Emi_InterPro_Hash={}   # strain name -> {gene id: set(InterPro accessions)}
Emi_Pfam_Hash={}       # strain name -> {gene id: set(Pfam accessions)}
NameConHash={}         # strain name -> id conversion hash from nameConversion
StrainNames=[]
# Every path matching '*pep*' in the working directory is treated as one
# strain's peptide FASTA; the loop variable avoids shadowing the builtin `file`.
for pep_path in glob.glob('*pep*'):
    # nameConversion already derives the strain name from the digits in the
    # path, so it is not recomputed here.
    id_map, StrainName = nameConversion(pep_path)
    StrainNames.append(StrainName)
    NameConHash[StrainName] = id_map
    # Annotation files are read from a directory named after the strain.
    Emi_InterPro_Hash[StrainName] = GetInterPro(StrainName+'/interpro.all', rename=True, conHash=id_map)
    Emi_Pfam_Hash[StrainName] = GetPfam(StrainName+'/pfam.all', rename=True, conHash=id_map)


In [19]:
#Convert interpro to GO
#Form : InterPro_GO_Hash[strain][gene]=set(GOterms)
InterPro_GO_Hash={}
for strain in StrainNames:
    GO_tmp={}
    for entry, accessions in Emi_InterPro_Hash[strain].items():
        for item in accessions:
            # Direct dict membership (no .keys() list scan). Accessions with
            # no GO mapping are skipped, so a gene only gets an entry once at
            # least one of its accessions maps to GO.
            if item in InterPro2Go:
                GO_tmp.setdefault(entry, set()).update(InterPro2Go[item])
    InterPro_GO_Hash[strain]=GO_tmp



In [18]:
#Convert Pfam to GO
#Form : PFam_GOHash[strain][gene]=set(GOterms)
PFam_GOHash={}
for strain in StrainNames:
    GO_tmp={}
    for entry, accessions in Emi_Pfam_Hash[strain].items():
        for item in accessions:
            # Direct dict membership (no .keys() list scan). Accessions with
            # no GO mapping are skipped, so a gene only gets an entry once at
            # least one of its accessions maps to GO.
            if item in Pfam2Go:
                GO_tmp.setdefault(entry, set()).update(Pfam2Go[item])
    PFam_GOHash[strain]=GO_tmp


    

In [97]:
#Merge the Pfam- and InterPro-derived GO sets per gene, for every strain
#Form : CombinedGO[strain][gene]=set(GOterms)
CombinedGO={}
for strain in StrainNames:
    x=PFam_GOHash[strain]
    y=InterPro_GO_Hash[strain]
    nh={}
    # One pass over both sources: genes present in only one keep that set,
    # genes present in both get the union. Fresh sets are built so the merged
    # result does not alias the per-source hashes.
    for source in (x, y):
        for ku, terms in source.items():
            nh.setdefault(ku, set()).update(terms)
    CombinedGO[strain]=nh

#Add the annotations from Emihu1_GO.tab: first tab-separated column is the
#protein id, last column holds '|'-separated GO terms
nh={}
with open('Emihu1_GO.tab') as fin:
    # First line is a header; next(..., None) also tolerates an empty file.
    header=next(fin, None)
    for line in fin:
        if not line.strip():
            continue  # blank lines carry no record
        fields=line.split('\t')
        Proteinid='Emihu1|'+fields[0]
        goAcc=fields[-1].strip().split('|')
        nh[Proteinid]=set(goTerm.strip() for goTerm in goAcc)
CombinedGO['Emihu1']=nh

#Flatten to a single gene -> set(GOterms) hash; gene ids carry a strain
#prefix, so entries from different strains do not collide
Combined_GO_Flat={}
for strain in CombinedGO:
    Combined_GO_Flat.update(CombinedGO[strain])

In [98]:
# Display the gene ids (keys) of the flattened gene -> GO-term hash.
Combined_GO_Flat.keys()

['Emi370|CAMPEP_0187679430',
 'Emi374|CAMPEP_0187594220',
 'Emi374|CAMPEP_0187575574',
 'Emi370|CAMPEP_0187679436',
 'Emi370|CAMPEP_0187679438',
 'Emi370|CAMPEP_0187712452',
 'Emihu1|448149',
 'Emi219|CAMPEP_0187746478',
 'Emi374|CAMPEP_0187590294',
 'Emi219|CAMPEP_0187746474',
 'Emi219|CAMPEP_0187746472',
 'Emihu1|434804',
 'Emi219|CAMPEP_0187746470',
 'Emihu1|359990',
 'Emi370|CAMPEP_0187660766',
 'Emihu1|207698',
 'Emihu1|212525',
 'Emi379|CAMPEP_0187618368',
 'Emihu1|212520',
 'Emi379|CAMPEP_0187613598',
 'Emi370|CAMPEP_0187667384',
 'Emi379|CAMPEP_0187618364',
 'Emi379|CAMPEP_0187655126',
 'Emi379|CAMPEP_0187613594',
 'Emi370|CAMPEP_0187667380',
 'Emi370|CAMPEP_0187668574',
 'Emi379|CAMPEP_0187618362',
 'Emi374|CAMPEP_0187580130',
 'Emihu1|372394',
 'Emi370|CAMPEP_0187698986',
 'Emi370|CAMPEP_0187699888',
 'Emihu1|465093',
 'Emihu1|44883',
 'Emi379|CAMPEP_0187654294',
 'Emi370|CAMPEP_0187666030',
 'Emihu1|237636',
 'Emi370|CAMPEP_0187704900',
 'Emi219|CAMPEP_0187739668',
 'Emi219|

In [60]:
# Load the pickled dictionary (keys look like 'OG1_5_...' and values are lists
# of strain-prefixed gene ids, per the dump of OG_Hash further down).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# Binary mode is what pickle expects, and the context manager closes the handle.
with open("../../Strain_specific_analysis/Ehux_Dictionary.pickle", 'rb') as pickle_handle:
    OG_Hash=cpk.load(pickle_handle)


In [66]:
# Invert OG_Hash: map every member gene id back to the key whose list contains
# it. If a gene id appears under several keys, the last one iterated wins.
OG_Hash_inv = {
    member: og_id
    for og_id, members in OG_Hash.iteritems()
    for member in members
}



In [100]:
#Collect GO terms per group
#Form : OG_GO_Hash[OG]=set(GOterms)
OG_GO_Hash={}
for gene in Combined_GO_Flat:
    # Genes that do not belong to any group in OG_Hash_inv are skipped.
    if gene in OG_Hash_inv:
        OG=OG_Hash_inv[gene]
        # Accumulate the union of the GO terms of every member gene; the
        # previous loop body was a bare lookup, so the hash stayed empty.
        OG_GO_Hash.setdefault(OG, set()).update(Combined_GO_Flat[gene])

{'GO:0005525', 'GO:0006886', 'GO:0007264', 'GO:0015031'}

In [73]:
# Display the dictionary loaded from Ehux_Dictionary.pickle.
OG_Hash

{'OG1_5_49213': ['Emi370|CAMPEP_0187675634'],
 'OG1_5_49212': ['Emi370|CAMPEP_0187720172'],
 'OG1_5_49211': ['Emi370|CAMPEP_0187675508'],
 'OG1_5_49210': ['Emi370|CAMPEP_0187719864'],
 'OG1_5_49217': ['Emi370|CAMPEP_0187720392'],
 'OG1_5_49216': ['Emi370|CAMPEP_0187720182'],
 'OG1_5_49215': ['Emi370|CAMPEP_0187720176'],
 'OG1_5_49214': ['Emi370|CAMPEP_0187675636'],
 'OG1_5_49219': ['Emi370|CAMPEP_0187676050'],
 'OG1_5_49218': ['Emi370|CAMPEP_0187720396'],
 'OG1_5_14394': ['Emi374|CAMPEP_0187605666',
  'Emi374|CAMPEP_0187592670',
  'Emihu1|198567'],
 'OG1_5_14395': ['Emi374|CAMPEP_0187593666',
  'Emi374|CAMPEP_0187607050',
  'Emi379|CAMPEP_0187633284'],
 'OG1_5_33209': ['Emi219|CAMPEP_0187774498'],
 'OG1_5_33208': ['Emi219|CAMPEP_0187774484'],
 'OG1_5_5328': ['Emi379|CAMPEP_0187634172',
  'Emi370|CAMPEP_0187733854',
  'Emi219|CAMPEP_0187758270',
  'Emihu1|117979',
  'Emihu1|107792'],
 'OG1_5_5329': ['Emi379|CAMPEP_0187627410',
  'Emi370|CAMPEP_0187674462',
  'Emi219|CAMPEP_0187765428',
