##GOAL: Get the tentative GO annotations for the Ehux transcriptomes from the Pfam and InterPro data, a la JGI

In [22]:
import pandas as pd
import matplotlib.pyplot as mpl 
from Bio import SeqIO
import cPickle as cpk
%matplotlib inline

In [3]:
##Load in the InterPro -> GO lookup table

#Form : InterPro2Go[InterPro accession]=[GO terms]
# Each data line is parsed as "<db>:<accession> <free text> ; <GO id>":
#   * the accession is the part after ":" in the first space-delimited token
#   * the GO id is the second ";"-delimited field
# Lines starting with "!" are skipped, as are blank lines
# (a blank line would otherwise raise IndexError).
InterPro2Go={}
with open("Interpro2Go", 'r') as f:
    for line in f:
        if line.startswith("!") or not line.strip():
            continue
        InterPro=line.split(" ")[0].split(":")[1].strip()
        GO=line.split(';')[1].strip()
        # setdefault avoids the list-scanning `in dict.keys()` test of Python 2
        InterPro2Go.setdefault(InterPro, []).append(GO)
    
            
##Load in the Pfam -> GO lookup table

#Form : Pfam2Go[Pfam accession]=[GO terms]
# Each data line is parsed as "<db>:<accession> <free text> ; <GO id>":
#   * the accession is the part after ":" in the first space-delimited token
#   * the GO id is the second ";"-delimited field
# Lines starting with "!" are skipped, as are blank lines
# (a blank line would otherwise raise IndexError).
Pfam2Go={}
with open("Pfam2Go", 'r') as f:
    for line in f:
        if line.startswith("!") or not line.strip():
            continue
        Pfam=line.split(" ")[0].split(":")[1].strip()
        GO=line.split(';')[1].strip()
        # setdefault avoids the list-scanning `in dict.keys()` test of Python 2
        Pfam2Go.setdefault(Pfam, []).append(GO)
            

In [4]:
#Emi219|CAMPEP_0187750186
def nameConversion(inPepFile):
    """Build a lookup from the ID embedded in each FASTA description to a
    strain-prefixed record ID.

    The strain name is 'Emi' followed by every digit character found in
    ``inPepFile`` (the whole string is scanned, so digits in directory
    names would be included too).

    For each FASTA record the lookup key is taken from the description:
    the second '/'-delimited field is split on '=' and the value after the
    first '=' is used.  The lookup value is '<StrainName>|<record id>',
    e.g. 'Emi219|CAMPEP_0187750186'.

    Parameters:
        inPepFile: path to a peptide FASTA file.

    Returns:
        (NCGRIDConv, StrainName): the lookup dict and the derived strain name.

    Raises:
        IndexError: if a record description lacks the expected '/' or '='
            separators.
    """
    strainNum=[s for s in inPepFile if s.isdigit()]
    StrainName='Emi'+''.join(strainNum)
    NCGRIDConv={}
    # Context manager guarantees the FASTA handle is closed after parsing.
    with open(inPepFile, 'r') as f:
        for item in SeqIO.parse(f, 'fasta'):
            desc=item.description.strip()
            NCGRID=desc.split('/')[1].split('=')[1].strip()
            NCGRIDConv[NCGRID]=StrainName+'|'+item.id
    return NCGRIDConv, StrainName

def GetInterPro(infile, rename=False, conHash=None):
    """Collect the InterPro accessions attached to each gene in ``infile``.

    Lines starting with '#' and lines that do not contain the text
    'InterPro' are ignored.  For every remaining (tab-delimited) line:
      * column 1 is the gene ID;
      * column 2 is split on ';' and its 4th field is read as
        '<key>=<db>:<id>,<db>:<id>,...'; the part after ':' of every
        comma-separated entry is kept as an accession.

    Parameters:
        infile: path to the tab-delimited annotation file.
        rename: when True, gene IDs are translated through ``conHash``.
        conHash: dict mapping the file's gene IDs to the names to report;
            required when ``rename`` is True.

    Returns:
        dict mapping gene ID -> set of accessions (merged over all lines
        that mention the same gene).

    Raises:
        KeyError: ``rename`` is True and a gene ID is missing from ``conHash``.
        IndexError: a matching line does not have the expected layout.
    """
    Interpro_Hash={}
    with open(infile, 'r') as f:
        for line in f:
            if line.startswith('#') or 'InterPro' not in line:
                continue
            fields=line.split('\t')
            name=fields[0]
            if rename:
                name=conHash[name]
            tag_field=fields[1].split(';')[3].split('=')[1]
            # strip() drops the newline that clings to the last accession
            # when column 2 is the final column of the line.
            accessions=set(tag.split(':')[1].strip()
                           for tag in tag_field.split(','))
            Interpro_Hash.setdefault(name, set()).update(accessions)
    return Interpro_Hash

def GetPfam(infile, rename=False, conHash=None):
    """Collect the Pfam accessions attached to each gene in ``infile``.

    Lines starting with '#' and lines that do not contain the text 'Pfam:'
    are ignored.  For every remaining (tab-delimited) line:
      * column 1 is the gene ID;
      * column 2 is split on ';' and its 2nd field is read as
        '<key>=<db>:<id>,<db>:<id>,...'; for every comma-separated entry
        the part after ':' is kept, truncated at the first '.' (which
        removes a version suffix such as '.12').

    Parameters:
        infile: path to the tab-delimited annotation file.
        rename: when True, gene IDs are translated through ``conHash``.
        conHash: dict mapping the file's gene IDs to the names to report;
            required when ``rename`` is True.

    Returns:
        dict mapping gene ID -> set of accessions (merged over all lines
        that mention the same gene).

    Raises:
        KeyError: ``rename`` is True and a gene ID is missing from ``conHash``.
        IndexError: a matching line does not have the expected layout.
    """
    Pfam_Hash={}
    with open(infile, 'r') as f:
        for line in f:
            if line.startswith('#') or 'Pfam:' not in line:
                continue
            fields=line.split('\t')
            name=fields[0]
            if rename:
                name=conHash[name]
            tag_field=fields[1].split(';')[1].split('=')[1]
            # strip() drops the newline that clings to an unversioned
            # accession when column 2 is the final column of the line.
            accessions=set(tag.split(':')[1].split('.')[0].strip()
                           for tag in tag_field.split(','))
            Pfam_Hash.setdefault(name, set()).update(accessions)
    return Pfam_Hash


# Ad-hoc check: parse one strain's Pfam file, keeping the gene IDs as they
# appear in the file (no renaming).
test=GetPfam('Emi219/pfam.all')

In [5]:
#Get all the InterPro and Pfam calls for every strain that has a *pep* file
# Results, all keyed by strain name:
#   NameConHash[strain]       = ID -> 'strain|record id' lookup from nameConversion
#   Emi_InterPro_Hash[strain] = gene -> set(InterPro accessions)
#   Emi_Pfam_Hash[strain]     = gene -> set(Pfam accessions)
import glob
Emi_InterPro_Hash={}
Emi_Pfam_Hash={}
NameConHash={}
StrainNames=[]
# sorted() makes the strain order reproducible; the loop variable no longer
# shadows the Python 2 builtin `file`.
for pep_path in sorted(glob.glob('*pep*')):
    # nameConversion already derives the strain name from the file name,
    # so it is not recomputed here.
    conversion, StrainName=nameConversion(pep_path)
    StrainNames.append(StrainName)
    NameConHash[StrainName]=conversion
    Emi_InterPro_Hash[StrainName]=GetInterPro(StrainName+'/interpro.all', rename=True, conHash=conversion)
    Emi_Pfam_Hash[StrainName]=GetPfam(StrainName+'/pfam.all', rename=True, conHash=conversion)


In [6]:
#Convert InterPro accessions to GO terms
# InterPro_GO_Hash[strain][gene] = set of GO terms reachable from any of the
# gene's InterPro accessions.  Genes whose accessions have no entry in
# InterPro2Go are left out entirely.
InterPro_GO_Hash={}
for strain in StrainNames:
    GO_tmp={}
    for entry, accessions in Emi_InterPro_Hash[strain].items():
        for item in accessions:
            # dict.get is a hash lookup; `item in d.keys()` scans a list in Python 2
            terms=InterPro2Go.get(item)
            if terms is not None:
                GO_tmp.setdefault(entry, set()).update(terms)
    InterPro_GO_Hash[strain]=GO_tmp



In [7]:
#Convert Pfam accessions to GO terms
# PFam_GOHash[strain][gene] = set of GO terms reachable from any of the
# gene's Pfam accessions.  Genes whose accessions have no entry in Pfam2Go
# are left out entirely.
PFam_GOHash={}
for strain in StrainNames:
    GO_tmp={}
    for entry, accessions in Emi_Pfam_Hash[strain].items():
        for item in accessions:
            # dict.get is a hash lookup; `item in d.keys()` scans a list in Python 2
            terms=Pfam2Go.get(item)
            if terms is not None:
                GO_tmp.setdefault(entry, set()).update(terms)
    PFam_GOHash[strain]=GO_tmp


    

In [8]:
# Merge the Pfam-derived and InterPro-derived GO terms per strain:
# CombinedGO[strain][gene] = union of both sources (or whichever one exists).
CombinedGO={}
nh={}
for strain in StrainNames:
    pfam_terms=PFam_GOHash[strain]
    interpro_terms=InterPro_GO_Hash[strain]
    # Start from the InterPro-derived entries, then fold the Pfam ones in.
    nh=dict(interpro_terms)
    for gene, terms in pfam_terms.items():
        if gene in nh:
            nh[gene]=terms.union(nh[gene])
        else:
            nh[gene]=terms
    CombinedGO[strain]=nh
    
# Add the GO terms from Emihu1_GO.tab under the 'Emihu1' key.
# Per line: the first tab-delimited column is the protein ID (stored as
# 'Emihu1|<id>'), the last column is a '|'-separated list of GO terms.
nh={}
with open('Emihu1_GO.tab') as fin:
    header=next(fin)  # first line is a header row, not data
    for line in fin:
        if not line.strip():
            continue  # tolerate blank lines (e.g. at the end of the file)
        line=line.split('\t')
        Proteinid='Emihu1|'+line[0]
        # Drop empty fragments so an empty GO column does not yield the term ''
        goAcc=set(goTerm.strip() for goTerm in line[-1].split('|') if goTerm.strip())
        if goAcc:
            nh[Proteinid]=goAcc
CombinedGO['Emihu1']=nh

# Flatten the per-strain dicts into one gene -> set(GO terms) dict.
Combined_GO_Flat={}
for strain in CombinedGO:
    Combined_GO_Flat.update(CombinedGO[strain])

In [9]:
# Display the keys of the flattened gene -> GO-term dict (notebook output follows).
Combined_GO_Flat.keys()

['Emi370|CAMPEP_0187679430',
 'Emi374|CAMPEP_0187594220',
 'Emi374|CAMPEP_0187575574',
 'Emi370|CAMPEP_0187679436',
 'Emi370|CAMPEP_0187679438',
 'Emi370|CAMPEP_0187712452',
 'Emihu1|448149',
 'Emi219|CAMPEP_0187746478',
 'Emi374|CAMPEP_0187590294',
 'Emi219|CAMPEP_0187746474',
 'Emi219|CAMPEP_0187746472',
 'Emihu1|434804',
 'Emi219|CAMPEP_0187746470',
 'Emihu1|359990',
 'Emi370|CAMPEP_0187660766',
 'Emihu1|207698',
 'Emihu1|212525',
 'Emi379|CAMPEP_0187618368',
 'Emihu1|212520',
 'Emi379|CAMPEP_0187613598',
 'Emi370|CAMPEP_0187667384',
 'Emi379|CAMPEP_0187618364',
 'Emi379|CAMPEP_0187655126',
 'Emi379|CAMPEP_0187613594',
 'Emi370|CAMPEP_0187667380',
 'Emi370|CAMPEP_0187668574',
 'Emi379|CAMPEP_0187618362',
 'Emi374|CAMPEP_0187580130',
 'Emihu1|372394',
 'Emi370|CAMPEP_0187698986',
 'Emi370|CAMPEP_0187699888',
 'Emihu1|465093',
 'Emihu1|44883',
 'Emi379|CAMPEP_0187654294',
 'Emi370|CAMPEP_0187666030',
 'Emihu1|237636',
 'Emi370|CAMPEP_0187704900',
 'Emi219|CAMPEP_0187739668',
 'Emi219|

In [23]:
# Load the pickled dict whose keys are OG identifiers and whose values are
# iterables of gene IDs (that is how the later cells use it).
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a
# trusted source.
# Binary mode is the portable way to read pickles; the handle is closed on exit.
with open("../../Strain_specific_analysis/Ehux_Dictionary.pickle", 'rb') as pickle_fh:
    OG_Hash=cpk.load(pickle_fh)


In [15]:
# Invert OG_Hash: member -> key.  A member listed under several keys ends up
# with the last key encountered during iteration.
OG_Hash_inv=dict(
    (member, og_id)
    for og_id, members in OG_Hash.iteritems()
    for member in members
)



In [18]:
##Create a hash mapping each OG to the union of the GO terms of its genes
# OGs for which none of the genes has an entry in Combined_GO_Flat are skipped.

OG_GO_Hash={}
for OG in OG_Hash.keys():
    go_sets=[]
    for gene in OG_Hash[OG]:
        terms=Combined_GO_Flat.get(gene)
        if terms is not None:
            go_sets.append(terms)
    if go_sets:
        OG_GO_Hash[OG]=set().union(*go_sets)


In [21]:
#Write out a .annot file to use in the GOSeq package
# One line per (OG, GO term) pair: "<OG>\t<GO term>\n".
outfile = "Ehux_OG_GO.annot"
# The context manager closes (and therefore flushes) the file; without it the
# tail of the output can stay in the buffer and never reach the disk.
with open(outfile, 'w') as f:
    for OG in OG_GO_Hash:
        for GO in OG_GO_Hash[OG]:
            f.write(OG + '\t' + GO + '\n')

In [69]:
# Invert OG_GO_Hash: GO term -> list of the OGs that carry that term.
GO_Hash_inv={}
for og_id, go_terms in OG_GO_Hash.iteritems():
    for term in go_terms:
        GO_Hash_inv.setdefault(term, []).append(og_id)

GO_Hash_inv

{'GO:0046439': ['OG1_5_27082', 'OG1_5_10033'],
 'GO:0042410': ['OG1_5_47285'],
 'GO:0051188': ['OG1_5_9639',
  'OG1_5_7511',
  'OG1_5_56121',
  'OG1_5_5818',
  'OG1_5_8576',
  'OG1_5_1388'],
 'GO:0004141': ['OG1_5_3696', 'OG1_5_10895'],
 'GO:0004639': ['OG1_5_8685'],
 'GO:0043828': ['OG1_5_6162', 'OG1_5_5116'],
 'GO:0005838': ['OG1_5_2699'],
 'GO:0005839': ['OG1_5_4300',
  'OG1_5_5825',
  'OG1_5_7229',
  'OG1_5_41741',
  'OG1_5_36924',
  'OG1_5_1642',
  'OG1_5_10740',
  'OG1_5_7979',
  'OG1_5_1796',
  'OG1_5_6171',
  'OG1_5_6980',
  'OG1_5_2130',
  'OG1_5_27503',
  'OG1_5_1775',
  'OG1_5_7972',
  'OG1_5_2258'],
 'GO:0004638': ['OG1_5_7777', 'OG1_5_1089', 'OG1_5_34986', 'OG1_5_53323'],
 'GO:0005834': ['OG1_5_11410', 'OG1_5_37176', 'OG1_5_36367'],
 'GO:0005835': ['OG1_5_7127'],
 'GO:0004814': ['OG1_5_10352', 'OG1_5_7950', 'OG1_5_50584'],
 'GO:0030655': ['OG1_5_18238'],
 'GO:0016512': ['OG1_5_2495', 'OG1_5_37134'],
 'GO:0046509': ['OG1_5_5186', 'OG1_5_2446'],
 'GO:0016215': ['OG1_5_3510',

In [77]:
#Write out a .gmt file to use in the Enrichment Map package
# One line per GO term: "<GO term>\ttext\t<OG> <OG> ...\n"
# NOTE(review): the GMT format is normally tab-delimited throughout, but the
# OG members are joined with single spaces here -- confirm that the consumer
# accepts this before relying on the file.
outfile = "Ehux_OG_GO.enrichment.gmt"
# The context manager closes (and therefore flushes) the file; without it the
# tail of the output can stay in the buffer and never reach the disk.
with open(outfile, 'w') as f:
    for GO in GO_Hash_inv:
        Os=' '.join(GO_Hash_inv[GO])
        f.write(GO + '\t' + 'text' + '\t' + Os + '\n')

{'GO:0046439': ['OG1_5_27082', 'OG1_5_10033'],
 'GO:0042410': ['OG1_5_47285'],
 'GO:0051188': ['OG1_5_9639',
  'OG1_5_7511',
  'OG1_5_56121',
  'OG1_5_5818',
  'OG1_5_8576',
  'OG1_5_1388'],
 'GO:0004141': ['OG1_5_3696', 'OG1_5_10895'],
 'GO:0004639': ['OG1_5_8685'],
 'GO:0043828': ['OG1_5_6162', 'OG1_5_5116'],
 'GO:0005838': ['OG1_5_2699'],
 'GO:0005839': ['OG1_5_4300',
  'OG1_5_5825',
  'OG1_5_7229',
  'OG1_5_41741',
  'OG1_5_36924',
  'OG1_5_1642',
  'OG1_5_10740',
  'OG1_5_7979',
  'OG1_5_1796',
  'OG1_5_6171',
  'OG1_5_6980',
  'OG1_5_2130',
  'OG1_5_27503',
  'OG1_5_1775',
  'OG1_5_7972',
  'OG1_5_2258'],
 'GO:0004638': ['OG1_5_7777', 'OG1_5_1089', 'OG1_5_34986', 'OG1_5_53323'],
 'GO:0005834': ['OG1_5_11410', 'OG1_5_37176', 'OG1_5_36367'],
 'GO:0005835': ['OG1_5_7127'],
 'GO:0004814': ['OG1_5_10352', 'OG1_5_7950', 'OG1_5_50584'],
 'GO:0030655': ['OG1_5_18238'],
 'GO:0016512': ['OG1_5_2495', 'OG1_5_37134'],
 'GO:0046509': ['OG1_5_5186', 'OG1_5_2446'],
 'GO:0016215': ['OG1_5_3510',

Get the lengths of the OG families for the enrichment analysis

In [36]:
from Bio import SeqIO
import numpy as np

In [30]:
# Seq_lenghtHash[record id] = sequence length, for every record in the FASTA.
Seq_lenghtHash={}
# The context manager closes the FASTA handle once parsing is done.
# NOTE: the 'rU' mode is kept for this Python 2 notebook; use 'r' when porting
# to Python 3, where 'U' is deprecated and eventually removed.
with open("../../Ehux_All_Transcripts_Cleaned_RSEM.nt.fa", 'rU') as fasta_fh:
    for record in SeqIO.parse(fasta_fh, 'fasta'):
        Seq_lenghtHash[record.id] = len(record.seq)

In [32]:
# Read one identifier per line; surrounding whitespace is removed.
# Blank lines are skipped so they cannot become '' entries (which would
# raise KeyError when looked up in OG_Hash later on).
with open("../../TranscriptomeAnnotation/EnrichmentAnalysis/All_Genes_Measured.tab") as genes_fh:
    GenesForEnrichment = [line.strip() for line in genes_fh if line.strip()]

In [54]:
# lenHash[OG] = mean sequence length of the OG's genes, rounded (numpy
# rounding) and stored as an int.
lenHash={}
for OG in GenesForEnrichment:
    member_lengths=np.asarray([Seq_lenghtHash[gene] for gene in OG_Hash[OG]])
    lenHash[OG] = int(member_lengths.mean().round())
    

In [60]:
# Write two parallel files: line k of All_Gene_List.tab holds an OG and
# line k of All_Gene_Length.tab holds that OG's length from lenHash.
with open("All_Gene_List.tab", 'w') as GF:
    with open("All_Gene_Length.tab", 'w') as LF:
        for OG, length in lenHash.items():
            GF.write(OG + '\n')
            LF.write(str(length) + '\n')