In [1]:
import time 
import scipy.stats as stats
import numpy as np 
from multiprocessing import Pool
import json
from operator import itemgetter

In [42]:
# Old way of getting the data, kept for reference. The mappings are now loaded from text files (genes2Ontology.txt and go2gene.txt).
# from goatools.base import download_ncbi_associations # Only needed to download: 'gene2go'
# from goatools.anno.genetogo_reader import Gene2GoReader
# fin_gene2go = download_ncbi_associations() # Downloading associations file: 'gene2go'
# fin_gene2go = 'gene2go'
# objanno = Gene2GoReader(fin_gene2go, taxids=[9606]) # Reading human associations
# ns2assoc = objanno.get_ns2assc() # Sorted by NS 
# assoc = objanno.get_id2gos_nss() # Not sorted by NS 

In [15]:
from obonet import read_obo

# GO term -> genes mapping, stored as JSON in a text file.
with open('go2gene.txt', 'r') as file:
    go2gene = json.load(file)

# Gene -> GO terms mapping, stored the same way.
with open('genes2Ontology.txt', 'r') as file:
    genes2go = json.load(file)
# JSON object keys are always strings, so turn the gene keys back into ints.
genes2go = dict((int(gene), terms) for gene, terms in genes2go.items())

# Parse the ontology file with obonet's read_obo.
goinfo = read_obo('go-basic.obo')

In [3]:
def getOntologyID(geneIDL):
    """Collect the distinct GO terms associated with the genes in ``geneIDL``.

    Genes that are not keys of the module-level ``genes2go`` mapping are
    skipped. The returned list contains no duplicates; its order comes from
    iterating a set and should not be relied upon.
    """
    collected = [
        term
        for gene in geneIDL
        if gene in genes2go
        for term in genes2go[gene]
    ]
    return list(set(collected))

In [4]:
def common_member(a, b):
    """Return the elements that occur in both ``a`` and ``b`` as a list.

    Both inputs are converted to sets, so duplicates collapse and the order of
    the result is not meaningful. An empty list is returned when nothing is
    shared. Note that the shared elements themselves are returned, not their
    count; take ``len()`` of the result when a count is needed.
    """
    return list(set(a) & set(b))

In [5]:
def listCommonMember(a, b):
    """Return a list of the elements shared by ``a`` and ``b``.

    The inputs are compared as sets, so each shared element appears once and
    the order of the result is not meaningful. Returns an empty list when the
    two inputs have nothing in common.
    """
    overlap = set(a) & set(b)
    if not overlap:
        return []
    return list(overlap)

In [6]:
def getKeysByValue(dictOfElements, valueToFind):
    """Return the keys whose associated collection contains ``valueToFind``.

    Used to find the genes that carry a given GO term. No longer needed: it is
    slow because every entry of the dict is scanned and each value is copied
    into a list before the membership check.
    """
    return [
        key
        for key, members in dictOfElements.items()
        if valueToFind in list(members)
    ]

In [7]:
def getEntrezDomain(frame):
    """Build a dict mapping each entrez ID to its domain length.

    ``frame`` is an iterable of tables exposing ``entrezid`` and ``domainLEN``
    columns through ``.values`` (presumably pandas DataFrames -- confirm
    against the caller). The columns of all tables are concatenated and paired
    up position by position; when an ID occurs more than once, the value seen
    last wins.
    """
    all_ids = []
    all_lengths = []
    for table in frame:
        all_ids.extend(table.entrezid.values)
        all_lengths.extend(table.domainLEN.values)
    return dict(zip(all_ids, all_lengths))

In [8]:
def SMLcounters(GOlist, mappedList, SMLcutoff, entrezDomains):
    """Count the mapped genes of every GO term, split into small/medium/large.

    Args:
        GOlist: iterable of GO terms; each must be a key of the module-level
            ``go2gene`` mapping.
        mappedList: collection of mapped gene IDs.
        SMLcutoff: ``(minimum, maximum)`` pair. A domain value ``<= minimum``
            is small, ``> maximum`` is large, anything else is medium.
        entrezDomains: mapping of gene ID -> domain value (presumably the
            domain lengths produced by getEntrezDomain -- confirm).

    Returns:
        dict mapping each GO term to ``[s, m, l]``. Each entry is the number
        of mapped genes of that size class associated with the term, or the
        string ``'NA'`` when none of the term's genes with a known domain
        value falls into that class.

    Raises:
        KeyError: if a GO term is missing from ``go2gene``, or if a mapped
            gene associated with a term is missing from ``entrezDomains``.
    """
    minimum = SMLcutoff[0]
    maximum = SMLcutoff[1]
    go2counter = {}
    for go in GOlist:
        annotated = go2gene[go]

        # Domain values of all genes associated with the term. Genes without a
        # known domain value are skipped; only the missing-key case is
        # tolerated so that genuine errors are no longer silently swallowed.
        domains = []
        for gene in annotated:
            try:
                domains.append(entrezDomains[gene])
            except KeyError:
                continue

        # Does the term have any gene at all in each size class?
        has_small = any(d <= minimum for d in domains)
        has_medium = any(d > minimum and d <= maximum for d in domains)
        has_large = any(d > maximum for d in domains)

        # Classify the mapped genes that are associated with the term.
        s = 0
        m = 0
        l = 0
        for entrezid in listCommonMember(annotated, mappedList):
            domain = entrezDomains[entrezid]
            if domain <= minimum:
                s += 1
            elif domain > maximum:
                l += 1
            else:
                m += 1

        # A class the term cannot populate is reported as 'NA' instead of 0.
        go2counter[go] = [
            s if has_small else 'NA',
            m if has_medium else 'NA',
            l if has_large else 'NA',
        ]
    return go2counter

In [9]:
def SMLcounting(inputs):
    """Worker for SMLcounterFAST: count mapped genes per GO term by size class.

    Args:
        inputs: sequence ``[GOlist, mappedList, SMLcutoff, entrezDomains]``,
            packed into a single argument so the function can be handed to
            ``Pool.map``. ``SMLcutoff`` is a ``(minimum, maximum)`` pair.

    Returns:
        dict mapping each GO term in ``GOlist`` to ``[s, m, l]``: the number
        of mapped genes associated with the term whose domain value is small
        (``<= minimum``), medium, or large (``> maximum``). Unlike
        SMLcounters, the counts are always integers (never ``'NA'``).

    Raises:
        KeyError: if a GO term is missing from the module-level ``go2gene``
            mapping, or a shared gene is missing from ``entrezDomains``.
    """
    GOlist = inputs[0]
    mappedList = inputs[1]
    SMLcutoff = inputs[2]
    entrezDomains = inputs[3]
    minimum = SMLcutoff[0]
    maximum = SMLcutoff[1]
    go2counter = {}

    for go in GOlist:
        s = 0
        m = 0
        l = 0
        for entrezid in listCommonMember(go2gene[go], mappedList):
            domain = entrezDomains[entrezid]
            # A value exactly on the lower cutoff is small, the same boundary
            # SMLcounters uses; previously this branch used a strict '<' and
            # the two code paths disagreed on such genes.
            if domain <= minimum:
                s += 1
            elif domain > maximum:
                l += 1
            else:
                m += 1
        go2counter[go] = [s, m, l]
    return go2counter

In [10]:
def SMLcounterFAST(GOlist, mappedList, SMLcutoff, entrezDomains):
    """Compute the S/M/L counters of SMLcounting using multiprocessing.

    ``GOlist`` is cut into five consecutive slices (the last slice takes the
    remainder), each slice is processed by SMLcounting in a worker process,
    and the per-slice dicts are merged.

    Args:
        GOlist: sliceable sequence of GO terms.
        mappedList: collection of mapped gene IDs.
        SMLcutoff: ``(minimum, maximum)`` pair forwarded to SMLcounting.
        entrezDomains: mapping of gene ID -> domain value, forwarded as is.

    Returns:
        dict mapping every GO term in ``GOlist`` to its ``[s, m, l]`` counts.

    NOTE(review): with the 'spawn' start method, worker functions defined in
    a notebook cell may not be importable by the child processes -- confirm
    on the target platform.
    """
    # Nothing to do: avoid starting worker processes for an empty input.
    if len(GOlist) == 0:
        return {}

    n_workers = 5
    x = len(GOlist) // n_workers
    chunks = [GOlist[i * x:(i + 1) * x] for i in range(n_workers - 1)]
    chunks.append(GOlist[(n_workers - 1) * x:])  # last slice takes the remainder
    tasks = [[chunk, mappedList, SMLcutoff, entrezDomains] for chunk in chunks]

    # The context manager shuts the pool down even when map() raises; map()
    # has collected every result before the block is left.
    with Pool(n_workers) as pool:
        result = pool.map(SMLcounting, tasks)

    counters = {}
    for res in result:
        counters.update(res)
    return counters
    

In [11]:
def conductAnalysisFIRST(mappedGeneL, SMLclassification, entrezDomains):
    """Run Fisher's exact test for every GO term of the mapped genes.

    For each GO term associated with at least one mapped gene, a 2x2
    contingency table is built and passed to ``scipy.stats.fisher_exact``:

                        in GO term    not in GO term
        mapped              A               B
        not mapped          C               D

    Args:
        mappedGeneL: list of mapped gene IDs.
        SMLclassification: ``(minimum, maximum)`` cutoff pair forwarded to
            SMLcounters.
        entrezDomains: mapping of gene ID -> domain value, forwarded to
            SMLcounters.

    Returns:
        dict mapping each GO term to
        ``[oddsratio, pvalue, s, m, l, 0, 0, 0]`` where ``s, m, l`` are the
        counters from SMLcounters and the three trailing zeros are
        placeholders (presumably small/medium/large tallies filled in by a
        later step -- confirm against the caller).
    """
    geneNum = 21294  # Total number of genes in the DF, duplicates removed
    # NOTE(review): this counts duplicates in mappedGeneL while A below does
    # not -- confirm that mappedGeneL is already de-duplicated.
    mappedGeneNum = len(mappedGeneL)
    OntologyL = getOntologyID(mappedGeneL)
    mappedGeneSet = set(mappedGeneL)  # built once instead of once per GO term
    goTermAnalysis = {}

    for go in OntologyL:
        genesAssocGO = go2gene[go]
        # A must be a count. The previous code assigned the list returned by
        # common_member, so the subtractions below raised TypeError.
        A = len(mappedGeneSet.intersection(genesAssocGO))  # mapped genes in the GO term
        B = mappedGeneNum - A  # mapped genes not in the GO term
        C = len(set(genesAssocGO)) - A  # GO-term genes that are not mapped
        D = geneNum - (A + B + C)  # all remaining genes
        oddsratio, pvalue = stats.fisher_exact([[A, B], [C, D]])
        goTermAnalysis[go] = [oddsratio, pvalue]

    counters = SMLcounters(OntologyL, mappedGeneL, SMLclassification, entrezDomains)
    for go in goTermAnalysis:
        goTermAnalysis[go].extend(counters[go])
        goTermAnalysis[go].extend([0, 0, 0])

    return goTermAnalysis

In [12]:
def analysis(inputs):
    """Placeholder worker: map every GO term in ``inputs[0]`` to ``[0, 0, 0]``.

    Not used at the moment, and neither is simConductAnalysisFast below, which
    is the caller visible in this notebook. ``inputs`` is a sequence
    ``[goTerms, mappedGeneNum, mappedGeneL, SML, entrezDomains]``; only the GO
    terms are read so far. The three zeros stand for small, medium and large.
    Every term gets its own fresh list.
    """
    goTerms, _mappedGeneNum, _mappedGeneL, _SML, _entrezDomains = (
        inputs[0], inputs[1], inputs[2], inputs[3], inputs[4]
    )
    return {go: [0, 0, 0] for go in goTerms}

In [13]:
def simConductAnalysisFast(geneL, OntologyL, SML, entrezDomains):
    """Run the ``analysis`` worker over ``OntologyL`` using multiprocessing.

    ``OntologyL`` is cut into five consecutive slices (the last slice takes
    the remainder), each slice is handed to ``analysis`` in a worker process,
    and the per-slice dicts are merged. As ``analysis`` is currently a
    placeholder, every GO term maps to ``[0, 0, 0]``; no odds ratio or
    p-value is computed here yet.

    Args:
        geneL: list of gene IDs; its length and the list itself are forwarded
            to the worker.
        OntologyL: sliceable sequence of GO terms.
        SML: value forwarded to the worker unchanged (presumably the
            ``(minimum, maximum)`` cutoff pair -- confirm).
        entrezDomains: mapping forwarded to the worker unchanged.

    Returns:
        dict mapping every GO term in ``OntologyL`` to the worker's result.
    """
    # Nothing to do: avoid starting worker processes for an empty input.
    if len(OntologyL) == 0:
        return {}

    mappedGeneNum = len(geneL)
    n_workers = 5
    x = len(OntologyL) // n_workers
    chunks = [OntologyL[i * x:(i + 1) * x] for i in range(n_workers - 1)]
    chunks.append(OntologyL[(n_workers - 1) * x:])  # last slice takes the remainder
    tasks = [[chunk, mappedGeneNum, geneL, SML, entrezDomains] for chunk in chunks]

    # The context manager shuts the pool down even when map() raises; map()
    # has collected every result before the block is left.
    with Pool(n_workers) as pool:
        result = pool.map(analysis, tasks)

    goTermAnalysis = {}
    for res in result:
        goTermAnalysis.update(res)

    return goTermAnalysis
