In [126]:
import pandas as pd
import cobra
from Bio import Entrez
import sys
from copy import deepcopy

# Read RNAseq

In [25]:
df_rnaseq = pd.read_excel("3_deseq2_ccr2dep_vs_wt_padj0.05.res.xlsx", index_col=0)
all_mouse_genes = list(set(df_rnaseq.index))
print("found %d genes."%(len(all_mouse_genes))) # literature indicates that mouse has around 30,000 genes, why?
df_rnaseq.head()

found 53656 genes.


Unnamed: 0,ccr2_depleted,wildtype,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Stat1,284.299639,4238.106145,1818.910529,-3.898472,0.058922,-66.163332,0.0,0.0
Cxcr2,8241.748567,18107.807382,12531.561504,-1.135729,0.020185,-56.266474,0.0,0.0
Btg2,6912.9406,15421.908871,12159.637873,-1.157757,0.02347,-49.328711,0.0,0.0
Ptprc,5591.037382,10755.033116,10484.034291,-0.944082,0.022199,-42.527642,0.0,0.0
Rgs2,9578.2677,18723.366669,17714.06544,-0.966969,0.021236,-45.533714,0.0,0.0


# Test function

In [92]:
id_list = ['11754']

Entrez.email = "*****@gmail.com"

def retrieve_annotation(id_list):

    request = Entrez.epost("gene",id=",".join(id_list))
    try:
        result = Entrez.read(request)
    except RuntimeError as e:

        print("An error occurred while retrieving the annotations.")
        print("The error returned was %s" % e)
        sys.exit(-1)

    webEnv = result["WebEnv"]
    queryKey = result["QueryKey"]
    data = Entrez.esummary(db="gene", webenv=webEnv, query_key =
            queryKey)
    annotations = Entrez.read(data)

    print("Retrieved %d annotations for %d genes" % (len(annotations), len(id_list)))

    return annotations

annotation=retrieve_annotation(id_list)
annotation

Retrieved 1 annotations for 1 genes


{'DocumentSummarySet': DictElement({'DocumentSummary': [DictElement({'Name': 'Aoc3', 'Description': 'amine oxidase, copper containing 3', 'Status': '0', 'CurrentID': '0', 'Chromosome': '11', 'GeneticSource': 'genomic', 'MapLocation': '11 D', 'OtherAliases': 'SSAO, VAP1', 'OtherDesignations': 'membrane primary amine oxidase|VAP-1|copper amine oxidase|semicarbazide-sensitive amine oxidase|vascular adhesion protein 1', 'NomenclatureSymbol': 'Aoc3', 'NomenclatureName': 'amine oxidase, copper containing 3', 'NomenclatureStatus': 'Official', 'Mim': [], 'GenomicInfo': [{'ChrLoc': '11', 'ChrAccVer': 'NC_000077.7', 'ChrStart': '101221431', 'ChrStop': '101230255', 'ExonCount': '6'}], 'GeneWeight': '5556', 'Summary': '', 'ChrSort': '11', 'ChrStart': '101221431', 'Organism': {'ScientificName': 'Mus musculus', 'CommonName': 'house mouse', 'TaxID': '10090'}, 'LocationHist': [{'AnnotationRelease': '109', 'AssemblyAccVer': 'GCF_000001635.27', 'ChrAccVer': 'NC_000077.7', 'ChrStart': '101221431', 'ChrSt

In [93]:
annotation['DocumentSummarySet']['DocumentSummary'][0].keys()

dict_keys(['Name', 'Description', 'Status', 'CurrentID', 'Chromosome', 'GeneticSource', 'MapLocation', 'OtherAliases', 'OtherDesignations', 'NomenclatureSymbol', 'NomenclatureName', 'NomenclatureStatus', 'Mim', 'GenomicInfo', 'GeneWeight', 'Summary', 'ChrSort', 'ChrStart', 'Organism', 'LocationHist'])

# Khodaee2020SciRep

In [98]:
model = cobra.io.read_sbml_model("../mouse_metabolic_model/Khodaee2020SciRep/iMM3254.xml")

In [99]:
all_metabolic_genes = list(set([g.name for g in model.genes]))

In [100]:
shared_genes = list(set(all_mouse_genes).intersection(set(all_metabolic_genes)))
print("found %d shared genes."%(len(shared_genes)))

found 3173 shared genes.


In [128]:
res = []
for g in model.genes:
    if g.name in shared_genes:
        res2 = [g.name]
        for db in ['mgi', 'ccds', 'ncbigene']:
            if db in g.annotation:
                res2.append(g.annotation[db])
            else:
                res2.append(None)
        res.append(res2)
df_res = pd.DataFrame(res, columns=['Symbol','MGI','CCDS','NCBI']).set_index('Symbol')
df_res.head()

Unnamed: 0_level_0,MGI,CCDS,NCBI
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aoc3,MGI:1306797,CCDS25465.1,11754
Aoc2,MGI:2668431,CCDS25464.1,237940
Aoc1,MGI:1923757,CCDS51767.1,76507
Cyp24a1,MGI:88593,CCDS17122.1,13081
Cyp27b1,MGI:1098274,CCDS24224.2,13115


In [129]:
df_res[df_res.NCBI.isnull()]

Unnamed: 0_level_0,MGI,CCDS,NCBI
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [130]:
# add genome annotation
annotations = retrieve_annotation(list(df_res.NCBI))['DocumentSummarySet']['DocumentSummary']

Retrieved 1 annotations for 3173 genes


In [131]:
df_annot = None
for annot in annotations:
    df_tmp = pd.DataFrame.from_dict(annot, orient="index", columns=['Value'])
    df_tmp = df_tmp.rename(columns=df_tmp.loc['Name'])
    if df_annot is None:
        df_annot = deepcopy(df_tmp)
    else:
        df_annot = pd.merge(df_annot, df_tmp, left_index=True, right_index=True, how='outer')
df_annot = df_annot.drop('Name').T
df_annot.head()

Unnamed: 0,Description,Status,CurrentID,Chromosome,GeneticSource,MapLocation,OtherAliases,OtherDesignations,NomenclatureSymbol,NomenclatureName,NomenclatureStatus,Mim,GenomicInfo,GeneWeight,Summary,ChrSort,ChrStart,Organism,LocationHist
Aoc3,"amine oxidase, copper containing 3",0,0,11,genomic,11 D,"SSAO, VAP1",membrane primary amine oxidase|VAP-1|copper am...,Aoc3,"amine oxidase, copper containing 3",Official,[],"[{'ChrLoc': '11', 'ChrAccVer': 'NC_000077.7', ...",5556,,11,101221431,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."
Aoc2,"amine oxidase, copper containing 2 (retina-spe...",0,0,11,genomic,11 D,RAO,retina-specific copper amine oxidase|retina-sp...,Aoc2,"amine oxidase, copper containing 2 (retina-spe...",Official,[],"[{'ChrLoc': '11', 'ChrAccVer': 'NC_000077.7', ...",534,,11,101214466,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."
Aoc1,"amine oxidase, copper-containing 1",0,0,6,genomic,6 23.78 cM,"1600012D06Rik, Abp1, DAO",amiloride-sensitive amine oxidase [copper-cont...,Aoc1,"amine oxidase, copper-containing 1",Official,[],"[{'ChrLoc': '6', 'ChrAccVer': 'NC_000072.7', '...",992,,6,48872188,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."
Cyp24a1,"cytochrome P450, family 24, subfamily a, polyp...",0,0,2,genomic,2 91.91 cM,"24-OHase, CP24, Cyp24","1,25-dihydroxyvitamin D(3) 24-hydroxylase, mit...",Cyp24a1,"cytochrome P450, family 24, subfamily a, polyp...",Official,[],"[{'ChrLoc': '2', 'ChrAccVer': 'NC_000068.8', '...",3845,The protein encoded by this gene localizes to ...,2,170324876,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."
Cyp27b1,"cytochrome P450, family 27, subfamily b, polyp...",0,0,10,genomic,10 D3,"Cp2b, Cyp1, Cyp27b, Cyp40, P450c1, Pddr, Vdd1,...","25-hydroxyvitamin D-1 alpha hydroxylase, mitoc...",Cyp27b1,"cytochrome P450, family 27, subfamily b, polyp...",Official,[],"[{'ChrLoc': '10', 'ChrAccVer': 'NC_000076.7', ...",11544,,10,126884114,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."


In [135]:
df_met_gene_w_annot = pd.merge(df_res, df_annot, left_index=True, right_index=True, how='left')
df_met_gene_w_annot.head()

Unnamed: 0_level_0,MGI,CCDS,NCBI,Description,Status,CurrentID,Chromosome,GeneticSource,MapLocation,OtherAliases,...,NomenclatureName,NomenclatureStatus,Mim,GenomicInfo,GeneWeight,Summary,ChrSort,ChrStart,Organism,LocationHist
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aoc3,MGI:1306797,CCDS25465.1,11754,"amine oxidase, copper containing 3",0,0,11,genomic,11 D,"SSAO, VAP1",...,"amine oxidase, copper containing 3",Official,[],"[{'ChrLoc': '11', 'ChrAccVer': 'NC_000077.7', ...",5556,,11,101221431,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."
Aoc2,MGI:2668431,CCDS25464.1,237940,"amine oxidase, copper containing 2 (retina-spe...",0,0,11,genomic,11 D,RAO,...,"amine oxidase, copper containing 2 (retina-spe...",Official,[],"[{'ChrLoc': '11', 'ChrAccVer': 'NC_000077.7', ...",534,,11,101214466,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."
Aoc1,MGI:1923757,CCDS51767.1,76507,"amine oxidase, copper-containing 1",0,0,6,genomic,6 23.78 cM,"1600012D06Rik, Abp1, DAO",...,"amine oxidase, copper-containing 1",Official,[],"[{'ChrLoc': '6', 'ChrAccVer': 'NC_000072.7', '...",992,,6,48872188,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."
Cyp24a1,MGI:88593,CCDS17122.1,13081,"cytochrome P450, family 24, subfamily a, polyp...",0,0,2,genomic,2 91.91 cM,"24-OHase, CP24, Cyp24",...,"cytochrome P450, family 24, subfamily a, polyp...",Official,[],"[{'ChrLoc': '2', 'ChrAccVer': 'NC_000068.8', '...",3845,The protein encoded by this gene localizes to ...,2,170324876,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."
Cyp27b1,MGI:1098274,CCDS24224.2,13115,"cytochrome P450, family 27, subfamily b, polyp...",0,0,10,genomic,10 D3,"Cp2b, Cyp1, Cyp27b, Cyp40, P450c1, Pddr, Vdd1,...",...,"cytochrome P450, family 27, subfamily b, polyp...",Official,[],"[{'ChrLoc': '10', 'ChrAccVer': 'NC_000076.7', ...",11544,,10,126884114,"{'ScientificName': 'Mus musculus', 'CommonName...","[{'AnnotationRelease': '109', 'AssemblyAccVer'..."


In [137]:
df_met_gene_w_annot[df_met_gene_w_annot.Description.isnull()]

Unnamed: 0_level_0,MGI,CCDS,NCBI,Description,Status,CurrentID,Chromosome,GeneticSource,MapLocation,OtherAliases,...,NomenclatureName,NomenclatureStatus,Mim,GenomicInfo,GeneWeight,Summary,ChrSort,ChrStart,Organism,LocationHist
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Col4a3bp,MGI:1915268,"[CCDS26705.1, CCDS49335.1]",68018,,,,,,,,...,,,,,,,,,,
Tsta3,MGI:98857,CCDS27555.1,22122,,,,,,,,...,,,,,,,,,,
Znrd1,MGI:1913386,CCDS28730.1,66136,,,,,,,,...,,,,,,,,,,
Dupd1,MGI:3647127,CCDS26863.1,435391,,,,,,,,...,,,,,,,,,,
Pak7,MGI:1920334,CCDS16790.1,241656,,,,,,,,...,,,,,,,,,,


In [138]:
len(df_met_gene_w_annot)

3173

In [141]:
df_met_gene_w_annot.to_csv("metabolic_genes_with_annotation.csv")

# Wang2021PNAS

In [55]:
model = cobra.io.read_sbml_model("../mouse_metabolic_model/Wang2021PNAS/Mouse-GEM.xml")

https://identifiers.org/taxonomy/ does not conform to 'http(s)://identifiers.org/collection/id' or'http(s)://identifiers.org/COLLECTION:id


In [63]:
all_metabolic_genes = list(set([g.id for g in model.genes]))

In [64]:
shared_genes = list(set(all_mouse_genes).intersection(set(all_metabolic_genes)))
print("found %d shared genes."%(len(shared_genes)))

found 2871 shared genes.


In [65]:
# it has less genes than Khodaee2020SciRep
# stop