# Retrieve the human-mouse orthologous genes and canonical transcripts  
This script:
- Retrieves the complete list of human genes (protein-coding genes, mitochondrial genes, genetic regulatory elements, etc.) and their orthology status in the mouse genome via the Ensembl BioMart data-mining tool, combined with all entries in the MGI (Mouse Genome Informatics) database.  
- Extracts the Ensembl gene ID and canonical transcript ID for each gene with reference to the GRCh37, GRCh38 and GRCm39 genome assemblies via the Ensembl API.  

**Author**: Kexin Dong  
**Date**: Feb 4, 2024  

## Load packages and data

In [1]:
import numpy as np
import pandas as pd
import gffutils
import json
from pybiomart import Server
from collections import Counter
import requests
import h2m



# Download complete homology list between human and mouse from MGI:  
https://www.informatics.jax.org/downloads/reports/index.html#homology  

In [2]:
# MGI human-mouse homology report (tab-separated; text after '#' is treated as a comment).
path = '/Users/kexindong/Documents/GitHub/Database/Homology/HMD_HumanPhenotype.txt'
mgi_symbol_columns = ['Human Marker Symbol', 'Mouse Marker Symbol']
df_mgi = pd.read_csv(
    path,
    sep='\t',
    header=0,
    comment="#",
    index_col=False,
).loc[:, mgi_symbol_columns]
df_mgi

Unnamed: 0,Human Marker Symbol,Mouse Marker Symbol
0,A1BG,A1bg
1,A1CF,A1cf
2,A2M,A2m
3,A3GALT2,A3galt2
4,A4GALT,A4galt
...,...,...
29594,ZYG11A,Zyg11a
29595,ZYG11B,Zyg11b
29596,ZYX,Zyx
29597,ZZEF1,Zzef1


In [3]:
# Ensembl hosts handed to pybiomart's Server, keyed by human assembly version (37 / 38).
server_dict = {
    37: 'http://grch37.ensembl.org',
    38: 'http://www.ensembl.org',
}
# Ensembl REST hosts for the same two reference genomes.
server_dic = {
    37: 'http://grch37.rest.ensembl.org/',
    38: 'https://rest.ensembl.org/',
}
# Species label for each one-letter code.
species_dic = dict(h='homo_sapiens', m='mouse')

Download GENCODE annotation data from this [Dropbox link](https://www.dropbox.com/scl/fo/1wtrnc9w6s9gemweuw2fv/h?rlkey=hli1z6tv096cjwit5oi6bwggg&dl=0)

In [4]:
# Local annotation databases for human and mouse -- remember to replace the paths with yours.
path_h_anno = '/Users/kexindong/Documents/GitHub/Database/Genecode/gencode_v19_GRCh37.db'
path_m_anno = '/Users/kexindong/Documents/GitHub/Database/Genecode/gencode_vm33_GRCm39.db'
# Load each annotation through h2m.
db_h = h2m.anno_loader(path_h_anno)
db_m = h2m.anno_loader(path_m_anno)

# Retrieve the homology list from Ensembl (assembly version 38) via the API, getting gene IDs and transcript IDs at the same time.  

In [5]:
# Build the complete human -> mouse homology table through BioMart.
# Only the 38 database provides homology information.
ver = 38
server = server_dict[ver]
server_human = Server(server)
dataset_human = server_human['ENSEMBL_MART_ENSEMBL']['hsapiens_gene_ensembl']

# (BioMart attribute, column name used in this notebook), in query order.
attribute_to_column = [
    ('external_gene_name', 'gene_name_h'),
    ('ensembl_gene_id', 'gene_id_h_38'),
    ('ensembl_gene_id_version', 'gene_id_h_version_38'),
    ('mmusculus_homolog_ensembl_gene', 'gene_id_m'),
    ('mmusculus_homolog_associated_gene_name', 'gene_name_m'),
    ('mmusculus_homolog_orthology_type', 'homology_type'),
    ('mmusculus_homolog_perc_id', 'similarity'),
]
query_index = [pair[0] for pair in attribute_to_column]
df_query = dataset_human.query(attributes=query_index)
# Columns are renamed by position, so this relies on the result keeping the requested order.
df_query.columns = [pair[1] for pair in attribute_to_column]

In [6]:
# Preview the GRCh38 BioMart result.
df_query

Unnamed: 0,gene_name_h,gene_id_h_38,gene_id_h_version_38,gene_id_m,gene_name_m,homology_type,similarity
0,MT-TF,ENSG00000210049,ENSG00000210049.1,,,,
1,MT-RNR1,ENSG00000211459,ENSG00000211459.2,,,,
2,MT-TV,ENSG00000210077,ENSG00000210077.1,,,,
3,MT-RNR2,ENSG00000210082,ENSG00000210082.2,,,,
4,MT-TL1,ENSG00000209082,ENSG00000209082.1,,,,
...,...,...,...,...,...,...,...
77797,,ENSG00000288629,ENSG00000288629.1,,,,
77798,,ENSG00000288678,ENSG00000288678.2,,,,
77799,DDX11L2,ENSG00000290825,ENSG00000290825.1,,,,
77800,WASH7P,ENSG00000227232,ENSG00000227232.6,,,,


In [7]:
# Fetch gene names and gene IDs from the GRCh37 mart and attach them to the GRCh38 table.
# No homology attributes are requested here: only the 38 database provides them.
ver = 37
server = server_dict[ver]
server_human = Server(server)
dataset_human = server_human['ENSEMBL_MART_ENSEMBL']['hsapiens_gene_ensembl']
query_index = ['external_gene_name', 'ensembl_gene_id', 'ensembl_gene_id_version']
df_query_37 = dataset_human.query(attributes=query_index)
df_query_37.columns = ['gene_name_h', 'gene_id_h_37', 'gene_id_h_version_37']
# The two assemblies are linked by gene name only; the left join keeps every GRCh38 row.
df_query = df_query.merge(df_query_37, how='left', on='gene_name_h')
# Move the two GRCh37 ID columns (currently last) to sit right after the gene name.
column_positions = [0, -2, -1] + list(range(1, 7))
df_query = df_query.iloc[:, column_positions]

# Bind MGI result and Ensembl result.   

## Attach human gene ids for MGI data.  

In [8]:
# Rename the MGI columns to the notebook-wide names, then attach the human gene IDs by gene name.
df_mgi.columns = ['gene_name_h', 'gene_name_m']
human_id_columns = ['gene_name_h', 'gene_id_h_37', 'gene_id_h_version_37', 'gene_id_h_38', 'gene_id_h_version_38']
df_mgi = (
    df_mgi
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(df_query[human_id_columns], how='left', on='gene_name_h')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [9]:
# One row per (human, mouse) symbol pair, keeping only pairs whose human gene has a GRCh38 ID.
mgi_pairs = df_mgi.drop_duplicates(['gene_name_h', 'gene_name_m']).reset_index(drop=True)
df_mgi_unique = mgi_pairs.dropna(subset='gene_id_h_38').reset_index(drop=True)

In [10]:
# Keep only the Ensembl rows that carry a mouse homolog gene ID.
has_mouse_id = df_query['gene_id_m'].notna()
df_query_unique = df_query[has_mouse_id].reset_index(drop=True)

## Part 1: Ensembl-only records  

In [11]:
# df_query_unique_human_not_in_mgi:
# Ensembl homology rows whose human gene name does not occur in the MGI table.
in_mgi = df_query_unique['gene_name_h'].isin(df_mgi_unique['gene_name_h'])
part_1 = df_query_unique[~in_mgi]

In [12]:
# Collapse repeated (human, mouse) name pairs, keeping the first occurrence of each.
repeated_pair = part_1.duplicated(['gene_name_h', 'gene_name_m'])
part_1 = part_1[~repeated_pair].reset_index(drop=True)

In [13]:
# Preview part 1 (Ensembl-only records).
part_1

Unnamed: 0,gene_name_h,gene_id_h_37,gene_id_h_version_37,gene_id_h_38,gene_id_h_version_38,gene_id_m,gene_name_m,homology_type,similarity
0,MT-ND1,ENSG00000198888,ENSG00000198888.2,ENSG00000198888,ENSG00000198888.2,ENSMUSG00000064341,mt-Nd1,ortholog_one2one,77.0440
1,MT-ND2,ENSG00000198763,ENSG00000198763.3,ENSG00000198763,ENSG00000198763.3,ENSMUSG00000064345,mt-Nd2,ortholog_one2one,57.0605
2,MT-CO1,ENSG00000198804,ENSG00000198804.2,ENSG00000198804,ENSG00000198804.2,ENSMUSG00000064351,mt-Co1,ortholog_one2one,90.8382
3,MT-CO2,ENSG00000198712,ENSG00000198712.1,ENSG00000198712,ENSG00000198712.1,ENSMUSG00000064354,mt-Co2,ortholog_one2one,71.3656
4,MT-ATP8,ENSG00000228253,ENSG00000228253.1,ENSG00000228253,ENSG00000228253.1,ENSMUSG00000064356,mt-Atp8,ortholog_one2one,45.5882
...,...,...,...,...,...,...,...,...,...
2875,RNU5A-5P,ENSG00000222986,ENSG00000222986.1,ENSG00000222986,ENSG00000222986.1,ENSMUSG00000070263,Gm22365,ortholog_many2many,73.2759
2876,RNU5A-5P,ENSG00000222986,ENSG00000222986.1,ENSG00000222986,ENSG00000222986.1,ENSMUSG00000096766,Gm23793,ortholog_many2many,75.0000
2877,RNU5A-5P,ENSG00000222986,ENSG00000222986.1,ENSG00000222986,ENSG00000222986.1,ENSMUSG00000065658,Gm23102,ortholog_many2many,65.5172
2878,RN7SKP91,ENSG00000222784,ENSG00000222784.1,ENSG00000222784,ENSG00000222784.1,ENSMUSG00002075912,Gm56340,ortholog_one2many,53.1250


In [14]:
# Left-join every MGI pair onto the Ensembl homology table; pairs that Ensembl does not
# report keep NaN in the Ensembl-derived columns.
mgi_symbol_pairs = df_mgi_unique[['gene_name_h', 'gene_name_m']]
df_query_unique_human_in_mgi = mgi_symbol_pairs.merge(
    df_query_unique,
    how='left',
    on=['gene_name_h', 'gene_name_m'],
)

## Part 2: shared records  

In [15]:
# intersection: (human, mouse) pairs reported by both MGI and Ensembl, one row per pair
part_2 = (
    df_mgi_unique[['gene_name_h', 'gene_name_m']]
    .merge(df_query_unique, how='inner', on=['gene_name_h', 'gene_name_m'])
    .drop_duplicates(['gene_name_h', 'gene_name_m'])
    .reset_index(drop=True)
)

## Part 3: MGI-only records (mouse gene IDs still need to be attached)

In [16]:
# df_unique_mgi_not_in_query:
# rows of the MGI left-join that have no exact counterpart in part_2 are flagged 'left_only'.
mgi_vs_shared = df_query_unique_human_in_mgi.merge(part_2, indicator=True, how='outer')
mgi_only_pairs = mgi_vs_shared.loc[
    mgi_vs_shared['_merge'] == 'left_only',
    ['gene_name_h', 'gene_name_m'],
]
# Bring back the human gene IDs stored in the MGI table and keep one row per pair.
part_3 = (
    pd.merge(mgi_only_pairs, df_mgi_unique, how='inner')
    .drop_duplicates(['gene_name_h', 'gene_name_m'])
    .reset_index(drop=True)
)

In [17]:
# Align part_2 with part_1's column order, then stack the two Ensembl-backed parts,
# ordered by human gene name and, within a gene, by decreasing similarity.
part_2 = part_2[part_1.columns]
part_return = (
    pd.concat([part_1, part_2])
    .sort_values(by=['gene_name_h', 'similarity'], ascending=[True, False])
    .reset_index(drop=True)
)

In [18]:
# Lookup table: mouse gene name -> mouse Ensembl gene ID (a later duplicate name overwrites an earlier one).
dict_of_m_gene_id = {
    mouse_name: mouse_gene_id
    for mouse_name, mouse_gene_id in zip(part_return['gene_name_m'], part_return['gene_id_m'])
}

In [19]:
# Preview the combined Ensembl-backed records (parts 1 and 2).
part_return

Unnamed: 0,gene_name_h,gene_id_h_37,gene_id_h_version_37,gene_id_h_38,gene_id_h_version_38,gene_id_m,gene_name_m,homology_type,similarity
0,5S_rRNA,ENSG00000263172,ENSG00000263172.1,ENSG00000277488,ENSG00000277488.1,ENSMUSG00002076113,Gm55778,ortholog_one2many,56.3218
1,5_8S_rRNA,,,ENSG00000278294,ENSG00000278294.1,ENSMUSG00002075795,Gm54867,ortholog_one2many,75.6579
2,7SK,ENSG00000271818,ENSG00000271818.1,ENSG00000202198,ENSG00000202198.1,ENSMUSG00002076400,Gm54707,ortholog_one2many,58.3082
3,7SK,ENSG00000271818,ENSG00000271818.1,ENSG00000271394,ENSG00000271394.1,ENSMUSG00002075014,Gm55081,ortholog_one2one,33.6032
4,A1BG,ENSG00000121410,ENSG00000121410.7,ENSG00000121410,ENSG00000121410.12,ENSMUSG00000022347,A1bg,ortholog_one2one,45.6566
...,...,...,...,...,...,...,...,...,...
23450,,,,ENSG00000200991,ENSG00000200991.1,ENSMUSG00002076490,Gm54634,ortholog_one2many,37.2881
23451,,,,ENSG00000283599,ENSG00000283599.3,ENSMUSG00000046774,8030474K03Rik,ortholog_many2many,35.1351
23452,,,,ENSG00000252677,ENSG00000252677.1,ENSMUSG00000088135,Gm22588,ortholog_one2one,34.7059
23453,,,,ENSG00000283599,ENSG00000283599.3,ENSMUSG00000071749,4933412E24Rik,ortholog_many2many,34.3629


In [20]:
# Preview the MGI-only records, which have no mouse gene ID column yet.
part_3

Unnamed: 0,gene_name_h,gene_name_m,gene_id_h_37,gene_id_h_version_37,gene_id_h_38,gene_id_h_version_38
0,AADACL2,Aadacl2,ENSG00000261846,ENSG00000261846.1,ENSG00000261846,ENSG00000261846.2
1,AADACL4,Aadacl4fm2,ENSG00000204518,ENSG00000204518.2,ENSG00000204518,ENSG00000204518.3
2,AARS1,Aars1,,,ENSG00000090861,ENSG00000090861.17
3,ABCA17P,Abca17,ENSG00000238098,ENSG00000238098.4,ENSG00000238098,ENSG00000238098.10
4,ABCA8,Abca8a,ENSG00000141338,ENSG00000141338.9,ENSG00000141338,ENSG00000141338.15
...,...,...,...,...,...,...
10122,ZSCAN4,Zscan4e,ENSG00000180532,ENSG00000180532.6,ENSG00000180532,ENSG00000180532.12
10123,ZSCAN4,Zscan4f,ENSG00000180532,ENSG00000180532.6,ENSG00000180532,ENSG00000180532.12
10124,ZSCAN5DP,Zscan5b,,,ENSG00000267908,ENSG00000267908.2
10125,ZXDA,Zxdb,ENSG00000198205,ENSG00000198205.5,ENSG00000198205,ENSG00000198205.7


In [27]:
def get_gene_id(species, gene_name, timeout=30):
    """Look up the Ensembl gene ID of a gene symbol through the Ensembl REST API.

    Args:
        species: Species name as it appears in the REST URL (e.g. 'mus_musculus').
        gene_name: Gene symbol to look up.
        timeout: Seconds to wait for the server. Without a timeout a single stalled
            connection would block a long batch of lookups indefinitely.

    Returns:
        The 'id' field of the JSON response, or None when the server answers with a
        non-200 status (the status and response text are printed).

    Raises:
        requests.RequestException: on connection problems or when the timeout expires.
    """
    # URL for the Ensembl REST API lookup service
    url = f"https://rest.ensembl.org/lookup/symbol/{species}/{gene_name}?content-type=application/json"
    # Make the request
    response = requests.get(url, timeout=timeout)
    # Handle errors first (e.g., gene not found or bad request)
    if response.status_code != 200:
        print(f"Error: HTTP {response.status_code} - {response.text}")
        return None
    data = response.json()
    return data['id']

In [28]:
# Resolve a mouse Ensembl gene ID for every distinct mouse gene name in part_3.
list_gene_name_m = part_3['gene_name_m'].unique()
list_gene_id_m = []
species = 'mus_musculus'
for i, m in enumerate(list_gene_name_m):
    print(f'{i+1}/{len(list_gene_name_m)}\r')
    if m in dict_of_m_gene_id:
        # Already known from the BioMart table: reuse it instead of querying the API.
        # (Previously this value was always overwritten by the API call below.)
        gene_id = dict_of_m_gene_id[m]
    else:
        try:
            gene_id = get_gene_id(species, m)
        except Exception as err:
            # Best effort: a failure for one symbol is recorded as missing and the batch
            # continues. Catching Exception (not a bare except) keeps Ctrl-C working.
            print(f'Error: lookup failed for {m} - {err}')
            gene_id = None
    list_gene_id_m.append(gene_id)

1/3475
2/3475
3/3475
Error: HTTP 400 - {"error":"No valid lookup found for symbol Aars1"}
4/3475
5/3475
6/3475
7/3475
8/3475
9/3475
10/3475
11/3475
12/3475
13/3475
14/3475
15/3475
16/3475
17/3475
18/3475
19/3475
20/3475
21/3475
22/3475
23/3475
24/3475
25/3475
26/3475
27/3475
28/3475
29/3475
30/3475
31/3475
32/3475
33/3475
34/3475
35/3475
36/3475
37/3475
38/3475
Error: HTTP 400 - {"error":"No valid lookup found for symbol Adss1"}
39/3475
Error: HTTP 400 - {"error":"No valid lookup found for symbol Adss2"}
40/3475
41/3475
42/3475
43/3475
44/3475
45/3475
46/3475
47/3475
48/3475
49/3475
50/3475
51/3475
52/3475
53/3475
54/3475
55/3475
56/3475
57/3475
58/3475
59/3475
60/3475
61/3475
62/3475
63/3475
64/3475
65/3475
66/3475
67/3475
68/3475
69/3475
Error: HTTP 400 - {"error":"No valid lookup found for symbol Alyreffm1"}
70/3475
Error: HTTP 400 - {"error":"No valid lookup found for symbol Alyreffm10"}
71/3475
Error: HTTP 400 - {"error":"No valid lookup found for symbol Alyreffm11"}
72/3475
Error

In [29]:
# Merge the freshly resolved IDs into the lookup table ...
for mouse_name, mouse_gene_id in zip(list_gene_name_m, list_gene_id_m):
    dict_of_m_gene_id[mouse_name] = mouse_gene_id
# ... then annotate every part_3 row with the ID of its mouse gene.
resolved_ids = []
for mouse_name in part_3['gene_name_m']:
    resolved_ids.append(dict_of_m_gene_id[mouse_name])
part_3['gene_id_m'] = resolved_ids

In [30]:
# NOTE(review): this overwrites the 'gene_id_m' column assigned in the previous cell with
# values looked up in h2m.dict_tx_m, and the plain [] lookup fails for any mouse gene name
# missing from that mapping. The same mapping is used later in this notebook to fill
# 'tx_id_canonical_m', so it presumably holds transcript IDs rather than gene IDs --
# TODO confirm this cell is still intended.
part_3['gene_id_m'] = [h2m.dict_tx_m[x] for x in part_3['gene_name_m']]

In [32]:
# Preview part 3 after the mouse gene ID column has been added.
part_3

Unnamed: 0,gene_name_h,gene_name_m,gene_id_h_37,gene_id_h_version_37,gene_id_h_38,gene_id_h_version_38,gene_id_m
0,AADACL2,Aadacl2,ENSG00000261846,ENSG00000261846.1,ENSG00000261846,ENSG00000261846.2,ENSMUSG00000091376
1,AADACL4,Aadacl4fm2,ENSG00000204518,ENSG00000204518.2,ENSG00000204518,ENSG00000204518.3,ENSMUSG00000078506
2,AARS1,Aars1,,,ENSG00000090861,ENSG00000090861.17,
3,ABCA17P,Abca17,ENSG00000238098,ENSG00000238098.4,ENSG00000238098,ENSG00000238098.10,ENSMUSG00000035435
4,ABCA8,Abca8a,ENSG00000141338,ENSG00000141338.9,ENSG00000141338,ENSG00000141338.15,ENSMUSG00000041828
...,...,...,...,...,...,...,...
10122,ZSCAN4,Zscan4e,ENSG00000180532,ENSG00000180532.6,ENSG00000180532,ENSG00000180532.12,ENSMUSG00000095936
10123,ZSCAN4,Zscan4f,ENSG00000180532,ENSG00000180532.6,ENSG00000180532,ENSG00000180532.12,ENSMUSG00000070828
10124,ZSCAN5DP,Zscan5b,,,ENSG00000267908,ENSG00000267908.2,ENSMUSG00000058028
10125,ZXDA,Zxdb,ENSG00000198205,ENSG00000198205.5,ENSG00000198205,ENSG00000198205.7,ENSMUSG00000073062


In [31]:
# Give part_3 the same columns as part_return: take every column except the last two and
# add them back as missing values, since MGI-only pairs have no homology type or similarity.
shared_columns = part_return.columns[:-2]
part_3 = part_3[shared_columns].assign(similarity=pd.NA, homology_type=pd.NA)
# Append the MGI-only records and restore the gene-name / similarity ordering.
part_return = (
    pd.concat([part_return, part_3])
    .sort_values(by=['gene_name_h', 'similarity'], ascending=[True, False])
    .reset_index(drop=True)
)

In [32]:
# Keep the first row of each (human, mouse) name pair.
repeated_pair = part_return.duplicated(subset=['gene_name_h', 'gene_name_m'])
part_return = part_return[~repeated_pair].reset_index(drop=True)

In [33]:
# Distinct human gene names: overall, with a mouse gene name, and with both a GRCh37 gene ID
# and a mouse gene ID. nunique(dropna=False) counts a missing name once, like len(unique()).
rows_with_mouse_name = part_return.dropna(subset = 'gene_name_m')
rows_with_both_ids = part_return.dropna(subset = ['gene_id_h_37', 'gene_id_m'])
l_1 = part_return['gene_name_h'].nunique(dropna=False)
l_2 = rows_with_mouse_name['gene_name_h'].nunique(dropna=False)
l_3 = rows_with_both_ids['gene_name_h'].nunique(dropna=False)
print(f'{l_1} human genes are included in H2M package in total;\n{l_2} of them have one or more mouse orthologs;\n{l_3} of them have available records (v37/v39 for h/m) in Ensembl database via Biomart.')

20487 human genes are included in H2M package in total;
20487 of them have one or more mouse orthologs;
18807 of them have available records (v37/v39 for h/m) in Ensembl database via Biomart.


# Update built-in dictionaries  

## Homo-dict-human  

In [58]:
def getmouseinfo(list_of_info):
    """Turn the mouse fields of one record into a labelled dictionary.

    Args:
        list_of_info: Four values in the order
            (gene_id_m, gene_name_m, homology_type, similarity).

    Returns:
        dict with the keys 'gene_name_m', 'gene_id_m', 'homology_type' and
        'similarity', in that order.
    """
    mouse_id, mouse_name, orthology, percent_identity = list_of_info
    return dict(
        gene_name_m=mouse_name,
        gene_id_m=mouse_id,
        homology_type=orthology,
        similarity=percent_identity,
    )

# dict_2: gene_name_h -> homo information (a list with one dict per mouse homolog).
# Rows are grouped by human gene name and the mouse fields are selected by column label, so
# the result depends neither on part_return being sorted with each gene's rows contiguous
# nor on the position of the columns (which changes once transcript columns are inserted).
# Rows without a human gene name are left out, as groupby drops missing keys by default.
mouse_info_columns = ['gene_id_m', 'gene_name_m', 'homology_type', 'similarity']
homo_dict = {
    gene_name_h: [getmouseinfo(row[mouse_info_columns]) for _, row in sub_df.iterrows()]
    for gene_name_h, sub_df in part_return.groupby('gene_name_h', sort=False)
}
# Human gene names in order of first appearance.
name_list = list(homo_dict)

## Homo-dict-mouse  

In [124]:
# Invert homo_dict: mouse gene name -> list of the human genes that map to it.
homo_dict_mouse = {}

for human_gene, mappings in homo_dict.items():
    for mapping in mappings:
        mouse_gene = mapping['gene_name_m']
        human_entry = dict(
            gene_name_h=human_gene,
            homology_type=mapping.get("homology_type", ""),
            similarity=mapping.get("similarity", 0),
        )
        # setdefault creates the list the first time a mouse gene is seen.
        homo_dict_mouse.setdefault(mouse_gene, []).append(human_entry)

## human-name-dict  

In [64]:
# dict_1: gene_id_h -> gene_name_h, covering both the GRCh37 and the GRCh38 gene IDs
name_dict = {}
for id_column in ('gene_id_h_37', 'gene_id_h_38'):
    id_name_pairs = (
        df_query[['gene_name_h', id_column]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # Later entries win: an ID seen again replaces the name stored for it earlier.
    name_dict.update(zip(id_name_pairs.iloc[:, 1], id_name_pairs.iloc[:, 0]))

# Add transcript ID information to the database  

In [101]:
import os
# Load the copy of the human transcript dictionary kept in the v_1_configure_jsons folder.
config_dir = '/Users/kexindong/Documents/GitHub/h2m_private/other_codes/v_1_configure_jsons/'
json_path = os.path.join(config_dir, 'dict_tx_h.json')
with open(json_path, 'r') as file:
    dict_tx_h_old = json.load(file)

In [66]:
import os
# Load the human and mouse transcript dictionaries from the package's Data folder.
data_dir = os.path.join('/Users/kexindong/Documents/GitHub/h2m_private/h2m-package/src/h2m', 'Data')

json_path = os.path.join(data_dir, 'dict_tx_h.json')
with open(json_path, 'r') as file:
    dict_tx_h = json.load(file)

json_path = os.path.join(data_dir, 'dict_tx_m.json')
with open(json_path, 'r') as file:
    dict_tx_m = json.load(file)

In [None]:
# Ensembl rows that have both a homology type and a human gene name.
df_query_mapped = (
    df_query
    .dropna(subset=['homology_type', 'gene_name_h'])
    .reset_index(drop=True)
)

## Update transcript ID

In [79]:
# Gene names in the final table that have no entry yet in the transcript dictionaries.
# The lists come from sets, so their order is arbitrary.
human_names_in_table = set(part_return['gene_name_h'])
human_names_known = set(dict_tx_h.keys())
list_gen_name_h = list(human_names_in_table - human_names_known)

mouse_names_in_table = set(part_return['gene_name_m'])
mouse_names_known = set(dict_tx_m.keys())
list_gen_name_m = list(mouse_names_in_table - mouse_names_known)

In [81]:
# Human genes missing from dict_tx_h: store [GRCh37 value, GRCh38 value], where each value is
# element 3 of the h2m.get_tx_id result, or None when that lookup fails.
list_of_tx = []
for e, x in enumerate(list_gen_name_h):
    print(f'{e}/{len(list_gen_name_h)}\r')
    human_result_37, human_result_38 = None, None
    try:
        human_result_37 = h2m.get_tx_id(x, 'h', ver=37, show=False)[3]
    except Exception as err:
        # Best effort: report the failure and keep None for this assembly.
        print(f'Lookup failed for {x} (ver=37): {err}')
    try:
        human_result_38 = h2m.get_tx_id(x, 'h', ver=38, show=False)[3]
    except Exception as err:
        print(f'Lookup failed for {x} (ver=38): {err}')
    list_of_tx.append([human_result_37, human_result_38])
dict_tx_h.update(zip(list_gen_name_h, list_of_tx))

# Mouse genes missing from dict_tx_m: store the full h2m.get_tx_id result.
list_of_tx = []
for e, x in enumerate(list_gen_name_m):
    # Progress is reported against the mouse list (the human list length was used before).
    print(f'{e}/{len(list_gen_name_m)}\r')
    # Reset for every gene so a failed lookup is stored as None instead of silently
    # reusing the previous gene's result.
    result = None
    try:
        result = h2m.get_tx_id(x, 'm', show=False)
    except Exception as err:
        print(f'Lookup failed for {x} (mouse): {err}')
    list_of_tx.append(result)
dict_tx_m.update(zip(list_gen_name_m, list_of_tx))

0/1712
1/1712
2/1712
3/1712
4/1712
5/1712
6/1712
7/1712
8/1712
9/1712
10/1712
11/1712
12/1712
13/1712
14/1712
15/1712
16/1712
17/1712
18/1712
19/1712
20/1712
21/1712
22/1712
23/1712
24/1712
25/1712
26/1712
27/1712
28/1712
29/1712
30/1712
31/1712
32/1712
33/1712
34/1712
35/1712
36/1712
37/1712
38/1712
39/1712
40/1712
41/1712
42/1712
43/1712
44/1712
45/1712
46/1712
47/1712
48/1712
49/1712
50/1712
51/1712
52/1712
53/1712
54/1712
55/1712
56/1712
57/1712
58/1712
59/1712
60/1712
61/1712
62/1712
63/1712
64/1712
65/1712
66/1712
67/1712
68/1712
69/1712
70/1712
71/1712
72/1712
73/1712
74/1712
75/1712
76/1712
77/1712
78/1712
79/1712
80/1712
81/1712
82/1712
83/1712
84/1712
85/1712
86/1712
87/1712
88/1712
89/1712
90/1712
91/1712
92/1712
93/1712
94/1712
95/1712
96/1712
97/1712
98/1712
99/1712
100/1712
101/1712
102/1712
103/1712
104/1712
105/1712
106/1712
107/1712
108/1712
109/1712
110/1712
111/1712
112/1712
113/1712
114/1712
115/1712
116/1712
117/1712
118/1712
119/1712
120/1712
121/1712
122/1712
123

In [86]:
# Inspect element 3 of the first mouse lookup result.
list_of_tx[0][3]

'ENSMUST00000055375.6'

In [87]:
# Keep only element 3 of each mouse lookup result and store it as that gene's dict_tx_m value.
# A lookup result of None is kept as None instead of raising TypeError on x[3].
# NOTE: despite its name, list_37 holds values for mouse genes here.
list_37 = [x[3] if x is not None else None for x in list_of_tx]
dict_tx_m.update(zip(list_gen_name_m, list_37))

In [None]:
# Take the dict_tx_h values from position 31015 onwards and show the first element of each.
# NOTE(review): 31015 is a hard-coded offset into the dictionary's insertion order --
# presumably the number of entries present before new genes were added; TODO confirm it
# still matches the current dict_tx_h.
list_add = list(dict_tx_h.values())[31015:]
[x[0] for x in list_add]

In [114]:
# For every entry of list_add, take element 3 of its first (x[0]) and second (x[1]) part,
# keeping None where a part is None, and write the resulting pairs back into dict_tx_h.
# NOTE(review): the human lookup loop in this notebook already stores get_tx_id(...)[3] in
# these two positions; if list_add holds such entries, indexing them with [3] again returns
# a single character of the ID string. Presumably this cell was written for a version of the
# loop that stored the full result -- TODO confirm before re-running.
# NOTE(review): zip() pairs list_gen_name_h with list_add by position, so both must have the
# same order and length.
list_37 = [x[0][3] if x[0] is not None else None for x in list_add]
list_38 = [x[1][3] if x[1] is not None else None for x in list_add]
list_of_tx_new = [[x,y] for x,y in zip(list_37,list_38)]
dict_tx_h.update(zip(list_gen_name_h, list_of_tx_new))

In [None]:
# Turn name_dict (gene ID -> gene name) into a two-column table: a one-row frame is built
# from the dict, transposed so the IDs become the index, the index is turned into a column,
# and the two columns are swapped so the name comes first.
# NOTE(review): this rebinds `df`, a name an earlier cell also uses for a different table.
df = pd.DataFrame([name_dict]).transpose().reset_index(drop=False).iloc[:,[1,0]]
df.columns = ['gene_name_h','gene_id_h']
# Distinct human gene names (spelled list_gene_name_h, not list_gen_name_h as used elsewhere).
list_gene_name_h = list(df['gene_name_h'].unique())

In [34]:
# Discard rows without a human gene name and renumber the index.
has_human_name = part_return['gene_name_h'].notna()
part_return = part_return[has_human_name].reset_index(drop=True)
part_return

Unnamed: 0,gene_name_h,gene_id_h_37,gene_id_h_version_37,gene_id_h_38,gene_id_h_version_38,gene_id_m,gene_name_m,homology_type,similarity
0,5S_rRNA,ENSG00000263172,ENSG00000263172.1,ENSG00000277488,ENSG00000277488.1,ENSMUSG00002076113,Gm55778,ortholog_one2many,56.3218
1,5_8S_rRNA,,,ENSG00000278294,ENSG00000278294.1,ENSMUSG00002075795,Gm54867,ortholog_one2many,75.6579
2,7SK,ENSG00000271818,ENSG00000271818.1,ENSG00000202198,ENSG00000202198.1,ENSMUSG00002076400,Gm54707,ortholog_one2many,58.3082
3,7SK,ENSG00000271818,ENSG00000271818.1,ENSG00000271394,ENSG00000271394.1,ENSMUSG00002075014,Gm55081,ortholog_one2one,33.6032
4,A1BG,ENSG00000121410,ENSG00000121410.7,ENSG00000121410,ENSG00000121410.12,ENSMUSG00000022347,A1bg,ortholog_one2one,45.6566
...,...,...,...,...,...,...,...,...,...
31972,ZYX,ENSG00000159840,ENSG00000159840.11,ENSG00000159840,ENSG00000159840.16,ENSMUSG00000029860,Zyx,ortholog_one2one,87.0629
31973,ZZEF1,ENSG00000074755,ENSG00000074755.10,ENSG00000074755,ENSG00000074755.15,ENSMUSG00000055670,Zzef1,ortholog_one2one,90.8815
31974,ZZZ3,ENSG00000036549,ENSG00000036549.8,ENSG00000036549,ENSG00000036549.13,ENSMUSG00000039068,Zzz3,ortholog_one2one,87.2647
31975,hsa-mir-423,,,ENSG00000266919,ENSG00000266919.3,ENSMUSG00000065518,Mir423,ortholog_one2many,28.7234


In [119]:
# Attach canonical transcript IDs using the dictionaries updated in this notebook.
# dict_tx_h values are read as [GRCh37 entry, GRCh38 entry]; gene names missing from a
# dictionary get None.
final_columns = [
    'gene_name_h', 'gene_id_h_37', 'gene_id_h_version_37', 'tx_id_canonical_h_37',
    'gene_id_h_38', 'gene_id_h_version_38', 'tx_id_canonical_h_38',
    'gene_name_m', 'gene_id_m', 'tx_id_canonical_m',
    'homology_type', 'similarity',
]
part_return = part_return.assign(
    tx_id_canonical_h_37=[dict_tx_h.get(name, [None])[0] for name in part_return['gene_name_h']],
    tx_id_canonical_h_38=[dict_tx_h.get(name, [None, None])[1] for name in part_return['gene_name_h']],
    tx_id_canonical_m=[dict_tx_m.get(name, None) for name in part_return['gene_name_m']],
)[final_columns]
part_return

Unnamed: 0,gene_name_h,gene_id_h_37,gene_id_h_version_37,tx_id_canonical_h_37,gene_id_h_38,gene_id_h_version_38,tx_id_canonical_h_38,gene_name_m,gene_id_m,tx_id_canonical_m,homology_type,similarity
0,5S_rRNA,ENSG00000263172,ENSG00000263172.1,ENST00000364415.1,ENSG00000277488,ENSG00000277488.1,,Gm55778,ENSMUSG00002076113,ENSMUST00020182636.1,ortholog_one2many,56.3218
1,5_8S_rRNA,,,,ENSG00000278294,ENSG00000278294.1,,Gm54867,ENSMUSG00002075795,ENSMUST00020182528.1,ortholog_one2many,75.6579
2,7SK,ENSG00000271818,ENSG00000271818.1,ENST00000606815.1,ENSG00000202198,ENSG00000202198.1,ENST00000365328.1,Gm54707,ENSMUSG00002076400,ENSMUST00020182185.1,ortholog_one2many,58.3082
3,7SK,ENSG00000271818,ENSG00000271818.1,ENST00000606815.1,ENSG00000271394,ENSG00000271394.1,ENST00000365328.1,Gm55081,ENSMUSG00002075014,ENSMUST00020182634.1,ortholog_one2one,33.6032
4,A1BG,ENSG00000121410,ENSG00000121410.7,ENST00000263100.3,ENSG00000121410,ENSG00000121410.12,ENST00000263100.8,A1bg,ENSMUSG00000022347,ENSMUST00000096418.5,ortholog_one2one,45.6566
...,...,...,...,...,...,...,...,...,...,...,...,...
31972,ZYX,ENSG00000159840,ENSG00000159840.11,ENST00000322764.5,ENSG00000159840,ENSG00000159840.16,ENST00000322764.10,Zyx,ENSMUSG00000029860,ENSMUST00000164375.4,ortholog_one2one,87.0629
31973,ZZEF1,ENSG00000074755,ENSG00000074755.10,ENST00000381638.2,ENSG00000074755,ENSG00000074755.15,ENST00000381638.7,Zzef1,ENSMUSG00000055670,ENSMUST00000207107.2,ortholog_one2one,90.8815
31974,ZZZ3,ENSG00000036549,ENSG00000036549.8,ENST00000370801.3,ENSG00000036549,ENSG00000036549.13,ENST00000370801.8,Zzz3,ENSMUSG00000039068,ENSMUST00000106100.9,ortholog_one2one,87.2647
31975,hsa-mir-423,,,,ENSG00000266919,ENSG00000266919.3,ENST00000586878.1,Mir423,ENSMUSG00000065518,ENSMUST00000083584.4,ortholog_one2many,28.7234


In [35]:
# Attach canonical transcript IDs using the dictionaries exposed by the h2m package
# (h2m.dict_tx_h / h2m.dict_tx_m). dict_tx_h values are read as
# [GRCh37 entry, GRCh38 entry]; gene names missing from a dictionary get None.
final_columns = [
    'gene_name_h', 'gene_id_h_37', 'gene_id_h_version_37', 'tx_id_canonical_h_37',
    'gene_id_h_38', 'gene_id_h_version_38', 'tx_id_canonical_h_38',
    'gene_name_m', 'gene_id_m', 'tx_id_canonical_m',
    'homology_type', 'similarity',
]
part_return = part_return.assign(
    tx_id_canonical_h_37=[h2m.dict_tx_h.get(name, [None])[0] for name in part_return['gene_name_h']],
    tx_id_canonical_h_38=[h2m.dict_tx_h.get(name, [None, None])[1] for name in part_return['gene_name_h']],
    tx_id_canonical_m=[h2m.dict_tx_m.get(name, None) for name in part_return['gene_name_m']],
)[final_columns]
part_return

Unnamed: 0,gene_name_h,gene_id_h_37,gene_id_h_version_37,tx_id_canonical_h_37,gene_id_h_38,gene_id_h_version_38,tx_id_canonical_h_38,gene_name_m,gene_id_m,tx_id_canonical_m,homology_type,similarity
0,5S_rRNA,ENSG00000263172,ENSG00000263172.1,ENST00000364415.1,ENSG00000277488,ENSG00000277488.1,,Gm55778,ENSMUSG00002076113,ENSMUST00020182636.1,ortholog_one2many,56.3218
1,5_8S_rRNA,,,,ENSG00000278294,ENSG00000278294.1,,Gm54867,ENSMUSG00002075795,ENSMUST00020182528.1,ortholog_one2many,75.6579
2,7SK,ENSG00000271818,ENSG00000271818.1,ENST00000606815.1,ENSG00000202198,ENSG00000202198.1,ENST00000365328.1,Gm54707,ENSMUSG00002076400,ENSMUST00020182185.1,ortholog_one2many,58.3082
3,7SK,ENSG00000271818,ENSG00000271818.1,ENST00000606815.1,ENSG00000271394,ENSG00000271394.1,ENST00000365328.1,Gm55081,ENSMUSG00002075014,ENSMUST00020182634.1,ortholog_one2one,33.6032
4,A1BG,ENSG00000121410,ENSG00000121410.7,ENST00000263100.3,ENSG00000121410,ENSG00000121410.12,ENST00000263100.8,A1bg,ENSMUSG00000022347,ENSMUST00000096418.5,ortholog_one2one,45.6566
...,...,...,...,...,...,...,...,...,...,...,...,...
31972,ZYX,ENSG00000159840,ENSG00000159840.11,ENST00000322764.5,ENSG00000159840,ENSG00000159840.16,ENST00000322764.10,Zyx,ENSMUSG00000029860,ENSMUST00000164375.4,ortholog_one2one,87.0629
31973,ZZEF1,ENSG00000074755,ENSG00000074755.10,ENST00000381638.2,ENSG00000074755,ENSG00000074755.15,ENST00000381638.7,Zzef1,ENSMUSG00000055670,ENSMUST00000207107.2,ortholog_one2one,90.8815
31974,ZZZ3,ENSG00000036549,ENSG00000036549.8,ENST00000370801.3,ENSG00000036549,ENSG00000036549.13,ENST00000370801.8,Zzz3,ENSMUSG00000039068,ENSMUST00000106100.9,ortholog_one2one,87.2647
31975,hsa-mir-423,,,,ENSG00000266919,ENSG00000266919.3,ENST00000586878.1,Mir423,ENSMUSG00000065518,ENSMUST00000083584.4,ortholog_one2many,28.7234


In [36]:
# Write the final homology table to CSV without the index column.
output_csv = '/Users/kexindong/Documents/GitHub/Output/h2m_database/Supp_Table_1_Homo_Genes.csv'
part_return.to_csv(output_csv, index=False)

## Save dictionaries

In [126]:
# save (write file)
with open('h2m-package/src/h2m/Data/human_name_dict.json','w') as json_file:
    json.dump(name_dict, json_file)

with open('h2m-package/src/h2m/Data/homo_dict.json','w') as json_file:
    json.dump(homo_dict, json_file)

with open('h2m-package/src/h2m/Data/dict_tx_m.json','w') as json_file:
    json.dump(dict_tx_m, json_file)

with open('h2m-package/src/h2m/Data/dict_tx_h.json','w') as json_file:
    json.dump(dict_tx_h, json_file)

with open('h2m-package/src/h2m/Data/homo_dict_mouse.json','w') as json_file:
    json.dump(homo_dict_mouse, json_file)