In [1]:
import os
import pandas as pd
import requests, sys
import numpy as np
from datetime import datetime
from collections import defaultdict
from zipfile import ZipFile
import gzip
import xmltodict
import lxml
from lxml import etree
from io import StringIO

In [21]:
def check_ensembl_symbol_helper(gene_id):
    print('checking ens symbol for ', gene_id)
    server = "https://rest.ensembl.org"
    ext = ''.join(['/xrefs/id/', gene_id, '?'])

    r = requests.get(server + ext, headers={"Content-Type": "application/json"})

    if not r.ok:
        r.raise_for_status()
        return None

    decoded = r.json()

    symbol_tmp = decoded[0]['display_id']

    if symbol_tmp is not None and symbol_tmp != gene_id:
        return symbol_tmp
    else:
        return None


def from_ens_2_entrez(gene_id):
    print('checking ncbi entrez id and symbol for ', gene_id)
    target_url = 'https://www.ncbi.nlm.nih.gov/gene/?term=' + gene_id
    response = requests.get(target_url)
    dict_data = xmltodict.parse(response.content)
    package = dict_data['html']['body']['div']['div']['form']['div'][0]['div'][3]['div']['div'][5]
    entrez_id = package['div'][0]['div'][0]['span']['#text'].split(' ')[2][:-1]
    name = package['div'][1]['div'][0]['div']['div']['dl']['dd'][0]['#text']
    return entrez_id, name


def get_human_orth_ncbi(gene_id):
    print('checking ncbi human orthlogus for ', gene_id)
    target_url = 'https://www.ncbi.nlm.nih.gov/gene/?term=' + gene_id
    response = requests.get(target_url)
    dict_data = xmltodict.parse(response.content)
    package = dict_data['html']['body']['div']['div']['form']['div'][0]['div'][3]['div']['div'][5]
    for item in package['div'][1]['div'][0]['div']['div']['dl']['dd'][7]['a']:
        if item['#text'] == 'human':
            human_entrez_id = item['@href'].split('/')[-1]
    return human_entrez_id


def parse_human_url(human_entrez_id):
    print('parsing ncbi human orthlogus for', human_entrez_id)
    target_url = 'https://www.ncbi.nlm.nih.gov/gene/' + human_entrez_id
    response = requests.get(target_url)
    dict_data = xmltodict.parse(response.content)
    package = dict_data['html']['body']['div']['div']['form']['div'][0]['div'][3]['div']['div'][5]['div'][1]['div'][0]

    human_symbol = package['div']['div']['dl']['dd'][0]['#text']
    hgnc = package['div']['div']['dl']['dd'][2]['a']['#text']
    ens_id = package['div']['div']['dl']['dd'][3]['a'][0]['#text']
    if len(ens_id.split(':')) == 2:
        ens_id = ens_id.split(':')[1]

    return human_symbol, hgnc, ens_id


def it_child(node):
    for element in node.iterchildren():
        if element.tag == 'a' and 'href' in element.attrib and element.attrib['href'].startswith('http://www.ncbi.nlm.nih.gov/entrez'):
            return element.text
        it_child(element)
        
def from_ens_2_entrez_use_enblorg(tmp_gene_id):
    target_url = 'https://uswest.ensembl.org/Gene/Summary?g=' + tmp_gene_id
    parser = etree.HTMLParser(recover=True)
    page = requests.get(target_url)
    html = page.content.decode("utf-8")
    tree = etree.parse(StringIO(html), parser=parser)
    root = tree.getroot()
    return it_child(root)


def check_human_gene_id(gene_id):
    server = "https://rest.ensembl.org"
    ext = "/homology/id/" + gene_id + "?type=orthologues;format=condensed"
    r = requests.get(server + ext, headers={"Content-Type": "application/json"})

    if not r.ok:
        r.raise_for_status()

    decoded = r.json()
    return decoded


def check_ensembl_symbol_helper(gene_id):
    print('checking ensembl symbol for', gene_id)
    try:
        server = "https://rest.ensembl.org"
        ext = ''.join(['/xrefs/id/', gene_id, '?'])

        r = requests.get(server + ext, headers={"Content-Type": "application/json"})

        if not r.ok:
            r.raise_for_status()
            sys.exit()

        decoded = r.json()

        symbol_tmp = decoded[0]['display_id']

        if symbol_tmp is not None and symbol_tmp != gene_id:
            return symbol_tmp
        else:
            return None
    except Exception:
        return None
def from_hgncid_to_hgncsymb(hgnc_id):
    server = 'https://rest.genenames.org/'
    ext = 'search/hgnc_id/' + str(hgnc_id.split(":")[-1])
    
    r = requests.get(server + ext)
    
    if not r.ok:
        r.raise_for_status()

#     decoded = r.content.decode()
    dict_data = xmltodict.parse(r.content)
    return dict_data['response']['result']['doc']['str'][1]['#text']
    
def human_from_entrez_to_ens(returned_entrez_id):
    server = 'https://www.ncbi.nlm.nih.gov/'
    ext = 'gene/' + str(returned_entrez_id)

#     parser = etree.HTMLParser(recover=True)
    page = requests.get(server + ext)
    
    if not page.ok:
        page.raise_for_status()
    dict_data = xmltodict.parse(page.content)
    package = dict_data['html']['body']['div']['div']['form']['div'][0]['div'][3]['div']['div'][5]
    returned_ensid = package['div'][1]['div'][0]['div']['div']['dl']['dd'][3]['a'][0]['#text']
    
    return returned_ensid.split(':')[-1]

def it_child2(node, found):
    if node == None:
        return 
    if len(found) == 0:
        for element in node.iterchildren():
            if 'data-section' in element.attrib and element.attrib['data-section'] == 'Featured':
                found.append(element.attrib['data-item-id'])
            it_child2(element, found)
    else:
        return found
    return found
        
def human_from_symbol_2entrez(human_symbol):
    server = 'https://www.ncbi.nlm.nih.gov/'
    ext = 'search/all/?term=' + 'TDH'
    parser = etree.HTMLParser(recover=True)
    page = requests.get(server + ext, headers={"Content-Type": "application/json"})

    if not page.ok:
        page.raise_for_status()

    html = page.content.decode("utf-8")
    tree = etree.parse(StringIO(html), parser=parser)
    root = tree.getroot()
     
    returned_entrez_id = None
    returned_ens_id = None
    
    try:
        found = []
        returned_entrez_id = it_child2(root,found)[0].split(':')[-1]
    except Exception:
        pass
    
    try:
        returned_ens_id = human_from_entrez_to_ens(returned_entrez_id)
    except Exception:
        pass
    
    return returned_entrez_id, returned_ens_id 

In [107]:
test = pd.read_csv('/Users/liulihe95/EnrichKitDB/data/tmp/id_mapper/id_mapper_sus.txt')
test2 = test.iloc[32200:32215,]
test2 = test2.astype(str)
test2.dtypes
# test2.dtypes()

species            object
gene_id            object
ensembl_symbol     object
entrez_id          object
ncbi_symbol        object
vgnc_id            object
vgnc_symbol        object
hgnc_orthologs     object
human_gene_id      object
human_entrez_id    object
hgnc_symbol        object
dtype: object

In [108]:
id_mapper_final = test2
id_mapper_final

Unnamed: 0,species,gene_id,ensembl_symbol,entrez_id,ncbi_symbol,vgnc_id,vgnc_symbol,hgnc_orthologs,human_gene_id,human_entrez_id,hgnc_symbol
32200,5,ENSSSCG00000037847,MOB2,110259212.0,MOB2,VGNC:90291,MOB2,HGNC:24904,ENSG00000182208,81532.0,MOB2
32201,5,ENSSSCG00000037848,,110255232.0,LOC110255232,,,,,,
32202,5,ENSSSCG00000037854,L3HYPDH,110262303.0,L3HYPDH,VGNC:89612,L3HYPDH,HGNC:20488,ENSG00000126790,112849.0,L3HYPDH
32203,5,ENSSSCG00000037856,EDN2,102157561.0,EDN2,VGNC:87548,EDN2,HGNC:3177,ENSG00000127129,1907.0,EDN2
32204,5,ENSSSCG00000037858,,100518500.0,LOC100518500,,,,,,
32205,5,ENSSSCG00000037865,SOX18,100049667.0,SOX18,VGNC:95510,SOX18,HGNC:11194,ENSG00000203883,54345.0,SOX18
32206,5,ENSSSCG00000037866,MAGEA10,110257696.0,MAGEA10,,,,,,
32207,5,ENSSSCG00000037868,NPBWR2,110257396.0,NPBWR2,,,,,,
32208,5,ENSSSCG00000037869,NDUFA5,100525371.0,NDUFA5,VGNC:110309,NDUFA5,HGNC:7688,ENSG00000128609,4698.0,NDUFA5
32209,5,ENSSSCG00000037873,,110258905.0,LOC110258905,,,,,,


In [109]:
out = []
for index, row in id_mapper_final.iterrows():
# row = id_mapper_final.iloc[9]
    if row['gene_id'] != 'nan':
        if row['ensembl_symbol'] == 'nan':
            # check ensembl_symbol
            try:
                row['ensembl_symbol'] = check_ensembl_symbol_helper(row['gene_id'])

            except Exception:
                pass

        # check entrez id
        if row['entrez_id'] == 'nan':
            returned_entrez_id = None
            returned_entrez_symbol = None

            # ens 2 entrez: trust ncbi
            try:
                returned_entrez_id, returned_entrez_symbol = from_ens_2_entrez(row['gene_id'])

            except Exception:
                pass

            if returned_entrez_id == None or returned_entrez_symbol == None:

                # ens 2 entrez: trust ensembl
                try:
                    returned_entrez_id = from_ens_2_entrez_use_enblorg(row['gene_id'] )
                    _, returned_entrez_symbol = from_ens_2_entrez(returned_entrez_id)

                except Exception:
                    pass

            if not returned_entrez_id is None and not returned_entrez_symbol is None:
                row['entrez_id'] = returned_entrez_id
                row['ncbi_symbol'] = returned_entrez_symbol

    # check human orthologus
    if row['entrez_id'] != 'nan' and row['human_gene_id'] == 'nan':

        try:
            human_entrez_id = get_human_orth_ncbi(str(int(float(row['entrez_id']))))

            row['human_entrez_id'] = human_entrez_id

            human_symbol, hgnc, ens_id = parse_human_url(str(human_entrez_id))

            row['hgnc_orthologs'] = hgnc

            row['human_gene_id'] = ens_id

            row['hgnc_symbol'] = human_symbol

        except Exception:
            pass

    # check hgnc id and hgnc symbol
    if row['hgnc_orthologs'] != 'nan' and row['hgnc_symbol'] == 'nan':
        try:
            row['hgnc_symbol'] = from_hgncid_to_hgncsymb(row['hgnc_orthologs'])
        except Exception:
            pass

    if row['hgnc_symbol'] != 'nan' and (row['human_gene_id'] == 'nan' or row['human_entrez_id'] == 'nan'):

        returned_entrez_id_human, returned_ens_id_human = human_from_symbol_2entrez(row['hgnc_symbol'])

        if not returned_entrez_id_human is None:
            row['human_entrez_id'] = returned_entrez_id_human
        else:
            pass

        if not returned_ens_id_human is None:
            row['human_gene_id'] = returned_ens_id_human
        else:
            pass
# pd.DataFrame(row).T
# out.append(row)

# pd.DataFrame(out)

# test = pd.DataFrame(row)
# test.to_csv(
#     os.path.join('test',''.join([str(index),'.txt'])
#     ), index=False, encoding='utf-8')

#     id_mapper_final.to_csv(''.join(['new_', data_file_path]), index=False, encoding='utf-8')


checking ensembl symbol for ENSSSCG00000037848
checking ncbi human orthlogus for  110255232
checking ensembl symbol for ENSSSCG00000037858
checking ncbi human orthlogus for  100518500
checking ncbi human orthlogus for  110257696
parsing ncbi human orthlogus for 4109
checking ncbi human orthlogus for  110257396
checking ensembl symbol for ENSSSCG00000037873
checking ncbi human orthlogus for  110258905
checking ncbi human orthlogus for  100515603
checking ensembl symbol for ENSSSCG00000037877
checking ncbi human orthlogus for  110257991
checking ncbi human orthlogus for  104796094


In [110]:
2190 * 150

328500

In [106]:
id_mapper_final

Unnamed: 0,species,gene_id,ensembl_symbol,entrez_id,ncbi_symbol,vgnc_id,vgnc_symbol,hgnc_orthologs,human_gene_id,human_entrez_id,hgnc_symbol
32200,3,ENSGALG00010004356,TRAPPC8,421095.0,TRAPPC8,,,,,,
32201,3,ENSGALG00010004358,GC,395696.0,GC,,,,,,
32202,3,ENSGALG00010004359,APOV1,396476.0,APOV1,,,,,,
32203,3,ENSGALG00010004361,NPFFR2,428759.0,NPFFR2,,,,,,
32204,3,ENSGALG00010004362,FBXL4,421798.0,FBXL4,,,,,,
32205,3,ENSGALG00010004364,TSHR,428900.0,TSHR,,,,,,
32206,3,ENSGALG00010004365,TPD52,770339.0,TPD52,,,,,,
32207,3,ENSGALG00010004366,UNC5C,395101.0,UNC5C,,,,,,
32208,3,ENSGALG00010004367,TSPAN13,420595.0,TSPAN13,,,,,,
32209,3,ENSGALG00010004368,GJD4,420465.0,GJD4,,,,,,


In [10]:
# file_list = os.listdir('data/tmp/id_mapper/')
# out = []
# for item in file_list:
#     if item.startswith('id_mapper'):
#         print(item)
#         tmp_df = pd.read_csv(os.path.join('data/tmp/id_mapper/',item))
#         out.append(tmp_df)

# all_id_mapper = pd.concat(out)
# all_id_mapper = all_id_mapper.astype(str)
# all_id_mapper_filter = all_id_mapper[all_id_mapper['gene_id'] != 'nan']
# all_id_mapper_filter.reset_index(drop=True, inplace=True)
# all_id_mapper_filter

In [11]:
# file_list = os.listdir('data/tmp/id_mapper/')
# out = []
# for item in file_list:
#     if item.startswith('id_mapper'):
#         print(item)
#         tmp_df = pd.read_csv(os.path.join('data/tmp/id_mapper/',item))
#         out.append(tmp_df)

# all_id_mapper = pd.concat(out)
# all_id_mapper = all_id_mapper.astype(str)
# all_id_mapper_filter = all_id_mapper[all_id_mapper['gene_id'] != 'nan']
# all_id_mapper_filter.reset_index(drop=True, inplace=True)
# all_id_mapper_filter

# # count = 0
# for index, row in all_id_mapper_filter.iterrows():
#     if index % 2000 == 0:
#         print('mark - ', index)
# #     print(index)
# #     count += 1
# #     if count < 10:
#     tmp_row_df = row.to_frame().T
#     tmp_row_df.to_csv(os.path.join('id_raw',''.join([str(index),'.txt'])), index=False, encoding='utf-8')
# #     else:
# #         break

checking ens symbol for  ENSOARG00000000038
checking ens symbol for  ENSOARG00000000039
checking ncbi entrez id and symbol for  ENSOARG00000000039
checking ens symbol for  ENSOARG00000000041
checking ncbi entrez id and symbol for  ENSOARG00000000041
checking ens symbol for  ENSOARG00000000044
checking ncbi entrez id and symbol for  ENSOARG00000000044
checking ens symbol for  ENSOARG00000000045
checking ncbi entrez id and symbol for  ENSOARG00000000045
checking ens symbol for  ENSOARG00000000046
checking ens symbol for  ENSOARG00000000048
checking ens symbol for  ENSOARG00000000049
checking ncbi entrez id and symbol for  ENSOARG00000000049
checking ens symbol for  ENSOARG00000000050
checking ncbi entrez id and symbol for  ENSOARG00000000050
checking ens symbol for  ENSOARG00000000051
checking ncbi entrez id and symbol for  ENSOARG00000000051


In [9]:
id_mapper_final

Unnamed: 0,gene_id,ensembl_symbol,entrez_id,ncbi_symbol,vgnc_id,vgnc_symbol,hgnc_orthologs,human_gene_id,human_entrez_id,hgnc_symbol
0,ENSOARG00000000038,FIBP,101110717.0,FIBP,,,HGNC:HGNC:3705,ENSG00000172500,9158.0,FIBP
1,ENSOARG00000000039,LOC101104933,,,,,,,,
2,ENSOARG00000000041,,,,,,,,,
3,ENSOARG00000000044,IL2,,,,,,,,
4,ENSOARG00000000045,LOC101105186,,,,,,,,
5,ENSOARG00000000046,LHX1,101109722.0,LHX1,,,HGNC:HGNC:6593,ENSG00000273706,3975.0,LHX1
6,ENSOARG00000000048,PIGS,101120255.0,PIGS,,,HGNC:HGNC:14937,ENSG00000087111,94005.0,PIGS
7,ENSOARG00000000049,,,,,,,,,
8,ENSOARG00000000050,LOC101105686,,,,,,,,
9,ENSOARG00000000051,,,,,,,,,


In [20]:

server = "https://rest.ensembl.org"
ext = "/lookup/id/ENSCHIG00000000043?"
 
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
 
if not r.ok:
  r.raise_for_status()
  sys.exit()
 
decoded = r.json()
decoded

{'end': 29502,
 'canonical_transcript': 'ENSCHIT00000000044.1',
 'biotype': 'protein_coding',
 'seq_region_name': 'LWLT01000149.1',
 'logic_name': 'ensembl',
 'db_type': 'core',
 'strand': 1,
 'id': 'ENSCHIG00000000043',
 'version': 1,
 'assembly_name': 'ARS1',
 'object_type': 'Gene',
 'species': 'capra_hircus',
 'source': 'ensembl',
 'start': 20721}

In [24]:
def check_ens_symbol(ens_id):
    server = "https://rest.ensembl.org"
    ext = "/lookup/id/" + ens_id + "?"
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
 
    if r.ok:
        decoded = r.json()
#         r.raise_for_status()
#         sys.exit()
    if 'display_name' in decoded:
        return decoded['display_name']
    else:
        return None

In [29]:
t = check_ens_symbol('ENSOARG00000008127')
t is None

True

In [19]:
https://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&list_uids=101110717

SyntaxError: invalid syntax (<ipython-input-19-94872ea8bea4>, line 1)

In [39]:
# bta - 'display_name'
# cap - 'display_name'
# sus - 'display_name'
# ovi - 'display_name'



# decoded
# for item in decoded:
#     print(item)

{'canonical_transcript': 'ENSCHIT00000000525.1',
 'source': 'ensembl',
 'object_type': 'Gene',
 'display_name': 'SH2D4B',
 'logic_name': 'ensembl',
 'version': 1,
 'description': 'SH2 domain containing 4B [Source:HGNC Symbol;Acc:HGNC:31440]',
 'seq_region_name': '28',
 'start': 9842112,
 'end': 9922991,
 'assembly_name': 'ARS1',
 'strand': -1,
 'id': 'ENSCHIG00000000351',
 'biotype': 'protein_coding',
 'species': 'capra_hircus',
 'db_type': 'core'}

In [74]:
# r = requests.get('https://www.ncbi.nlm.nih.gov/gene/?term=ENSOARG00000019090', headers={ "Content-Type" : "application/json"})
target_url = 'https://www.ncbi.nlm.nih.gov/gene/?term=' + 'ENSOARG00000008832'
response = requests.get(target_url)
dict_data = xmltodict.parse(response.content)
package = dict_data['html']['body']['div']['div']['form']['div'][0]['div'][3]['div']['div'][5]
entrez_id = package['div'][0]['div'][0]['span']['#text'].split(' ')[2][:-1]
name = package['div'][1]['div'][0]['div']['div']['dl']['dd'][0]['#text']
print('entrez_id:',entrez_id.split(' ')[0], 'gene symbol: ', name.split(' ')[0])

entrez_id: 101105454 gene symbol:  SIRT3


In [None]:
'http://uswest.ensembl.org/Ovis_aries/Gene/Summary?g=ENSOARG00000000038;r=21:43305834-43310013;t=ENSOART00000000039'

In [294]:
# target_url = 'https://uswest.ensembl.org/Multi/Search/Results?q=ENSOARG00000000038;site=ensembl'
target_url = 'https://uswest.ensembl.org/Gene/Summary?g=ENSOARG00000000041'
# target_url = 'http://uswest.ensembl.org/Ovis_aries/Gene/Summary?g=ENSOARG00000000038;r=21:43305834-43310013;t=ENSOART00000000039'
# response = requests.get(target_url)
# dir(response)
# for line in response.iter_lines():
#     print(line)
# t = response.content.decode()
# t
# dict_data = xmltodict.parse(response.text)
# response.text.split('\n')

page = requests.get(target_url)
# page.content.decode()
# parser = etree.HTMLParser(recover=True)

# html = etree.HTML(page.content, parser)


# Decode the page content from bytes to string
html = page.content.decode("utf-8")

# Create your etree with a StringIO object which functions similarly
# to a fileHandler
tree = etree.parse(StringIO(html), parser=parser)
root = tree.getroot()
root
# # Call this function and pass in your tree
# def get_links(tree):
#     # This will get the anchor tags <a href...>
#     refs = tree.xpath("//a")
#     # Get the url from the ref
#     links = [link.get('href', '') for link in refs]
#     # Return a list that only ends with .com.br
#     return [l for l in links if l.endswith('.com.br')]


# # Example call
# links = get_links(tree)
def it_child(node):
    for element in node.iterchildren():
        if element.tag == 'a' and 'href' in element.attrib and element.attrib['href'].startswith('http://www.ncbi.nlm.nih.gov/entrez'): 
            print(element.text)
        it_child(element)
        
it_child(root)
    

<Element html at 0x7fe6e087a5f0>