# Usage of UniRef search (Including ID Mapping)

In [1]:
import os
import requests
import re
from bs4 import BeautifulSoup
import json
import numpy as np
import pandas as pd
import pprint

In [2]:
from locus_search.id_mapping_tools import *
from locus_search.UniRef_search_tools import *

In [None]:
# Change the working directory to the parent folder; the tables read later in
# this notebook use relative 'outputs/...' paths, presumably resolved from there.
# NOTE(review): the move is relative, so re-running this cell goes up one more level.
%cd ..

## Ensembl

In [None]:
# Disabled example, kept as a bare string literal so none of it runs: it looks up
# the external links of one accession and calls the NCBI and/or Ensembl pipeline
# with scope=5. The next cell reads a saved Ensembl gene table instead —
# presumably the file such a pipeline run wrote; confirm.
"""
query = 'P74258'
tag_GeneID, in_NCBI, tag_Ensembl, in_Ensembl = check_external_links(query)
if (in_NCBI):   # False
    df_output_NCBI = NCBI_pipeline(tag_GeneID, scope=5, update=False)
if (in_Ensembl):    # True
    df_output_Ensembl = Ensembl_pipeline(tag_Ensembl, scope=5, update=False)
"""

In [4]:
# Load the saved Ensembl gene table (first row is the header).
df_Ensembl = pd.read_csv(
    'outputs/gene_table_ensembl/gt_synechocystis_sp_pcc_6803_gca_000009725_Chromosome.csv',
    header=0,
)

# Strand of the target gene; .item() raises unless exactly one row matches.
is_target_gene = df_Ensembl['gene_id'] == 'BAA18352'
strand = df_Ensembl.loc[is_target_gene, 'strand'].item()

# Keep only the genes on that strand and renumber the rows from 0; the previous
# row labels are kept as an 'index' column.
on_same_strand = df_Ensembl['strand'] == strand
df_Ensembl_filtered = df_Ensembl[on_same_strand].reset_index()

# Row label of the target gene inside the strand-filtered table.
is_target_row = df_Ensembl_filtered['gene_id'] == 'BAA18352'
idx = df_Ensembl_filtered[is_target_row].index.item()

# Label-based slice around the target gene; .loc includes both endpoints, so
# this shows up to 5 rows before, the target row, and up to 5 rows after.
window_start = max(idx - 5, 0)
window_stop = min(idx + 5, len(df_Ensembl_filtered))
df_Ensembl_filtered.loc[window_start:window_stop]   # same as df_outputs_Ensembl

Unnamed: 0,index,gene_id,start,end,strand,description
856,1759,BAA18339,1934307,1935386,1,43 kD bacteriochlorophyll synthase subunit
857,1765,BAA18345,1939865,1940506,1,
858,1766,BAA18346,1940549,1942087,1,sodium-coupled permease
859,1767,BAA18347,1942234,1942701,1,
860,1771,BAA18351,1945518,1947179,1,glycerol-3-phosphate dehydrogenase
861,1772,BAA18352,1947325,1948824,1,"alpha,alpha-trehalose-phosphate synthase"
862,1783,BAA18363,1959436,1961079,1,alpha-isopropylmalate synthase
863,1787,BAA18367,1962705,1965524,1,
864,1788,BAA18368,1965485,1965949,1,
865,1789,BAA18369,1966146,1969118,1,delta-1-pyrroline-5-carboxylate dehydrogenase


In [5]:
# Gene IDs of the rows around the target gene (label slice, both ends included).
window_rows = df_Ensembl_filtered.loc[
    max(idx - 5, 0):min(idx + 5, len(df_Ensembl_filtered))
]
query_list = window_rows['gene_id'].to_list()

In [6]:
# Show the gene IDs that are passed to the ID-mapping step in the next cell.
query_list

['BAA18339',
 'BAA18345',
 'BAA18346',
 'BAA18347',
 'BAA18351',
 'BAA18352',
 'BAA18363',
 'BAA18367',
 'BAA18368',
 'BAA18369',
 'BAA18372']

ID Mapping

In [6]:
# Map the gene IDs in query_list to UniProtKB accessions.
# NOTE(review): 'Ensembl_Genomes' looks like the name of the database the IDs
# come from — confirm against get_UniProtKB_accession.
UniProtKB_accession_list = get_UniProtKB_accession(query_list, 'Ensembl_Genomes')

Fetched: 11 / 11


In [7]:
# Show the accessions returned for query_list (one entry per queried gene ID).
UniProtKB_accession_list

['Q55087',
 'P74251',
 'P74252',
 'P74253',
 'P74257',
 'P74258',
 'P74269',
 'P74273',
 'P74274',
 'P74275',
 'P74278']

In [8]:
# Look up a UniRef cluster name for every accession in the list.
cluster_name_list = UniRef_pipeline(
    UniProtKB_accession_list,
    identity=0.5,
    update=False,
)

- identity : The sequence identity threshold of a UniRef cluster. (one of [0.5, 0.9, 1.0], default = 0.5)

- update : Whether to search again for previously searched items. (bool, default = False)

In [9]:
# Show the cluster names returned by UniRef_pipeline.
cluster_name_list

['Geranylgeranyl diphosphate reductase',
 'Signal peptide protein',
 'Sodium-coupled permease',
 'CHRD domain-containing protein',
 'Glycerol-3-phosphate dehydrogenase',
 'Glucosylglycerol-phosphate synthase',
 '(R)-citramalate synthase',
 'ParA family protein',
 'Doublecortin domain-containing protein',
 'L-glutamate gamma-semialdehyde dehydrogenase',
 'Putative transposase for insertion sequence element IS4SA']

In [10]:
# Take the same window of rows around the target gene and attach the mapping
# results as two new columns. Both lists must have one entry per row of the
# window, otherwise pandas raises ValueError.
window_rows = df_Ensembl_filtered.loc[
    max(idx - 5, 0):min(idx + 5, len(df_Ensembl_filtered))
]
df_Ensembl_filtered_added_UniRef50 = window_rows.assign(**{
    'UniProtKB accession': UniProtKB_accession_list,
    'UniRef50 cluster': cluster_name_list,
})

In [11]:
# Show the window of Ensembl genes with the accession and cluster columns added.
df_Ensembl_filtered_added_UniRef50

Unnamed: 0,index,gene_id,start,end,strand,description,UniProtKB accession,UniRef50 cluster
856,1759,BAA18339,1934307,1935386,1,43 kD bacteriochlorophyll synthase subunit,Q55087,Geranylgeranyl diphosphate reductase
857,1765,BAA18345,1939865,1940506,1,,P74251,Signal peptide protein
858,1766,BAA18346,1940549,1942087,1,sodium-coupled permease,P74252,Sodium-coupled permease
859,1767,BAA18347,1942234,1942701,1,,P74253,CHRD domain-containing protein
860,1771,BAA18351,1945518,1947179,1,glycerol-3-phosphate dehydrogenase,P74257,Glycerol-3-phosphate dehydrogenase
861,1772,BAA18352,1947325,1948824,1,"alpha,alpha-trehalose-phosphate synthase",P74258,Glucosylglycerol-phosphate synthase
862,1783,BAA18363,1959436,1961079,1,alpha-isopropylmalate synthase,P74269,(R)-citramalate synthase
863,1787,BAA18367,1962705,1965524,1,,P74273,ParA family protein
864,1788,BAA18368,1965485,1965949,1,,P74274,Doublecortin domain-containing protein
865,1789,BAA18369,1966146,1969118,1,delta-1-pyrroline-5-carboxylate dehydrogenase,P74275,L-glutamate gamma-semialdehyde dehydrogenase


## NCBI

In [None]:
# Disabled example, kept as a bare string literal so none of it runs: it looks up
# the external links of one accession and calls the NCBI and/or Ensembl pipeline
# with scope=5. The next cell reads a saved NCBI gene table instead.
# NOTE(review): the "# False" / "# True" annotations are identical to the Ensembl
# example and look copied over; for this NCBI section they are probably the
# other way round — confirm.
"""
query = 'P12345'
tag_GeneID, in_NCBI, tag_Ensembl, in_Ensembl = check_external_links(query)
if (in_NCBI):   # False
    df_output_NCBI = NCBI_pipeline(tag_GeneID, scope=5, update=False)
if (in_Ensembl):    # True
    df_output_Ensembl = Ensembl_pipeline(tag_Ensembl, scope=5, update=False)
"""

In [13]:
# Load the saved NCBI gene table (tab-separated, first row is the header).
# GeneID is pinned to str because it is compared with a string literal below
# and the IDs are later passed on as strings. Left to dtype inference, an
# all-numeric column is read as integers (and a mixed one can end up partly
# int, partly str), in which case the comparison matches nothing and .item()
# raises.
df_NCBI = pd.read_table(
    'outputs/gene_table_ncbi/gt_NC_067378_1.tsv',
    header=0,
    dtype={'GeneID': str},
)

# Keep protein-coding genes only and renumber the rows from 0.
df_NCBI_filtered = df_NCBI[df_NCBI['protein_coding'] == 1].reset_index(drop=True)

# Row label of the target gene; .item() raises unless exactly one row matches.
idx = df_NCBI_filtered[df_NCBI_filtered['GeneID'] == '100348732'].index.item()

# Label-based slice around the target gene; .loc includes both endpoints, so
# this shows up to 5 rows before, the target row, and up to 5 rows after.
df_NCBI_filtered.loc[max(idx-5, 0) : min(idx+5, len(df_NCBI_filtered))] # same as df_outputs_NCBI

Unnamed: 0,start,end,gene_name,GeneID,description,protein_coding
181,14326371,14314551,LOC100338732,100338732,GINS complex subunit 3,1
182,14408555,14369895,LOC108176730,108176730,NDRG family member 4,1
183,14416308,14409348,LOC100338978,100338978,"SET domain containing 6, protein lysine methyl...",1
184,14413471,14502663,LOC100348223,100348223,CCR4-NOT transcription complex subunit 1,1
185,14530282,14549184,LOC100348474,100348474,solute carrier family 38 member 7,1
186,14555621,14577537,LOC100348732,100348732,glutamic-oxaloacetic transaminase 2,1
187,17294490,17712444,LOC100348982,100348982,cadherin 8,1
188,18430070,18431011,NPM1,100328693,"nucleophosmin (nucleolar phosphoprotein B23, n...",1
189,19327295,19341992,LOC127491314,127491314,uncharacterized LOC127491314,1
190,19791811,19791344,LOC100340229,100340229,U1 small nuclear ribonucleoprotein C-like,1


In [14]:
# GeneIDs of the rows around the target gene (label slice, both ends included).
window_rows = df_NCBI_filtered.loc[
    max(idx - 5, 0):min(idx + 5, len(df_NCBI_filtered))
]
query_list = window_rows['GeneID'].to_list()

In [15]:
# Show the GeneIDs that are passed to the ID-mapping step in the next cell.
query_list

['100338732',
 '108176730',
 '100338978',
 '100348223',
 '100348474',
 '100348732',
 '100348982',
 '100328693',
 '127491314',
 '100340229',
 '100349487']

In [16]:
# Map the GeneIDs in query_list to UniProtKB accessions.
# NOTE(review): 'GeneID' looks like the name of the database the IDs come
# from — confirm against get_UniProtKB_accession.
UniProtKB_accession_list = get_UniProtKB_accession(query_list, 'GeneID')

Fetched: 6 / 6


In [17]:
# Show the accessions returned for query_list.
# NOTE(review): None entries presumably mark GeneIDs for which no accession was
# found — confirm against get_UniProtKB_accession.
UniProtKB_accession_list

[None,
 None,
 None,
 'G1SRZ8',
 'G1SKK6',
 'P12345',
 'A0A5F9DGY6',
 'B7NZF9',
 None,
 None,
 'G1TCU4']

In [18]:
# Look up a UniRef cluster name for every accession in the list.
# NOTE(review): the list can contain None entries here; UniRef_pipeline is
# presumably expected to pass them through — confirm.
cluster_name_list = UniRef_pipeline(
    UniProtKB_accession_list,
    identity=0.5,
    update=False,
)

In [19]:
# Show the cluster names returned by UniRef_pipeline.
cluster_name_list

[None,
 None,
 None,
 'CCR4-NOT transcription complex subunit 1',
 'Sodium-coupled neutral amino acid transporter 7',
 'Aspartate aminotransferase, mitochondrial',
 'Cadherin 8',
 'Nucleophosmin',
 None,
 None,
 'Cadherin-11']

In [20]:
# Take the same window of rows around the target gene and attach the mapping
# results as two new columns. Both lists must have one entry per row of the
# window, otherwise pandas raises ValueError.
window_rows = df_NCBI_filtered.loc[
    max(idx - 5, 0):min(idx + 5, len(df_NCBI_filtered))
]
df_NCBI_filtered_added_UniRef50 = window_rows.assign(**{
    'UniProtKB accession': UniProtKB_accession_list,
    'UniRef50 cluster': cluster_name_list,
})

In [21]:
# Show the window of NCBI genes with the accession and cluster columns added.
df_NCBI_filtered_added_UniRef50

Unnamed: 0,start,end,gene_name,GeneID,description,protein_coding,UniProtKB accession,UniRef50 cluster
181,14326371,14314551,LOC100338732,100338732,GINS complex subunit 3,1,,
182,14408555,14369895,LOC108176730,108176730,NDRG family member 4,1,,
183,14416308,14409348,LOC100338978,100338978,"SET domain containing 6, protein lysine methyl...",1,,
184,14413471,14502663,LOC100348223,100348223,CCR4-NOT transcription complex subunit 1,1,G1SRZ8,CCR4-NOT transcription complex subunit 1
185,14530282,14549184,LOC100348474,100348474,solute carrier family 38 member 7,1,G1SKK6,Sodium-coupled neutral amino acid transporter 7
186,14555621,14577537,LOC100348732,100348732,glutamic-oxaloacetic transaminase 2,1,P12345,"Aspartate aminotransferase, mitochondrial"
187,17294490,17712444,LOC100348982,100348982,cadherin 8,1,A0A5F9DGY6,Cadherin 8
188,18430070,18431011,NPM1,100328693,"nucleophosmin (nucleolar phosphoprotein B23, n...",1,B7NZF9,Nucleophosmin
189,19327295,19341992,LOC127491314,127491314,uncharacterized LOC127491314,1,,
190,19791811,19791344,LOC100340229,100340229,U1 small nuclear ribonucleoprotein C-like,1,,
