# Usage of UniRef search (Including ID Mapping)

In [1]:
import os
import requests
import re
from bs4 import BeautifulSoup
import json
import numpy as np
import pandas as pd
import pprint

In [2]:
from locus_search.id_mapping_tools import *
from locus_search.UniRef_search_tools import *

In [None]:
# Move the working directory up one level; presumably this is what makes the
# relative 'outputs/...' paths read further down resolve — confirm against the
# repository layout.
# NOTE: not idempotent — each re-run of this cell climbs one more directory,
# so run it only once per kernel session.
%cd ..

## Ensembl

In [None]:
"""
query = 'P74258'
tag_GeneID, in_NCBI, tag_Ensembl, in_Ensembl = check_external_links(query)
if (in_NCBI):   # False
    df_output_NCBI = NCBI_pipeline(tag_GeneID, scope=5, update=False)
if (in_Ensembl):    # True
    df_output_Ensembl = Ensembl_pipeline(tag_Ensembl, scope=5, update=False)
"""

In [5]:
# Gene of interest and the number of same-strand neighbours to show on each side.
ensembl_target_id = 'BAA18352'
ensembl_scope = 5

df_Ensembl = pd.read_csv('outputs/Ensembl/gene_table/gt_synechocystis_sp_pcc_6803_gca_000009725_Chromosome.csv', header=0)

# Strand of the target gene. .item() raises ValueError unless exactly one row matches.
strand = df_Ensembl.loc[df_Ensembl['gene_id'] == ensembl_target_id, 'strand'].item()

# Keep only genes on that strand and renumber the rows 0..n-1, so that the
# row label of the target is also its position among same-strand genes.
df_Ensembl_filtered = df_Ensembl[df_Ensembl['strand'] == strand].reset_index(drop=True)
idx = df_Ensembl_filtered[df_Ensembl_filtered['gene_id'] == ensembl_target_id].index.item()

# .loc slices by label and includes BOTH endpoints, so this shows up to
# 2 * ensembl_scope + 1 rows. The upper bound is clamped to the last valid
# label (len - 1); the result is the same as clamping to len.
# Expected to match df_output_Ensembl from the disabled Ensembl_pipeline call above.
df_Ensembl_filtered.loc[max(idx - ensembl_scope, 0) : min(idx + ensembl_scope, len(df_Ensembl_filtered) - 1)]

Unnamed: 0,start,end,strand,gene_id,description
856,1934307,1935386,1,BAA18339,43 kD bacteriochlorophyll synthase subunit
857,1939865,1940506,1,BAA18345,
858,1940549,1942087,1,BAA18346,sodium-coupled permease
859,1942234,1942701,1,BAA18347,
860,1945518,1947179,1,BAA18351,glycerol-3-phosphate dehydrogenase
861,1947325,1948824,1,BAA18352,"alpha,alpha-trehalose-phosphate synthase"
862,1959436,1961079,1,BAA18363,alpha-isopropylmalate synthase
863,1962705,1965524,1,BAA18367,
864,1965485,1965949,1,BAA18368,
865,1966146,1969118,1,BAA18369,delta-1-pyrroline-5-carboxylate dehydrogenase


In [6]:
# Gene IDs of the target and its same-strand neighbours: the same
# label-inclusive window as displayed above, selecting rows and column in one .loc call.
query_list = (
    df_Ensembl_filtered
    .loc[max(idx - 5, 0) : min(idx + 5, len(df_Ensembl_filtered)), 'gene_id']
    .tolist()
)

In [7]:
# Show the collected gene IDs (the target gene plus its same-strand neighbours).
query_list

['BAA18339',
 'BAA18345',
 'BAA18346',
 'BAA18347',
 'BAA18351',
 'BAA18352',
 'BAA18363',
 'BAA18367',
 'BAA18368',
 'BAA18369',
 'BAA18372']

ID Mapping

In [8]:
# Map every gene ID in query_list to a UniProtKB accession.
# The second argument names the database the IDs come from; presumably it is a
# UniProt ID-mapping source name — confirm against the helper's definition.
UniProtKB_accession_list = get_UniProtKB_accession(query_list, 'Ensembl_Genomes')

Fetched: 11 / 11


In [9]:
# Show the mapped accessions. They are expected to be in the same order as
# query_list, because a later cell attaches this list row-by-row to the window.
UniProtKB_accession_list

['Q55087',
 'P74251',
 'P74252',
 'P74253',
 'P74257',
 'P74258',
 'P74269',
 'P74273',
 'P74274',
 'P74275',
 'P74278']

In [10]:
# Look up a UniRef cluster name for each accession; the two keyword
# arguments are described in the notes right below this cell.
cluster_name_list = UniRef_pipeline(
    UniProtKB_accession_list,
    identity=0.5,
    update=False,
)

- identity : Sequence identity threshold of the UniRef clusters to search (one of [0.5, 0.9, 1.0], i.e. UniRef50, UniRef90 or UniRef100; default = 0.5).

- update : Whether to repeat the search for items that have already been searched before (bool, default = False).

In [11]:
# Show the cluster names returned for the accessions.
cluster_name_list

['Geranylgeranyl diphosphate reductase',
 'Signal peptide protein',
 'Sodium-coupled permease',
 'CHRD domain-containing protein',
 'Glycerol-3-phosphate dehydrogenase',
 'Glucosylglycerol-phosphate synthase',
 '(R)-citramalate synthase',
 'ParA family protein',
 'Doublecortin domain-containing protein',
 'L-glutamate gamma-semialdehyde dehydrogenase',
 'Putative transposase for insertion sequence element IS4SA']

In [12]:
# Annotate the window around the target gene with the mapping results.
# .assign returns a new frame, so the slice of df_Ensembl_filtered is not modified;
# both lists must have exactly one entry per row of the window.
df_Ensembl_filtered_added_UniRef50 = (
    df_Ensembl_filtered
    .loc[max(idx - 5, 0) : min(idx + 5, len(df_Ensembl_filtered))]
    .assign(**{
        'UniProtKB accession': UniProtKB_accession_list,
        'UniRef50 cluster': cluster_name_list,
    })
)

In [13]:
# Show the window with the added accession and cluster-name columns.
df_Ensembl_filtered_added_UniRef50

Unnamed: 0,start,end,strand,gene_id,description,UniProtKB accession,UniRef50 cluster
856,1934307,1935386,1,BAA18339,43 kD bacteriochlorophyll synthase subunit,Q55087,Geranylgeranyl diphosphate reductase
857,1939865,1940506,1,BAA18345,,P74251,Signal peptide protein
858,1940549,1942087,1,BAA18346,sodium-coupled permease,P74252,Sodium-coupled permease
859,1942234,1942701,1,BAA18347,,P74253,CHRD domain-containing protein
860,1945518,1947179,1,BAA18351,glycerol-3-phosphate dehydrogenase,P74257,Glycerol-3-phosphate dehydrogenase
861,1947325,1948824,1,BAA18352,"alpha,alpha-trehalose-phosphate synthase",P74258,Glucosylglycerol-phosphate synthase
862,1959436,1961079,1,BAA18363,alpha-isopropylmalate synthase,P74269,(R)-citramalate synthase
863,1962705,1965524,1,BAA18367,,P74273,ParA family protein
864,1965485,1965949,1,BAA18368,,P74274,Doublecortin domain-containing protein
865,1966146,1969118,1,BAA18369,delta-1-pyrroline-5-carboxylate dehydrogenase,P74275,L-glutamate gamma-semialdehyde dehydrogenase


## NCBI

In [None]:
"""
query = 'P12345'
tag_GeneID, in_NCBI, tag_Ensembl, in_Ensembl = check_external_links(query)
if (in_NCBI):   # True
    df_output_NCBI = NCBI_pipeline(tag_GeneID, scope=5, update=False)
if (in_Ensembl):    # True
    df_output_Ensembl = Ensembl_pipeline(tag_Ensembl, scope=5, update=False)
"""

In [15]:
# GeneID of interest and the number of same-strand neighbours to show on each side.
ncbi_target_id = '100348732'
ncbi_scope = 5

# Read GeneID as text: it is compared with a string below, and leaving the dtype
# to pandas' inference would turn an all-numeric column into integers, in which
# case no row would match and .item() would raise.
df_NCBI = pd.read_table('outputs/NCBI/gene_table/gt_NC_067378_1.tsv', header=0, dtype={'GeneID': str})

# Only protein-coding genes are considered, both for the lookup and as neighbours.
df_NCBI_coding = df_NCBI[df_NCBI['protein_coding'] == 1]

# Strand of the target gene. .item() raises ValueError unless exactly one row matches.
strand = df_NCBI_coding.loc[df_NCBI_coding['GeneID'] == ncbi_target_id, 'strand'].item()

# Keep only genes on that strand and renumber the rows 0..n-1, so that the
# row label of the target is also its position among same-strand genes.
df_NCBI_filtered = df_NCBI_coding[df_NCBI_coding['strand'] == strand].reset_index(drop=True)
idx = df_NCBI_filtered[df_NCBI_filtered['GeneID'] == ncbi_target_id].index.item()

# .loc slices by label and includes BOTH endpoints, so this shows up to
# 2 * ncbi_scope + 1 rows. The upper bound is clamped to the last valid
# label (len - 1); the result is the same as clamping to len.
# Expected to match df_output_NCBI from the disabled NCBI_pipeline call above.
df_NCBI_filtered.loc[max(idx - ncbi_scope, 0) : min(idx + ncbi_scope, len(df_NCBI_filtered) - 1)]

Unnamed: 0,start,end,strand,gene_name,GeneID,description,protein_coding
76,14115261,14101814,-1,LOC100337719,100337719,cilia and flagella associated protein 20,1
77,14163573,14124537,-1,LOC100337973,100337973,casein kinase 2 alpha 2,1
78,14254741,14242352,-1,LOC100338480,100338480,serine protease 54,1
79,14502663,14413471,-1,LOC100348223,100348223,CCR4-NOT transcription complex subunit 1,1
80,14549184,14530282,-1,LOC100348474,100348474,solute carrier family 38 member 7,1
81,14577537,14555621,-1,LOC100348732,100348732,glutamic-oxaloacetic transaminase 2,1
82,17712444,17294490,-1,LOC100348982,100348982,cadherin 8,1
83,18431011,18430070,-1,NPM1,100328693,"nucleophosmin (nucleolar phosphoprotein B23, n...",1
84,19341992,19327295,-1,LOC127491314,127491314,uncharacterized LOC127491314,1
85,20672605,20508636,-1,LOC100349487,100349487,cadherin 11,1


In [16]:
# GeneIDs of the target and its same-strand neighbours: the same
# label-inclusive window as displayed above, selecting rows and column in one .loc call.
query_list = (
    df_NCBI_filtered
    .loc[max(idx - 5, 0) : min(idx + 5, len(df_NCBI_filtered)), 'GeneID']
    .tolist()
)

In [17]:
# Show the collected GeneIDs (the target gene plus its same-strand neighbours).
query_list

['100337719',
 '100337973',
 '100338480',
 '100348223',
 '100348474',
 '100348732',
 '100348982',
 '100328693',
 '127491314',
 '100349487',
 '100341517']

In [18]:
# Map every GeneID in query_list to a UniProtKB accession.
# NOTE(review): the saved output reports fewer fetched entries than queried IDs
# and the resulting list contains None, so IDs without a mapping appear to come
# back as None — confirm against the helper's definition.
UniProtKB_accession_list = get_UniProtKB_accession(query_list, 'GeneID')

Fetched: 7 / 7


In [19]:
# Show the mapped accessions; None marks a GeneID for which no accession was obtained.
UniProtKB_accession_list

['G1T6K5',
 None,
 None,
 'G1SRZ8',
 'G1SKK6',
 'P12345',
 'A0A5F9DGY6',
 'B7NZF9',
 None,
 'G1TCU4',
 None]

In [20]:
# Look up a UniRef cluster name for each accession; the two keyword
# arguments are described in the notes right below this cell.
cluster_name_list = UniRef_pipeline(
    UniProtKB_accession_list,
    identity=0.5,
    update=False,
)

- identity : Sequence identity threshold of the UniRef clusters to search (one of [0.5, 0.9, 1.0], i.e. UniRef50, UniRef90 or UniRef100; default = 0.5).

- update : Whether to repeat the search for items that have already been searched before (bool, default = False).

In [21]:
# Show the cluster names returned for the accessions; positions whose
# accession is None also show None here.
cluster_name_list

['Cilia- and flagella-associated protein 20',
 None,
 None,
 'CCR4-NOT transcription complex subunit 1',
 'Sodium-coupled neutral amino acid transporter 7',
 'Aspartate aminotransferase, mitochondrial',
 'Cadherin 8',
 'Nucleophosmin',
 None,
 'Cadherin-11',
 None]

In [22]:
# Annotate the window around the target gene with the mapping results.
# .assign returns a new frame, so the slice of df_NCBI_filtered is not modified;
# both lists must have exactly one entry per row of the window.
df_NCBI_filtered_added_UniRef50 = (
    df_NCBI_filtered
    .loc[max(idx - 5, 0) : min(idx + 5, len(df_NCBI_filtered))]
    .assign(**{
        'UniProtKB accession': UniProtKB_accession_list,
        'UniRef50 cluster': cluster_name_list,
    })
)

In [23]:
# Show the window with the added accession and cluster-name columns.
df_NCBI_filtered_added_UniRef50

Unnamed: 0,start,end,strand,gene_name,GeneID,description,protein_coding,UniProtKB accession,UniRef50 cluster
76,14115261,14101814,-1,LOC100337719,100337719,cilia and flagella associated protein 20,1,G1T6K5,Cilia- and flagella-associated protein 20
77,14163573,14124537,-1,LOC100337973,100337973,casein kinase 2 alpha 2,1,,
78,14254741,14242352,-1,LOC100338480,100338480,serine protease 54,1,,
79,14502663,14413471,-1,LOC100348223,100348223,CCR4-NOT transcription complex subunit 1,1,G1SRZ8,CCR4-NOT transcription complex subunit 1
80,14549184,14530282,-1,LOC100348474,100348474,solute carrier family 38 member 7,1,G1SKK6,Sodium-coupled neutral amino acid transporter 7
81,14577537,14555621,-1,LOC100348732,100348732,glutamic-oxaloacetic transaminase 2,1,P12345,"Aspartate aminotransferase, mitochondrial"
82,17712444,17294490,-1,LOC100348982,100348982,cadherin 8,1,A0A5F9DGY6,Cadherin 8
83,18431011,18430070,-1,NPM1,100328693,"nucleophosmin (nucleolar phosphoprotein B23, n...",1,B7NZF9,Nucleophosmin
84,19341992,19327295,-1,LOC127491314,127491314,uncharacterized LOC127491314,1,,
85,20672605,20508636,-1,LOC100349487,100349487,cadherin 11,1,G1TCU4,Cadherin-11
