# Ingest UniProt ID mappings

In [None]:
from nbproject import header

In [1]:
# Download locations of the per-organism "idmapping_selected" tables,
# keyed by the species label used in log messages and output filenames.
URLs = dict(
    human="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz",
    mouse="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/MOUSE_10090_idmapping_selected.tab.gz",
)

Column names are retrieved from: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README

Cross-referenced databases in UniProt are listed here: https://www.uniprot.org/database

In [3]:
# Names for the columns of the header-less mapping files, listed in file
# order. The list is assigned positionally to the parsed table, so both the
# order and the count (22) must match the file layout.
cols = [
    "UniProtKB-AC", "UniProtKB-ID",
    "Entrez.gene_id", "RefSeq", "GI", "PDB", "GO",
    "UniRef100", "UniRef90", "UniRef50", "UniParc",
    "PIR", "NCBI-taxon", "MIM", "UniGene",
    "PubMed", "EMBL", "EMBL-CDS",
    "Ensembl", "Ensembl_TRS", "Ensembl_PRO",
    "Additional PubMed",
]

In [4]:
import pandas as pd

In [10]:
# For every species: download its mapping table, label the columns, check
# that accessions are unique, and write the table to a Feather file.
# `df` is rebound on each pass, so after the loop it holds the last table.
for species in URLs:
    url = URLs[species]
    print(f"Downloading {species} dataset...")

    # pandas fetches and decompresses the gzipped, header-less TSV from the URL
    df = pd.read_csv(
        url,
        compression="gzip",
        sep="\t",
        header=None,
        low_memory=False,
    )
    # positional assignment: raises if the file's column count differs
    df.columns = cols

    # the accession becomes the index next, so it must not repeat
    assert df["UniProtKB-AC"].is_unique
    df = df.set_index("UniProtKB-AC")
    print(f"Shape: {df.shape}")

    # `reset_index()` is needed to store as feather; it returns a new frame,
    # so `df` itself stays indexed by accession for the cells that follow
    filename = f"UNIPROT-{species}_idmapping.feather"
    df.reset_index().to_feather(filename)
    print(f"Saved to: {filename}")

Let's take a brief look at the table.

The index here is `UniProtKB-AC`, which is often referred to as the `UniProt protein ID` or `UniProtKB accession number`; it is the identifier for a protein in the UniProt database. (See: [here](https://www.wikidata.org/wiki/Property:P352))



In [11]:
# Preview the first rows of `df`; after the ingest loop it holds the table
# for the last species processed.
df.head()

Unnamed: 0_level_0,UniProtKB-ID,Entrez.gene_id,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,...,NCBI-taxon,MIM,UniGene,PubMed,EMBL,EMBL-CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional PubMed
UniProtKB-AC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9CQV8,1433B_MOUSE,54401,NP_061223.2; XP_006499972.1,1009083986; 409974018; 74219534; 31543974; 306...,4GNT:A; 5F74:A; 5WFU:A; 5WFU:B; 5WFU:C; 5WFU:D...,GO:0005737; GO:0005829; GO:0042470; GO:0005634...,UniRef100_Q9CQV8,UniRef90_Q9CQV8,UniRef50_Q04917,UPI0000004168,...,10090,,,16141072; 9705322; 15883195; 16800626; 1860420...,AF058797; AK002632; AK004872; AK011389; AK0833...,AAC14343.1; BAB22246.1; BAB23631.1; BAB27587.1...,ENSMUSG00000018326,ENSMUST00000018470,ENSMUSP00000018470,11217851; 12008017; 12446771; 12466851; 129371...
P62259,1433E_MOUSE,22627,NP_033562.3,60391192; 226874906,,GO:0030424; GO:0090724; GO:0005737; GO:0005829...,UniRef100_P62258,UniRef90_P62258,UniRef50_P62258,UPI0000021A46,...,10090,,,7750640; 11471062; 15489334; 10409742; 1127828...,Z19599; D87663; AF483478; AF483479; BC058686,CAA79659.1; BAA13424.1; AAL90752.1; AAL90753.1...,ENSMUSG00000020849,ENSMUST00000067664,ENSMUSP00000070993,11784696; 14966136; 15364926; 17720156; 186876...
P68510,1433F_MOUSE,22629,NP_035868.1,74226056; 17976969; 1526541; 74139366; 7414068...,5YQG:A; 5YQG:B; 5YQG:C; 5YQG:D,GO:0005737; GO:0005829; GO:0014704; GO:0005886...,UniRef100_P68510,UniRef90_Q04917,UniRef50_Q04917,UPI000000109B,...,10090,,,9197417; 9738002; 12008017; 16141072; 15489334...,U57311; D87661; AF077002; AB063572; AK077596; ...,AAC53256.1; BAA13422.1; AAC36290.1; BAB79599.1...,ENSMUSG00000018965,ENSMUST00000019109,ENSMUSP00000019109,14593000; 21152247; 21291861; 22178943; 255998...
P61982,1433G_MOUSE,22628,NP_061359.2,74215924; 31543976; 48428722; 3065929,,GO:0005737; GO:0005829; GO:0043209; GO:0098793...,UniRef100_P61981,UniRef90_P61981,UniRef50_P61981,UPI000000106B,...,10090,,,16141072; 15489334; 18034455; 21183079; 204783...,AF058799; CT010208; AK088847; AK148618; AK1533...,AAC14345.1; CAJ18416.1; BAC40609.1; BAE28625.1...,ENSMUSG00000051391,ENSMUST00000055808; ENSMUST00000198270,ENSMUSP00000051223; ENSMUSP00000143631,12176032; 12694396; 12730952; 19574997; 206286...
O70456,1433S_MOUSE,55948,NP_061224.2,148698109; 18202125; 74138801; 134023662; 3388...,,GO:0005737; GO:0005829; GO:0005576; GO:0005634...,UniRef100_O70456,UniRef90_O70456,UniRef50_P31947,UPI00000253C3,...,10090,,,16141072; 19468303; 16710422; 21183079; 204783...,AF058798; AK146490; AK169358; AL627228; CH466552,AAC14344.1; BAE27209.1; BAE41107.1; -; EDL30056.1,ENSMUSG00000047281,ENSMUST00000057311,ENSMUSP00000050374,11864996; 12075357; 12077355; 14517281; 152860...


In [12]:
# Show the dtype pandas inferred for each column of `df` (the index is not listed).
df.dtypes

UniProtKB-ID          object
Entrez.gene_id        object
RefSeq                object
GI                    object
PDB                   object
GO                    object
UniRef100             object
UniRef90              object
UniRef50              object
UniParc               object
PIR                   object
NCBI-taxon             int64
MIM                  float64
UniGene              float64
PubMed                object
EMBL                  object
EMBL-CDS              object
Ensembl               object
Ensembl_TRS           object
Ensembl_PRO           object
Additional PubMed     object
dtype: object