# `Protein`: uniprot, 2023-02

In [1]:
import pandas as pd

In [2]:
def _get_shortest_name(df: pd.DataFrame, column: str, new_column="name"):
    """Get a single shortest name from a column of lists.

    Everyone else became synonyms.
    """
    synonyms_list = []
    name_list = []
    for lst in df[column]:

        def shortest_name(lst: list):
            return min(lst, key=len)

        synonyms = set(lst.split(", "))
        no_space_names = [i for i in synonyms if " " not in i]
        if len(no_space_names) == 0:
            name = shortest_name(synonyms)
        else:
            name = shortest_name(no_space_names)
        name_list.append(name)
        synonyms.remove(name)
        synonyms_list.append("|".join([i for i in synonyms]))

    df[new_column] = name_list
    df[column] = synonyms_list

Files are downloaded from: https://www.uniprot.org/uniprotkb

In [3]:
# Downloaded from 2022-09-26
# NOTE(review): the notebook title says 2023-02 — confirm which snapshot these files are.

# One gzipped TSV export per species, keyed by species name.
filepaths = {
    species: f"https://bionty-assets.s3.amazonaws.com/uniprot-{species}.tsv.gz"
    for species in ("human", "mouse")
}

## Human and mouse

## Curate the tables

In [4]:
# Columns of the raw export that are not carried into the lookup table.
DROPPED_COLUMNS = ["Entry Name", "Organism (ID)", "Gene Names (synonym)", "Ensembl"]

# Raw export header -> lookup-table column name.
COLUMN_RENAMES = {
    "Entry": "uniprotkb_id",
    "Protein names": "synonyms",
    "Length": "length",
    "Gene Names (primary)": "gene_symbol",
    "GeneID": "ncbi_gene_ids",
}

# Final column order of the written table ("name" is added by _get_shortest_name).
LOOKUP_COLUMNS = [
    "uniprotkb_id",
    "name",
    "length",
    "synonyms",
    "gene_symbol",
    "ncbi_gene_ids",
]

for species, filepath in filepaths.items():
    print(f"Loading {species} data...")

    # Load the raw export and show what came in.
    df = pd.read_csv(filepath, sep="\t")
    print(f"shape: {df.shape}")
    display(df.head())

    # errors="ignore": a column missing from the export is skipped, not an error.
    df = df.drop(columns=DROPPED_COLUMNS, errors="ignore").rename(
        columns=COLUMN_RENAMES
    )

    # Gene ids arrive ";"-terminated (e.g. "90410;"); store them "|"-separated.
    df["ncbi_gene_ids"] = (
        df["ncbi_gene_ids"].fillna("").str.rstrip(";").str.replace(";", "|")
    )

    # Split the protein names into one short name plus "|"-joined synonyms.
    # The helper adds the "name" column and rewrites "synonyms" in place.
    df["synonyms"] = df["synonyms"].fillna("")
    _get_shortest_name(df, "synonyms")

    # Keep the lookup columns, drop rows without an id, and order by id.
    df = (
        df.loc[df["uniprotkb_id"].notnull(), LOOKUP_COLUMNS]
        .sort_values("uniprotkb_id")
        .reset_index(drop=True)
    )

    print(f"shape: {df.shape}, unique: {df['uniprotkb_id'].is_unique}")
    display(df.head())

    filename = f"{species}_uniprot_2023-02_Protein_lookup.parquet"
    df.to_parquet(filename)

    print(f"Wrote {filename}.")
    print("------------------------------------------------")

Loading human data...
shape: (204961, 9)


Unnamed: 0,Entry,Entry Name,Protein names,Length,Organism (ID),Gene Names (primary),Gene Names (synonym),Ensembl,GeneID
0,A0A024QZ08,A0A024QZ08_HUMAN,Intraflagellar transport 20 homolog (Chlamydom...,132,9606,IFT20,,,90410;
1,A0A024QZ86,A0A024QZ86_HUMAN,"T-box 2, isoform CRA_a",712,9606,TBX2,,,6909;
2,A0A024QZA8,A0A024QZA8_HUMAN,"Receptor protein-tyrosine kinase, EC 2.7.10.1",976,9606,EPHA2,,,1969;
3,A0A024QZB8,A0A024QZB8_HUMAN,Battenin,438,9606,CLN3,,,1201;
4,A0A024QZQ1,A0A024QZQ1_HUMAN,Sirtuin (Silent mating type information regula...,747,9606,SIRT1,,,23411;


shape: (204961, 6), unique: True


Unnamed: 0,uniprotkb_id,name,length,synonyms,gene_symbol,ncbi_gene_ids
0,A0A023HHK9,EC 1.14.11.n2,1305,Methylcytosine dioxygenase TET,,
1,A0A023HHL0,EC 1.14.11.n2,694,Methylcytosine dioxygenase TET,,
2,A0A023HJ61,HRES-1/RAB4 variant,121,,RAB4A,
3,A0A023HN28,SRSF3/USP6 fusion protein,16,,,
4,A0A023I7F4,Cytochrome b,380,,CYTB,


Wrote human_uniprot_2023-02_Protein_lookup.parquet.
------------------------------------------------
Loading mouse data...
shape: (86436, 9)


Unnamed: 0,Entry,Entry Name,Protein names,Length,Organism (ID),Gene Names (primary),Gene Names (synonym),Ensembl,GeneID
0,A0A075F5C6,A0A075F5C6_MOUSE,Heat shock factor protein 1 (Heat shock transc...,531,10090,Hsf1,,ENSMUST00000228371.2;,15499;
1,A0A087WPF7,AUTS2_MOUSE,Autism susceptibility gene 2 protein homolog,1261,10090,Auts2,Kiaa0442,ENSMUST00000161226 [A0A087WPF7-1];ENSMUST00000...,
2,A0A087WPT2,A0A087WPT2_MOUSE,Prostaglandin G/H synthase 2,62,10090,Ptgs2,,ENSMUST00000190784.2;,
3,A0A087WPU4,A0A087WPU4_MOUSE,FAT atypical cadherin 1,159,10090,Fat1,,ENSMUST00000186342.3;,
4,A0A087WRK1,A0A087WRK1_MOUSE,"Predicted gene, 20814 (Predicted gene, 20850) ...",222,10090,Gm20850,Gm20814 Gm20835 Gm20855 Gm20869 Gm20870 Gm2088...,ENSMUST00000185240.2;ENSMUST00000185245.2;ENSM...,100042201;100042279;100042594;100861691;108167...


shape: (86436, 6), unique: True


Unnamed: 0,uniprotkb_id,name,length,synonyms,gene_symbol,ncbi_gene_ids
0,A0A023JDV8,Creatine transporter SLC6A8 variant D,224,,Slc6a8,
1,A0A023NCR8,Cytochrome b (Complex III subunit 3) (Complex ...,233,,cytB,
2,A0A023NCS0,Cytochrome b (Complex III subunit 3) (Complex ...,222,,cytB,
3,A0A023ND59,Cytochrome b (Complex III subunit 3) (Complex ...,227,,cytB,
4,A0A023NDP0,Cytochrome b (Complex III subunit 3) (Complex ...,242,,cytB,


Wrote mouse_uniprot_2023-02_Protein_lookup.parquet.
------------------------------------------------
