# `Protein`: uniprot, 2023-02

In [None]:
import pandas as pd

In [None]:
def _get_shortest_name(
    df: pd.DataFrame, column: str, new_column: str = "name"
) -> None:
    """Split a column of comma-separated names into a name and its synonyms.

    Each value of ``df[column]`` must be a string of names separated by
    ``", "``. For every row, the shortest name that contains no space is
    picked as the name; if every name contains a space, the shortest name
    overall is picked. The remaining names are joined with ``"|"``.

    Duplicate names are dropped and names keep their order of first
    appearance, so ties on length and the order of the joined synonyms are
    the same on every run.

    Args:
        df: Table to modify in place.
        column: Column holding the comma-separated names; it is overwritten
            with the ``"|"``-joined synonyms.
        new_column: Column that receives the picked name.

    Returns:
        None. ``df`` is modified in place.
    """
    name_list = []
    synonyms_list = []
    for value in df[column]:
        # dict.fromkeys de-duplicates like a set but keeps insertion order,
        # which makes the result independent of string hashing.
        candidates = list(dict.fromkeys(value.split(", ")))
        no_space_names = [c for c in candidates if " " not in c]
        # fall back to all candidates when every name contains a space
        name = min(no_space_names or candidates, key=len)
        name_list.append(name)
        synonyms_list.append("|".join(c for c in candidates if c != name))

    df[new_column] = name_list
    df[column] = synonyms_list

Files are downloaded from: https://www.uniprot.org/uniprotkb

In [None]:
# Gzipped TSV exports hosted on the bionty-assets bucket, keyed by species.
# NOTE(review): the previous comment here read "Downloaded from 2022-09-26",
# while the file names carry 2023-02 -- confirm the actual download date.

filepaths = {
    species: f"https://bionty-assets.s3.amazonaws.com/uniprot-{species}-2023-02.tsv.gz"
    for species in ("human", "mouse")
}

## Human and mouse

## Curate the tables

In [None]:
# Curate one table per species and write each result to a parquet file.
for species, filepath in filepaths.items():
    print(f"Loading {species} data...")

    # the source files are tab-separated
    df = pd.read_csv(filepath, sep="\t")

    print(f"shape: {df.shape}")
    display(df.head())

    # switch from the source column headers to the output field names
    df = df.rename(
        columns={
            "Entry": "uniprotkb_id",
            "Protein names": "synonyms",
            "Length": "length",
            "Gene Names (primary)": "gene_symbol",
            "GeneID": "ncbi_gene_ids",
        }
    )

    # missing gene ids become "", the trailing ";" is dropped and the
    # remaining ";" separators are turned into "|"
    df["ncbi_gene_ids"] = (
        df["ncbi_gene_ids"].fillna("").str.rstrip(";").str.replace(";", "|")
    )

    # _get_shortest_name adds the "name" column and rewrites "synonyms"
    # in place as the "|"-joined remaining names
    df["synonyms"] = df["synonyms"].fillna("")
    _get_shortest_name(df, "synonyms")

    # keep the output columns and only the rows that have a uniprotkb id,
    # then order the rows by that id with a fresh index
    df = df.loc[
        df["uniprotkb_id"].notnull(),
        ["uniprotkb_id", "name", "length", "synonyms", "gene_symbol", "ncbi_gene_ids"],
    ]
    df = df.sort_values("uniprotkb_id").reset_index(drop=True)

    print(f"shape: {df.shape}, unique: {df.uniprotkb_id.is_unique}")
    display(df.head())

    filename = f"df_{species}__uniprot__2023-02__Protein.parquet"
    df.to_parquet(filename)

    print(f"Wrote {filename}.")
    print("------------------------------------------------")