In [1]:
import pandas as pd
import numpy as np

### Process Human Protein Atlas annotations and create annotation files

In [2]:
# Load the Human Protein Atlas export (tab-separated, zip-compressed) into a DataFrame.
proteinatlas_df = pd.read_csv(
    "input/proteinatlas.tsv.zip",
    sep="\t",
)

In [3]:
# Annotations of interest:
# 'Protein class', 'Biological process', 'Molecular function', 'Disease involvement', 'Subcellular main location', 'Subcellular additional location'

### Protein class

In [4]:
# Select the "Gene" column and its "Protein class" annotation, renaming the
# annotation column to snake_case. The last line displays (rows, columns).
protein_class_df = (
    proteinatlas_df.loc[:, ["Gene", "Protein class"]]
    .rename(columns={"Protein class": "protein_class"})
)
protein_class_df.shape

(20162, 2)

In [5]:
# Drop rows that repeat the same (Gene, protein_class) pair.
# The result is re-assigned rather than using inplace=True, which keeps the
# call chainable and avoids mutating a frame in place.
protein_class_df = protein_class_df.drop_duplicates(
    subset=["Gene", "protein_class"]
)
protein_class_df.shape

(20156, 2)

In [6]:
# Display the de-duplicated Gene / protein_class table.
protein_class_df

Unnamed: 0,Gene,protein_class
0,TSPAN6,"Predicted intracellular proteins, Predicted me..."
1,TNMD,Predicted membrane proteins
2,DPM1,"Disease related genes, Enzymes, Human disease ..."
3,SCYL3,"Enzymes, Predicted intracellular proteins"
4,C1orf112,Predicted intracellular proteins
...,...,...
20157,ENSG00000291313,Predicted intracellular proteins
20158,ENSG00000291314,Predicted intracellular proteins
20159,ENSG00000291315,Predicted intracellular proteins
20160,ENSG00000291316,Predicted intracellular proteins


In [7]:
# Split each ", "-separated "protein_class" string into individual classes, then
# rebuild one row per gene whose classes are unique, sorted and joined with "|".

protein_class_df = (
    protein_class_df.assign(
        protein_class=protein_class_df["protein_class"].str.split(", ")
    )
    .explode("protein_class")
    .drop_duplicates(subset=["Gene", "protein_class"])
    .groupby("Gene")["protein_class"]
    .apply(lambda classes: "|".join(np.unique(classes)))
    .reset_index()
)

protein_class_df.shape

(20151, 2)

In [8]:
# Write the per-gene protein-class table as a gzip-compressed, tab-separated
# file; the DataFrame index is not written.
protein_class_df.to_csv(
    "output/human_protein_atlas_class_annotations.tsv.gz",
    compression="gzip",
    index=False,
    sep="\t",
)

### Subcellular location

In [9]:
# Select "Gene" plus the main and additional subcellular-location columns and
# give both location columns snake_case names.
location_renames = {
    "Subcellular main location": "subcellular_location",
    "Subcellular additional location": "subcellular_additional_location",
}
protein_location_df = proteinatlas_df[["Gene", *location_renames]].rename(
    columns=location_renames
)

protein_location_df.shape

(20162, 3)

In [10]:
# Keep only the rows that have a main subcellular location, then renumber the
# index from 0.

protein_location_df = protein_location_df.loc[
    protein_location_df["subcellular_location"].notna()
].reset_index(drop=True)
protein_location_df.shape

(13146, 3)

In [11]:
# Split each ", "-separated "subcellular_location" string into individual
# locations, then rebuild one row per gene whose locations are unique, sorted
# and joined with "|".
# NOTE(review): only "Gene" and "subcellular_location" survive the groupby below;
# "subcellular_additional_location" is not carried into the result — confirm that
# leaving it out of the output table is intended.

protein_location_df = (
    protein_location_df.assign(
        subcellular_location=protein_location_df["subcellular_location"].str.split(
            ", "
        )
    )
    .explode("subcellular_location")
    .drop_duplicates(subset=["Gene", "subcellular_location"])
    .groupby("Gene")["subcellular_location"]
    .apply(lambda locations: "|".join(np.unique(locations)))
    .reset_index()
)
protein_location_df.shape

(13140, 2)

In [12]:
# Write the per-gene subcellular-location table as a gzip-compressed,
# tab-separated file; the DataFrame index is not written.
protein_location_df.to_csv(
    "output/human_protein_atlas_location_annotations.tsv.gz",
    compression="gzip",
    index=False,
    sep="\t",
)

### Disease annotation

In [13]:
# Select "Gene" and its "Disease involvement" annotation, renaming the
# annotation column to snake_case. The last line displays (rows, columns).
protein_disease_df = (
    proteinatlas_df.loc[:, ["Gene", "Disease involvement"]]
    .rename(columns={"Disease involvement": "disease_involvement"})
)
protein_disease_df.shape

(20162, 2)

In [14]:
# Keep only the rows that have a disease-involvement annotation, then renumber
# the index from 0.

protein_disease_df = protein_disease_df.loc[
    lambda frame: frame["disease_involvement"].notna()
].reset_index(drop=True)
protein_disease_df.shape

(5838, 2)

In [15]:
# Split each ", "-separated "disease_involvement" string into individual terms,
# then rebuild one row per gene whose terms are unique, sorted and joined with "|".

protein_disease_df = (
    protein_disease_df.assign(
        disease_involvement=protein_disease_df["disease_involvement"].str.split(", ")
    )
    .explode("disease_involvement")
    .drop_duplicates(subset=["Gene", "disease_involvement"])
    .groupby("Gene")["disease_involvement"]
    .apply(lambda diseases: "|".join(np.unique(diseases)))
    .reset_index()
)
protein_disease_df.shape

(5837, 2)

In [16]:
# Write the per-gene disease-involvement table as a gzip-compressed,
# tab-separated file; the DataFrame index is not written.
protein_disease_df.to_csv(
    "output/human_protein_atlas_disease_annotations.tsv.gz",
    compression="gzip",
    index=False,
    sep="\t",
)

### Molecular function and biological process annotations

In [17]:
# Select "Gene" plus the "Biological process" and "Molecular function" columns and
# give both annotation columns snake_case names.
go_renames = {
    "Biological process": "biological_process",
    "Molecular function": "molecular_function",
}
go_df = proteinatlas_df[["Gene", *go_renames]].rename(columns=go_renames)

In [18]:
# Keep only rows annotated in BOTH columns: a row is dropped when either
# "biological_process" or "molecular_function" is missing (how="any").
# NOTE(review): because of this filter, the per-column tables built afterwards
# cover the same set of genes, so the later outer merge cannot add any rows —
# confirm whether genes with only one of the two annotations should be kept.

go_df = go_df.dropna(
    how="any",
    subset=["biological_process", "molecular_function"],
).reset_index(drop=True)
go_df.shape

(7198, 3)

In [19]:
# For each annotation column, split the ", "-separated terms and collapse them
# into one sorted, de-duplicated, "|"-joined string per gene; then combine the
# two per-column tables on "Gene".


def _collapse_terms(frame, column):
    """Return one row per gene with `column` rewritten as a "|"-joined string.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding a "Gene" column and `column`.
    column : str
        Name of a column whose values are ", "-separated strings.

    Returns
    -------
    pandas.DataFrame
        Columns "Gene" and `column`; the terms of each gene are unique and
        sorted. Genes with no value in `column` are absent from the result.
    """
    # Missing values would reach "|".join as floats and raise TypeError, so
    # rows without a value in `column` are excluded before splitting.
    present = frame.loc[frame[column].notna(), ["Gene", column]]
    exploded = present.assign(
        **{column: present[column].str.split(", ")}
    ).explode(column)
    return (
        exploded.groupby("Gene")[column]
        # np.unique both de-duplicates and sorts the terms of one gene.
        .apply(lambda terms: "|".join(np.unique(terms)))
        .reset_index()
    )


go_bp_df = _collapse_terms(go_df, "biological_process")
go_mf_df = _collapse_terms(go_df, "molecular_function")

# The outer merge keeps a gene even if it appears in only one of the two tables.
go_df = go_bp_df.merge(go_mf_df, on="Gene", how="outer")
go_df.shape

(7197, 3)

In [20]:
# Write the merged biological-process / molecular-function table as a
# gzip-compressed, tab-separated file; the DataFrame index is not written.
go_df.to_csv(
    "output/human_protein_atlas_go_annotations.tsv.gz",
    compression="gzip",
    index=False,
    sep="\t",
)