In [1]:
import pandas as pd

### Process Human Protein Atlas annotations and create annotation files

In [2]:
# Load the Human Protein Atlas export: a tab-separated table inside a zip
# archive (pandas infers the compression from the ".zip" suffix).
proteinatlas_df = pd.read_csv(
    filepath_or_buffer="input/proteinatlas.tsv.zip",
    sep="\t",
)

In [3]:
# Annotations of interest:
# 'Protein class', 'Biological process', 'Molecular function', 'Disease involvement', 'Subcellular main location', 'Subcellular additional location'

### Protein class

In [4]:
# Keep the gene identifier together with its protein-class annotation and
# give the annotation column a snake_case name for the output file.
protein_class_df = (
    proteinatlas_df
    .loc[:, ["Gene", "Protein class"]]
    .rename(columns={"Protein class": "protein_class"})
)
protein_class_df.shape

(20162, 2)

In [5]:
# Drop repeated (Gene, protein_class) pairs, keeping the first occurrence.
# The result is rebound to the name instead of using inplace=True, so the
# frame produced by the previous cell is not mutated behind the reader's back.
protein_class_df = protein_class_df.drop_duplicates(
    subset=["Gene", "protein_class"]
)
protein_class_df.shape

(20156, 2)

In [6]:
# Turn the ", " separator between multiple protein classes into "|".
# regex=False makes this a literal substring replacement.
protein_class_df = protein_class_df.assign(
    protein_class=protein_class_df["protein_class"].str.replace(
        ", ", "|", regex=False
    )
)
protein_class_df.shape

(20156, 2)

In [7]:
# Write the protein-class annotations as a gzip-compressed TSV, leaving out
# the DataFrame index.
protein_class_df.to_csv(
    path_or_buf="output/human_protein_atlas_class_annotations.tsv.gz",
    sep="\t",
    compression="gzip",
    index=False,
)

### Subcellular location

In [8]:
# Source column name -> output column name for the two location annotations.
location_columns = {
    "Subcellular main location": "subcellular_location",
    "Subcellular additional location": "subcellular_additional_location",
}

# Select the gene identifier plus both location columns, then rename them.
protein_location_df = proteinatlas_df[["Gene", *location_columns]].rename(
    columns=location_columns
)

protein_location_df.shape

(20162, 3)

In [9]:
# Discard genes without a main subcellular location and renumber the rows.
# The additional-location column is allowed to stay empty.
protein_location_df = (
    protein_location_df
    .dropna(axis="index", how="any", subset=["subcellular_location"])
    .reset_index(drop=True)
)
protein_location_df.shape

(13146, 3)

In [10]:
# Replace the ", " separator with "|" in BOTH location columns so the output
# file uses a single delimiter for multi-valued fields. Previously only
# subcellular_location was converted and subcellular_additional_location was
# written out still comma-separated. Missing additional locations stay missing.

protein_location_df = protein_location_df.assign(
    subcellular_location=lambda x: x["subcellular_location"]
    .str.split(", ")
    .str.join("|"),
    subcellular_additional_location=lambda x: x["subcellular_additional_location"]
    .str.split(", ")
    .str.join("|"),
)
protein_location_df.shape

(13146, 3)

In [11]:
# Write the subcellular-location annotations as a gzip-compressed TSV,
# leaving out the DataFrame index.
protein_location_df.to_csv(
    path_or_buf="output/human_protein_atlas_location_annotations.tsv.gz",
    sep="\t",
    compression="gzip",
    index=False,
)

### Disease annotation

In [12]:
# Keep the gene identifier together with its disease-involvement annotation
# and give the annotation column a snake_case name for the output file.
protein_disease_df = (
    proteinatlas_df
    .loc[:, ["Gene", "Disease involvement"]]
    .rename(columns={"Disease involvement": "disease_involvement"})
)
protein_disease_df.shape

(20162, 2)

In [13]:
# Discard genes without any disease-involvement annotation and renumber
# the remaining rows.
protein_disease_df = (
    protein_disease_df
    .dropna(axis="index", how="any", subset=["disease_involvement"])
    .reset_index(drop=True)
)
protein_disease_df.shape

(5838, 2)

In [14]:
# Turn the ", " separator between multiple disease annotations into "|".
# regex=False makes this a literal substring replacement.
protein_disease_df = protein_disease_df.assign(
    disease_involvement=protein_disease_df["disease_involvement"].str.replace(
        ", ", "|", regex=False
    )
)
protein_disease_df.shape

(5838, 2)

In [15]:
# Write the disease-involvement annotations as a gzip-compressed TSV,
# leaving out the DataFrame index.
protein_disease_df.to_csv(
    path_or_buf="output/human_protein_atlas_disease_annotations.tsv.gz",
    sep="\t",
    compression="gzip",
    index=False,
)

### Molecular function and biological process annotations

In [16]:
# Source column name -> output column name for the two functional annotations.
go_columns = {
    "Biological process": "biological_process",
    "Molecular function": "molecular_function",
}

# Select the gene identifier plus both annotation columns, then rename them.
go_df = proteinatlas_df[["Gene", *go_columns]].rename(columns=go_columns)

In [17]:
# Keep only genes that carry BOTH a biological process and a molecular
# function: how="any" (the pandas default, spelled out here) drops a row as
# soon as either column is missing. Rows are renumbered afterwards.
# NOTE(review): genes annotated with only one of the two are excluded from
# the output — confirm this is intended.
go_df = (
    go_df
    .dropna(
        axis="index",
        how="any",
        subset=["biological_process", "molecular_function"],
    )
    .reset_index(drop=True)
)
go_df.shape

(7198, 3)

In [18]:
# Turn the ", " separator into "|" in both annotation columns.
# regex=False makes this a literal substring replacement.
go_df = go_df.assign(
    **{
        column: go_df[column].str.replace(", ", "|", regex=False)
        for column in ("biological_process", "molecular_function")
    }
)

go_df.shape

(7198, 3)

In [19]:
# Write the biological-process / molecular-function annotations as a
# gzip-compressed TSV, leaving out the DataFrame index.
go_df.to_csv(
    path_or_buf="output/human_protein_atlas_go_annotations.tsv.gz",
    sep="\t",
    compression="gzip",
    index=False,
)