# Import libraries and data

In [1]:
# import libraries 
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

In [2]:
data_path = os.getcwd() + "/Datasets/Brain/"

## Brain elevated proteome

In [3]:
brain_elevated = pd.read_csv(data_path + "HPA_brain_elevated_version21.tsv", sep="\t")
print("Number of brain elevated proteins according to the HPA: %i" % len(brain_elevated))
brain_elevated.dropna(subset=["Uniprot"], inplace=True)
brain_elevated.drop_duplicates(subset=["Uniprot"], inplace=True)
print("Number of brain elevated proteins with a Uniprot ID: %i" % len(brain_elevated))

Number of brain elevated proteins according to the HPA: 2709
Number of brain elevated proteins with a Uniprot ID: 2546


## Brain detected proteome

In [4]:
brain_detected = pd.read_csv(data_path + "HPA_brain_detected_version21.tsv", sep="\t", low_memory=False)
print("Number of brain detected proteins according to the HPA: %i" % len(brain_detected))
brain_detected.dropna(subset=["Uniprot"], inplace=True)
brain_detected.drop_duplicates(subset=["Uniprot"], inplace=True)
print("Number of brain detected proteins with a Uniprot ID: %i" % len(brain_detected))

Number of brain detected proteins according to the HPA: 16507
Number of brain detected proteins with a Uniprot ID: 16021


## Brain exclusive proteome

In [5]:
brain_excusive = pd.read_csv(data_path + "HPA_brain_exclusive_version21.tsv", sep="\t")
print("Number of only brain detected proteins according to the HPA: %i" % len(brain_excusive))
brain_excusive.dropna(subset=["Uniprot"], inplace=True)
brain_excusive.drop_duplicates(subset=["Uniprot"], inplace=True)
print("Number of only brain detected proteins with a Uniprot ID: %i" % len(brain_excusive))

Number of only brain detected proteins according to the HPA: 204
Number of only brain detected proteins with a Uniprot ID: 169


# Data curation

In [6]:
def keep_first_uniprot(string):
    if "," in string:
        uniprots = string.split(",")
        uniprot1 = uniprots[0]
    else:
        uniprot1 = string
    
    return uniprot1

def get_brain_expression(string):
    # check if expression for multiple tissues is provided
    if ";" in string:
        tissues = string.split(";")
        for t in tissues:
            # keep only information on brain expression
            if "brain" in t:
                brain_string = t 
    else:
        brain_string = string
    
    # extract expression value from string
    _, exp = brain_string.split(" ")
    exp = float(exp)

    return exp

In [7]:
# if several Uniprots are associated, keep only first
brain_elevated["Uniprot"] = brain_elevated["Uniprot"].apply(keep_first_uniprot)
brain_detected["Uniprot"] = brain_detected["Uniprot"].apply(keep_first_uniprot)
brain_excusive["Uniprot"] = brain_excusive["Uniprot"].apply(keep_first_uniprot)

In [8]:
# add column with the mRNA brain expression
brain_elevated["Brain expression"] = brain_elevated["RNA tissue specific nTPM"].apply(get_brain_expression)

In [9]:
brain_regions = ["cerebellum",
"hypothalamus",
"white matter",
"hippocampal formation",
"basal ganglia",
"midbrain",
"spinal cord",
"medulla oblongata",
"amygdala",
"cerebral cortex",
"hippocampal formation",
"medulla oblongata"]

for i in brain_regions:
    print(i, brain_elevated["RNA brain regional specific nTPM"].str.contains(i).sum())

cerebellum 117
hypothalamus 55
white matter 24
hippocampal formation 18
basal ganglia 30
midbrain 34
spinal cord 5
medulla oblongata 21
amygdala 7
cerebral cortex 72
hippocampal formation 18
medulla oblongata 21


## Save curates data sets

In [10]:
# save dataframe
brain_elevated.to_csv(data_path + "Brain_elevated.csv", index=False)

# save Uniprot list
with open(data_path + "Brain_elevated_Uniprot.txt", "w") as f:
    for item in brain_elevated["Uniprot"]:
        f.write("%s\n" % item)

In [11]:
# save dataframe
brain_detected.to_csv(data_path + "Brain_detected.csv", index=False)

# save Uniprot list
with open(data_path + "Brain_detected_Uniprot.txt", "w") as f:
    for item in brain_detected["Uniprot"]:
        f.write("%s\n" % item)

In [12]:
# save dataframe
brain_excusive.to_csv(data_path + "Brain_exclusive.csv", index=False)

# save Uniprot list
with open(data_path + "Brain_exclusive_Uniprot.txt", "w") as f:
    for item in brain_excusive["Uniprot"]:
        f.write("%s\n" % item)