# Import libraries and data

In [1]:
import json
import numpy as np
import os
import pandas as pd

from Bio import SeqIO
from utils import get_uniprot, get_value, keep_first_uniprot, protein_analysis

In [2]:
data_path = os.getcwd() + "/Datasets/"

## Curated CSF data set

In [3]:
csf = pd.read_csv(data_path + "CSF/csf.csv")

## Uniprot sequences of the human proteome

In [4]:
# Load the human proteome as (Uniprot, Sequence) pairs; the header row of the
# file is replaced by the two column names given here.
proteome_path = data_path + "Uniprot/Human_proteome_Uniprot_seq.tab"
df = pd.read_csv(proteome_path, sep="\t", header=0, names=["Uniprot", "Sequence"])

# entries without a sequence are obsolete records (none dropped)
df = df.dropna(subset=["Sequence"])

# entries whose sequence contains B, U or X are removed (25 entries dropped)
has_nonstandard_residue = df["Sequence"].str.contains("B|U|X")
df = df[~has_nonstandard_residue]

## Brain

In [5]:
# Protein lists used further down to subset the feature table through their
# "Uniprot" column; the prints report how many proteins each list contains.
brain_detected = pd.read_csv(data_path + "Brain/Brain_detected.csv")
print("Number of brain detected proteins:", len(brain_detected))
brain_elevated = pd.read_csv(data_path + "Brain/Brain_elevated.csv")
print("Number of brain elevated proteins:", len(brain_elevated))

Number of brain detected proteins: 16021
Number of brain elevated proteins: 2546


## ProteomicsDB evidence

In [6]:
# JSON mapping of protein ID -> evidence value. Later cells match the keys
# against df["Uniprot"] and keep the entries whose value is > 0.
with open(data_path + "ProteomicsDB/ProteomicsDB_evidence_positive.txt") as f:
    ProteomicsDB_evidence_positive = json.load(f)

# Feature generation

## Sequence length

In [7]:
df["Length"] = df["Sequence"].apply(len)

## Amino acid composition & attributes

In [8]:
# df = df.apply(protein_analysis, seq_col="Sequence", axis=1)

In [9]:
#### TO DO ####
# drop unused columns

# save or load dataframe
# df.to_csv(data_path + "Features/df_features_PA_human_proteome.csv", index=False)
# NOTE: loading the cached table replaces the `df` built in the cells above, so
# the sequence filtering and the "Length" column computed there only carry over
# if they were already part of the saved file.
df = pd.read_csv(data_path + "Features/df_features_PA_human_proteome.csv") 
# show the first five rows as a sanity check
df[:5]

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,Polarity_large,Polarizability_low,Polarizability_medium,Polarizability_large,Charge_positive,Charge_neutral,Charge_negative,Buried,Exposed,Intermediate
0,Q9NWW9,MALARPRPRLGDLIEISRFGYAHWAIYVGDGYVVHLAPASEIAGAG...,162,17393.648,0.12963,0.006173,0.04321,0.049383,0.012346,0.08642,...,0.302469,0.37037,0.41358,0.216049,0.117284,0.790123,0.092593,0.5,0.234568,0.228395
1,Q99569,MPAPEQASLVEEGQPQTRQEAASTGPGMEPETTATTILASVKEQEL...,1192,131866.7406,0.064597,0.011745,0.04698,0.050336,0.01594,0.061242,...,0.342282,0.361577,0.420302,0.218121,0.113255,0.78943,0.097315,0.344799,0.315436,0.334732
2,Q02325,MEHKEVVLLLLLFLKSGQGEPLDDYVNTQGPSLFSVTKKQLGAGSR...,96,10970.5316,0.0625,0.041667,0.041667,0.114583,0.052083,0.052083,...,0.40625,0.260417,0.46875,0.270833,0.145833,0.697917,0.15625,0.416667,0.354167,0.197917
3,Q6UXB8,MHGSCSFLMLLLPLLLLLVATTGPVGALTDEEKRLMVELHNLYRAQ...,463,49470.5083,0.097192,0.025918,0.034557,0.084233,0.021598,0.071274,...,0.285097,0.362851,0.453564,0.183585,0.075594,0.805616,0.11879,0.416847,0.315335,0.332613
4,P62195,MALDGPEQMELEEGKAGSGLRQYYLSKIEELQLIVNDKSQNLRRLQ...,406,45625.5008,0.061576,0.007389,0.054187,0.093596,0.022167,0.071429,...,0.394089,0.268473,0.480296,0.251232,0.147783,0.704433,0.147783,0.423645,0.349754,0.20197


## Solubility

In [10]:
# Per-residue weights averaged over a sequence to obtain the SWI value used by
# sol(). "U" is given the same weight as "C".
# NOTE(review): presumably the published solubility-weighted index weights - confirm source.
weights = {"A": 0.8356471476582918, "C": 0.5208088354857734, "U": 0.5208088354857734, "E": 0.9876987431418378,
           "D": 0.9079044671339564, "G": 0.7997168496420723, "F": 0.5849790194237692, "I": 0.6784124413866582,
           "H": 0.8947913996466419, "K": 0.9267104557513497, "L": 0.6554221515081433, "M": 0.6296623675420369,
           "N": 0.8597433107431216, "Q": 0.789434648348208, "P": 0.8235328714705341, "S": 0.7440908318492778,
           "R": 0.7712466317693457, "T": 0.8096922697856334, "W": 0.6374678690957594, "V": 0.7357837119163659,
           "Y": 0.6112801822947587}

# Slope (A) and intercept (B) of the logistic function that maps the SWI to a
# solubility score in (0, 1).
A = 81.0581
B = -62.7775

def sol(seq):
    """
    Compute a sequence-based solubility score.

    The per-residue weights of `seq` are averaged (SWI) and passed through the
    logistic function 1 / (1 + exp(-(A * SWI + B))).

    Parameters
    ----------
    seq : str
        Amino acid sequence in one-letter code; every character must be a key
        of `weights`.

    Returns
    -------
    numpy.float64
        Score between 0 and 1; NaN (with a NumPy RuntimeWarning) for an empty
        sequence.

    Raises
    ------
    KeyError
        If `seq` contains a character that has no entry in `weights`.
    """
    SWI = np.mean([weights[residue] for residue in seq])
    # use the named coefficients instead of repeating them as literals
    return 1 / (1 + np.exp(-(A * SWI + B)))

df["Solubility"] = df["Sequence"].apply(sol)

In [11]:
df.columns

Index(['Uniprot', 'Sequence', 'Length', 'Molecular weight', 'A', 'C', 'D', 'E',
       'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',
       'W', 'Y', 'Isoelectric point', 'Instability index', 'Polar', 'Neutral',
       'Hydrophobic', 'Volume_small', 'Volume_medium', 'Volume_large',
       'Polarity_low', 'Polarity_medium', 'Polarity_large',
       'Polarizability_low', 'Polarizability_medium', 'Polarizability_large',
       'Charge_positive', 'Charge_neutral', 'Charge_negative', 'Buried',
       'Exposed', 'Intermediate', 'Solubility'],
      dtype='object')

## Structural features (NetSurfP-2.0)

In [12]:
#### TO DO ####
# do global feature generation from original results dataframe and save df

# load the precomputed structural feature table
nsp_features = pd.read_csv(data_path + "Features/features_human_proteome_no_filtering.csv")
# keep only the protein ID and the four structural columns used as features
nsp_features = nsp_features[["id", "disorder", "helix", "turn", "sheet"]]
# rename so that "id" becomes the merge key "Uniprot" and the features carry an "_NSP" suffix
nsp_features.columns = ["Uniprot", "Disorder_NSP", "Helix_NSP", "Turn_NSP", "Sheet_NSP"]

In [13]:
# add structural features to feature dataframe
# NOTE: inner join - proteins without an entry in nsp_features are dropped from df
df = df.merge(nsp_features, on="Uniprot", how="inner")

## Signal peptide (SignalP-6.0)

In [14]:
# read the SignalP results: the first two lines of the file are skipped and the
# five columns are named explicitly
signalp = pd.read_csv(data_path + "Features/SignalP_results_human_proteome.txt", sep="\t", index_col=False, header=None, 
    skiprows=2, names=["Uniprot", "Prediction", "Likelihood-Other", "Likelihood-SP", "CS Position"])

# retrieve Uniprot ID
signalp["Uniprot"] = signalp["Uniprot"].apply(get_uniprot)
# keep only the proteins predicted to carry a signal peptide ("SP")
signalp_pos = signalp[signalp["Prediction"] == "SP"]
signalp_pos[:5]

Unnamed: 0,Uniprot,Prediction,Likelihood-Other,Likelihood-SP,CS Position
4,P22223,SP,0.000224,0.999762,CS pos: 24-25. Pr: 0.7575
5,Q9BXJ4,SP,0.000201,0.999764,CS pos: 22-23. Pr: 0.9800
6,P09871,SP,0.000226,0.999694,CS pos: 15-16. Pr: 0.9805
7,Q9ULX7,SP,0.00022,0.999737,CS pos: 18-19. Pr: 0.9241
26,Q16787,SP,0.002463,0.99751,CS pos: 36-37. Pr: 0.7144


In [15]:
df["Signal peptide"] = np.where(df["Uniprot"].isin(signalp_pos["Uniprot"]), 1, 0)
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,Charge_negative,Buried,Exposed,Intermediate,Solubility,Disorder_NSP,Helix_NSP,Turn_NSP,Sheet_NSP,Signal peptide
0,Q9NWW9,MALARPRPRLGDLIEISRFGYAHWAIYVGDGYVVHLAPASEIAGAG...,162,17393.6480,0.129630,0.006173,0.043210,0.049383,0.012346,0.086420,...,0.092593,0.500000,0.234568,0.228395,0.641053,0.067901,0.382716,0.425926,0.191358,0
1,Q99569,MPAPEQASLVEEGQPQTRQEAASTGPGMEPETTATTILASVKEQEL...,1192,131866.7406,0.064597,0.011745,0.046980,0.050336,0.015940,0.061242,...,0.097315,0.344799,0.315436,0.334732,0.619151,0.397651,0.328859,0.668624,0.002517,0
2,Q02325,MEHKEVVLLLLLFLKSGQGEPLDDYVNTQGPSLFSVTKKQLGAGSR...,96,10970.5316,0.062500,0.041667,0.041667,0.114583,0.052083,0.052083,...,0.156250,0.416667,0.354167,0.197917,0.641370,0.302083,0.125000,0.604167,0.270833,1
3,Q6UXB8,MHGSCSFLMLLLPLLLLLVATTGPVGALTDEEKRLMVELHNLYRAQ...,463,49470.5083,0.097192,0.025918,0.034557,0.084233,0.021598,0.071274,...,0.118790,0.416847,0.315335,0.332613,0.731990,0.373650,0.144708,0.762419,0.092873,1
4,P62195,MALDGPEQMELEEGKAGSGLRQYYLSKIEELQLIVNDKSQNLRRLQ...,406,45625.5008,0.061576,0.007389,0.054187,0.093596,0.022167,0.071429,...,0.147783,0.423645,0.349754,0.201970,0.769943,0.096059,0.458128,0.399015,0.142857,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20298,Q96T66,MKSRIPVVLLACGSFNPITNMHLRMFEVARDHLHQTGMYQVIQGII...,252,28321.3452,0.071429,0.011905,0.043651,0.047619,0.019841,0.067460,...,0.091270,0.412698,0.297619,0.297619,0.653012,0.158730,0.380952,0.500000,0.119048,0
20299,Q96NY8,MPLSLGAEMWGPEAWLLLLLLLASFTGRCPAGELETSDVVTVVLGQ...,510,55453.6413,0.064706,0.017647,0.043137,0.084314,0.021569,0.086275,...,0.127451,0.425490,0.290196,0.290196,0.651683,0.368627,0.017647,0.643137,0.339216,1
20300,P07197,MSYTLDSLGNPSAYRRVTETRSSFSRVSGSPSSGFRSQSWSRGSPS...,916,102470.7832,0.075328,0.001092,0.038210,0.201965,0.012009,0.055677,...,0.240175,0.310044,0.462882,0.227074,0.990665,0.556769,0.342795,0.640830,0.016376,0
20301,Q69YL0,MVLRRLLAALLHSPQLVERLSESRPIRRAAQLTAFALLQAQLRGQD...,99,10890.4720,0.141414,0.010101,0.030303,0.040404,0.030303,0.090909,...,0.070707,0.484848,0.232323,0.171717,0.429929,0.171717,0.707071,0.292929,0.000000,0


## Glycosylation prediction

### NetNglyc-1.0

In [16]:
#### TO DO ####
# move to utils

def netNglyc_filter(file, file_name):
    """
    Copy the result lines of a raw NetNGlyc output file into a new text file.

    Only lines starting with "sp|" are kept. They are written unchanged and in
    their original order to data_path + "Features/" + file_name + ".txt"; an
    existing file at that location is overwritten.

    Parameters
    ----------
    file : str
        Path of the raw NetNGlyc results file.
    file_name : str
        Name of the filtered output file, without the ".txt" extension.

    Returns
    -------
    None
    """
    # read the input completely before the output is opened, and let the
    # context manager close the handle (it was previously never closed)
    with open(file, "r") as results:
        relevant_lines = [line for line in results if line[:3] == "sp|"]

    # the context manager closes the output file even if writing fails
    with open(data_path + "Features/" + file_name + ".txt", "w+") as filtered_results:
        filtered_results.writelines(relevant_lines)

    return None
    
def split_netNglyc(df):
    """
    Split the combined first field of one NetNGlyc result row into columns.

    Intended for row-wise use via DataFrame.apply(..., axis=1).

    Parameters
    ----------
    df : pandas.Series
        A single row holding the combined text field under label 0 (exactly
        three whitespace-separated tokens: name, position, sequence) plus the
        labels "Potential", "Jury agreement" and "Result".

    Returns
    -------
    pandas.Series
        Row with the labels "Uniprot", "Position", "Sequence", "Potential",
        "Jury agreement" and "Result", in that order. "Position" is kept as
        the string token read from the text field.

    Raises
    ------
    ValueError
        If the text field does not split into exactly three tokens.
    """
    # retrieve information from first column
    name, pos, seq = df[0].split()

    # work on a copy: DataFrame.apply does not support functions that mutate
    # the row object they are handed
    row = df.copy()

    # retrieve Uniprot ID from description
    row["Uniprot"] = get_uniprot(name)
    row["Position"] = pos
    row["Sequence"] = seq

    # selecting the output labels reorders them and discards the raw field 0
    return row[["Uniprot", "Position", "Sequence", "Potential", "Jury agreement", "Result"]]

In [17]:
# # filter netNglyc results file
# netNglyc_filter(data_path + "Features/NetNglyc_results_human_proteome.out", "NetNglyc_results_human_proteome_filtered")

# # create clean dataframe of glycosylation prediction results
# netnglyc = pd.read_csv(data_path + "Features/NetNglyc_results_human_proteome_filtered.txt", sep="\t", header=None) 
# netnglyc.dropna(axis=1, how="all", inplace=True)
# netnglyc.columns = [0, "Potential", "Jury agreement", "Result"]
# netnglyc = netnglyc.apply(split_netNglyc, axis=1)
# netnglyc

In [18]:
# save or load dataframe
# netnglyc.to_csv(data_path + "Features/NetNglyc.csv", index=False)
netnglyc = pd.read_csv(data_path + "Features/NetNglyc.csv") 

In [19]:
# filter for predicted glycosylation sites: keep only rows whose "Result" contains
# the literal string "+++" (regex=False, so "+" is not a regex quantifier);
# values such as "-", "+" or "++" do not match and are excluded
netnglyc_pos = netnglyc[netnglyc["Result"].str.contains("+++", regex=False)]
netnglyc_pos[:5]

Unnamed: 0,Uniprot,Position,Sequence,Potential,Jury agreement,Result
35,Q9ULX7,213,NGSL,0.8029,(9/9),+++
42,Q01518,358,NTTL,0.7677,(9/9),+++
72,Q16787,142,NLTL,0.758,(9/9),+++
92,P55268,248,NLTR,0.791,(9/9),+++
102,Q9BYZ2,39,NGTW,0.7504,(9/9),+++


In [20]:
# number of retained (positive) glycosylation sites per protein
sites_per_protein = netnglyc_pos["Uniprot"].value_counts()
glyc_sites = sites_per_protein.reset_index()
# first column: protein ID, second column: number of sites
glyc_sites.columns = ["Uniprot", "Glycosylation"]

In [21]:
# add glycosylation sites as a binary feature
df["NetNGlyc"] = np.where(df["Uniprot"].isin(set(glyc_sites["Uniprot"])), 1, 0)

### GlycoMine

Source: https://glycomine.erc.monash.edu/Lab/GlycoMine/

In [22]:
# Load the GlycoMine predictions per glycosylation type (N, O, C) and keep the
# rows whose "Value" exceeds a type-specific cut-off.
# NOTE(review): the cut-offs 0.5 / 0.502 / 0.555 are not explained here -
# presumably taken from the GlycoMine documentation; confirm and cite.
glycomine_n = pd.read_csv(data_path + "Features/GlycoMine_N_results.txt", sep=" ") 
glycomine_n_pos = glycomine_n[glycomine_n["Value"] > 0.5]

glycomine_o = pd.read_csv(data_path + "Features/GlycoMine_O_results.zip", sep=" ") # text file too big for GitHub
glycomine_o_pos = glycomine_o[glycomine_o["Value"] > 0.502]

glycomine_c = pd.read_csv(data_path + "Features/GlycoMine_C_results.txt", sep=" ") 
glycomine_c_pos = glycomine_c[glycomine_c["Value"] > 0.555]

In [23]:
df["GlycoMine_N"] = np.where(df["Uniprot"].isin(set(glycomine_n_pos["UniProtID"])), 1, 0)
df["GlycoMine_O"] = np.where(df["Uniprot"].isin(set(glycomine_o_pos["UniProtID"])), 1, 0)
df["GlycoMine_C"] = np.where(df["Uniprot"].isin(set(glycomine_c_pos["UniProtID"])), 1, 0)

## Subcellular location prediction (DeepLoc-1.0)

In [24]:
deeploc = pd.read_csv(data_path + "Features/DeepLoc_results_human_proteome.txt", sep="\t")
deeploc.rename(columns={"ID":"Uniprot"}, inplace=True)

# retrieve Uniprot
deeploc["Uniprot"] = deeploc["Uniprot"].apply(get_uniprot)

In [25]:
deeploc["Location"].value_counts(dropna=False)

Cytoplasm                5602
Nucleus                  5430
Cell_membrane            3538
Extracellular            2034
Mitochondrion            1563
Endoplasmic_reticulum    1342
Golgi_apparatus           454
Peroxisome                160
Lysosome/Vacuole          136
Plastid                   117
Name: Location, dtype: int64

In [26]:
# add subcellular locations as binary features
# one column per distinct value of deeploc["Location"], named after that value;
# 1 if the protein's Uniprot ID is predicted for that location, else 0
for i in deeploc["Location"].unique():
    deeploc_subset = deeploc[deeploc["Location"] == i]
    df[i] = np.where(df["Uniprot"].isin(deeploc_subset["Uniprot"]), 1, 0)

## Transmembrane prediction (TMHMM-2.0)
Source: https://services.healthtech.dtu.dk/service.php?TMHMM-2.0

In [27]:
#### TO DO ####
# currently only results for brain detected not full human proteome, redo for entire human proteome?

# TMHMM results had to be split up as webserver only allows 10000 sequences at once
tmhmm_A = pd.read_csv(data_path + "Features/TMHMM_results_brain_detected_A.txt", header=None, sep="\t", 
    names=["Uniprot", "Length", "ExpAA", "First60ExpAA", "PredHel", "Topology"])
tmhmm_B = pd.read_csv(data_path + "Features/TMHMM_results_brain_detected_B.txt", header=None, sep="\t", 
    names=["Uniprot", "Length", "ExpAA", "First60ExpAA", "PredHel", "Topology"])

In [28]:
# Clean both TMHMM result chunks identically: the first column is reduced to the
# Uniprot ID via get_uniprot, the remaining value columns are parsed with get_value.
for tmhmm_part in (tmhmm_A, tmhmm_B):
    tmhmm_part["Uniprot"] = tmhmm_part["Uniprot"].apply(get_uniprot)
    for field in ("Length", "ExpAA", "First60ExpAA", "PredHel"):
        tmhmm_part[field] = tmhmm_part[field].apply(get_value)

In [29]:
# combine results into one dataframe
tmhmm = pd.concat([tmhmm_A, tmhmm_B], axis=0)

In [30]:
#### TO DO ####
# check if accuracy drops if we remove some of these features

# left join keeps every protein; proteins without a TMHMM result get NaN in the
# three TMHMM columns
df = df.merge(tmhmm[["Uniprot", "ExpAA", "First60ExpAA", "PredHel"]], on="Uniprot", how="left")
# NOTE(review): fillna(0) is applied to the whole frame, so a NaN in any other
# column is set to 0 as well, not only the missing TMHMM values
df.fillna(0, inplace=True)
# create a binary feature for transmembrane regions
df["PredHel_binary"] = np.where(df["PredHel"] == 0, 0, 1)
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,Cell_membrane,Endoplasmic_reticulum,Mitochondrion,Golgi_apparatus,Peroxisome,Plastid,ExpAA,First60ExpAA,PredHel,PredHel_binary
0,Q9NWW9,MALARPRPRLGDLIEISRFGYAHWAIYVGDGYVVHLAPASEIAGAG...,162,17393.6480,0.129630,0.006173,0.043210,0.049383,0.012346,0.086420,...,0,0,1,0,0,0,26.95,4.17,1.0,1
1,Q99569,MPAPEQASLVEEGQPQTRQEAASTGPGMEPETTATTILASVKEQEL...,1192,131866.7406,0.064597,0.011745,0.046980,0.050336,0.015940,0.061242,...,0,0,0,0,0,0,0.00,0.00,0.0,0
2,Q02325,MEHKEVVLLLLLFLKSGQGEPLDDYVNTQGPSLFSVTKKQLGAGSR...,96,10970.5316,0.062500,0.041667,0.041667,0.114583,0.052083,0.052083,...,0,0,0,0,0,0,0.00,0.00,0.0,0
3,Q6UXB8,MHGSCSFLMLLLPLLLLLVATTGPVGALTDEEKRLMVELHNLYRAQ...,463,49470.5083,0.097192,0.025918,0.034557,0.084233,0.021598,0.071274,...,1,0,0,0,0,0,26.29,22.27,1.0,1
4,P62195,MALDGPEQMELEEGKAGSGLRQYYLSKIEELQLIVNDKSQNLRRLQ...,406,45625.5008,0.061576,0.007389,0.054187,0.093596,0.022167,0.071429,...,0,0,0,0,0,0,0.00,0.00,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20298,Q96T66,MKSRIPVVLLACGSFNPITNMHLRMFEVARDHLHQTGMYQVIQGII...,252,28321.3452,0.071429,0.011905,0.043651,0.047619,0.019841,0.067460,...,0,0,1,0,0,0,0.72,0.72,0.0,0
20299,Q96NY8,MPLSLGAEMWGPEAWLLLLLLLASFTGRCPAGELETSDVVTVVLGQ...,510,55453.6413,0.064706,0.017647,0.043137,0.084314,0.021569,0.086275,...,1,0,0,0,0,0,0.00,0.00,0.0,0
20300,P07197,MSYTLDSLGNPSAYRRVTETRSSFSRVSGSPSSGFRSQSWSRGSPS...,916,102470.7832,0.075328,0.001092,0.038210,0.201965,0.012009,0.055677,...,0,0,0,0,0,0,0.00,0.00,0.0,0
20301,Q69YL0,MVLRRLLAALLHSPQLVERLSESRPIRRAAQLTAFALLQAQLRGQD...,99,10890.4720,0.141414,0.010101,0.030303,0.040404,0.030303,0.090909,...,0,0,1,0,0,0,0.00,0.00,0.0,0


## GPI-Anchor prediction (NetGPI-1.1)

In [31]:
#### TO DO ####
# currently only results for brain detected not full human proteome, redo for entire human proteome?

# NetGPI results had to be split up as webserver only allows 5000 sequences at once
# header=1 combined with names: the first two lines of each file are not read
# as data and the columns are named by `names` instead
netgpi_A = pd.read_csv(data_path + "Features/NetGPI_results_brain_detected_A.txt", sep="\t", header=1,
    names=["Uniprot", "Length", "Result", "Omega-site", "Likelihood", "Amino acid"]) 
netgpi_B = pd.read_csv(data_path + "Features/NetGPI_results_brain_detected_B.txt", sep="\t", header=1,
    names=["Uniprot", "Length", "Result", "Omega-site", "Likelihood", "Amino acid"]) 
netgpi_C = pd.read_csv(data_path + "Features/NetGPI_results_brain_detected_C.txt", sep="\t", header=1,
    names=["Uniprot", "Length", "Result", "Omega-site", "Likelihood", "Amino acid"]) 
netgpi_D = pd.read_csv(data_path + "Features/NetGPI_results_brain_detected_D.txt", sep="\t", header=1,
    names=["Uniprot", "Length", "Result", "Omega-site", "Likelihood", "Amino acid"]) 

In [32]:
# combine results into one dataframe
netgpi = pd.concat([netgpi_A, netgpi_B, netgpi_C, netgpi_D], axis=0)
netgpi["Uniprot"] = netgpi["Uniprot"].apply(get_uniprot)

# keep only proteins predicted to have GPI anchor
netgpi_pos = netgpi[netgpi["Result"] == "GPI-Anchored"]

In [33]:
df["GPI-anchor"] = np.where(df["Uniprot"].isin(netgpi_pos["Uniprot"]), 1, 0)

## Domains

In [34]:
#### TO DO ####
# move to utils
# sort motifs

def read_uniprot_list(file):
    """
    Read a text file from the Features directory into a list of strings.

    Parameters
    ----------
    file : str
        File name relative to data_path + "Features/".

    Returns
    -------
    list of str
        One entry per line of the file, in file order, with leading and
        trailing whitespace removed. Blank lines yield empty strings.
    """
    # the context manager closes the handle, which was previously left open
    with open(data_path + "Features/" + file, "r") as handle:
        return [line.strip() for line in handle]

### Cadherin-1 (PS00232)

In [35]:
# flag proteins listed in PS00232.txt (1 = listed, 0 = not listed) and show the class counts
PS00232 = read_uniprot_list("PS00232.txt")
df["PS00232"] = np.where(df["Uniprot"].isin(PS00232), 1, 0)
df["PS00232"].value_counts()

0    20253
1       50
Name: PS00232, dtype: int64

### G-protein receptor F1 (PS00237)

In [36]:
# flag proteins listed in PS00237.txt (1 = listed, 0 = not listed) and show the class counts
PS00237 = read_uniprot_list("PS00237.txt")
df["PS00237"] = np.where(df["Uniprot"].isin(PS00237), 1, 0)
df["PS00237"].value_counts()

0    20215
1       88
Name: PS00237, dtype: int64

### Homeobox (PS00027)

In [37]:
# flag proteins listed in PS00027.txt (1 = listed, 0 = not listed) and show the class counts
PS00027 = read_uniprot_list("PS00027.txt")
df["PS00027"] = np.where(df["Uniprot"].isin(PS00027), 1, 0)
df["PS00027"].value_counts()

0    20237
1       66
Name: PS00027, dtype: int64

### Zinc Finger C2H2 (PS00028)

In [38]:
# flag proteins listed in PS00028.txt (1 = listed, 0 = not listed) and show the class counts
PS00028 = read_uniprot_list("PS00028.txt")
df["PS00028"] = np.where(df["Uniprot"].isin(PS00028), 1, 0)
df["PS00028"].value_counts()

0    20245
1       58
Name: PS00028, dtype: int64

### EGF1 (PS00022)

In [39]:
# flag proteins listed in PS00022.txt (1 = listed, 0 = not listed) and show the class counts
PS00022 = read_uniprot_list("PS00022.txt")
df["PS00022"] = np.where(df["Uniprot"].isin(PS00022), 1, 0)
df["PS00022"].value_counts()

0    20258
1       45
Name: PS00022, dtype: int64

### EGF2 (PS01186)

In [40]:
# flag proteins listed in PS01186.txt (1 = listed, 0 = not listed) and show the class counts
PS01186 = read_uniprot_list("PS01186.txt")
df["PS01186"] = np.where(df["Uniprot"].isin(PS01186), 1, 0)
df["PS01186"].value_counts()

0    20261
1       42
Name: PS01186, dtype: int64

# Annotations (not included with machine learning model)

## Nucleotide-binding proteins
Source: http://biomine.cs.vcu.edu/servers/DRNApred/

In [41]:
# read both DRNApred result lists as one string per line; the next cell tests
# df["Uniprot"] for membership in them
with open(data_path + "Features/DRNApred_RNA_Uniprot_human_proteome.txt") as f:  
    RNA_bind = f.read().splitlines() 
with open(data_path + "Features/DRNApred_DNA_Uniprot_human_proteome.txt") as f:  
    DNA_bind = f.read().splitlines() 

In [42]:
df["RNA_binding"] = np.where(df["Uniprot"].isin(RNA_bind), 1, 0)
df["DNA_binding"] = np.where(df["Uniprot"].isin(DNA_bind), 1, 0)

## Ectodomain

SheddomeDB: https://doi.org/10.1186/s12859-017-1465-7 (Table S1)


DeepSMP source: http://www.csbg-jlu.info/DeepSMP/browse.php

In [43]:
# add ectodomain annotation as a feature: 1 if the protein is listed in
# SheddomeDB or in the DeepSMP set of known shedding proteins, otherwise 0
# (data_path already ends with "/", so the relative paths must not start with one)
shed_SheddomeDB = pd.read_csv(data_path + "Features/Shedding_proteins_SheddomeDB_UniProt.txt", header=None, names=["Uniprot"])
df["Ectodomain_shedding"] = np.where(df["Uniprot"].isin(shed_SheddomeDB["Uniprot"]), 1, 0)

shed_DeepSMP = pd.read_csv(data_path + "Features/Known_shedding_proteins_DeepSMP.txt", sep="\t")
# keep the SheddomeDB flag where a protein is not part of the DeepSMP list
df["Ectodomain_shedding"] = np.where(df["Uniprot"].isin(shed_DeepSMP["Uniprot ID"]), 1, df["Ectodomain_shedding"])

## EV

In [44]:
ev_filtered = pd.read_csv(data_path + "Features/EV_proteome_filtered.csv")
ev_filtered = ev_filtered[ev_filtered["EV"] == 1]["id"]

In [45]:
df["EV"] = np.where(df["Uniprot"].isin(ev_filtered), 1, 0)

## CSF presence (Label)

In [46]:
# class label: 1 if the Uniprot ID occurs in the curated CSF data set, -1 otherwise
df["CSF"] = np.where(df["Uniprot"].isin(csf["Uniprot"]), 1, -1)
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,PS00237,PS00027,PS00028,PS00022,PS01186,RNA_binding,DNA_binding,Ectodomain_shedding,EV,CSF
0,Q9NWW9,MALARPRPRLGDLIEISRFGYAHWAIYVGDGYVVHLAPASEIAGAG...,162,17393.6480,0.129630,0.006173,0.043210,0.049383,0.012346,0.086420,...,0,0,0,0,0,0,0,0,0,-1
1,Q99569,MPAPEQASLVEEGQPQTRQEAASTGPGMEPETTATTILASVKEQEL...,1192,131866.7406,0.064597,0.011745,0.046980,0.050336,0.015940,0.061242,...,0,0,0,0,0,0,0,0,1,-1
2,Q02325,MEHKEVVLLLLLFLKSGQGEPLDDYVNTQGPSLFSVTKKQLGAGSR...,96,10970.5316,0.062500,0.041667,0.041667,0.114583,0.052083,0.052083,...,0,0,0,0,0,0,0,0,0,-1
3,Q6UXB8,MHGSCSFLMLLLPLLLLLVATTGPVGALTDEEKRLMVELHNLYRAQ...,463,49470.5083,0.097192,0.025918,0.034557,0.084233,0.021598,0.071274,...,0,0,0,0,0,0,0,0,0,1
4,P62195,MALDGPEQMELEEGKAGSGLRQYYLSKIEELQLIVNDKSQNLRRLQ...,406,45625.5008,0.061576,0.007389,0.054187,0.093596,0.022167,0.071429,...,0,0,0,0,0,0,0,0,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20298,Q96T66,MKSRIPVVLLACGSFNPITNMHLRMFEVARDHLHQTGMYQVIQGII...,252,28321.3452,0.071429,0.011905,0.043651,0.047619,0.019841,0.067460,...,0,0,0,0,0,0,0,0,0,-1
20299,Q96NY8,MPLSLGAEMWGPEAWLLLLLLLASFTGRCPAGELETSDVVTVVLGQ...,510,55453.6413,0.064706,0.017647,0.043137,0.084314,0.021569,0.086275,...,0,0,0,0,0,0,0,1,1,1
20300,P07197,MSYTLDSLGNPSAYRRVTETRSSFSRVSGSPSSGFRSQSWSRGSPS...,916,102470.7832,0.075328,0.001092,0.038210,0.201965,0.012009,0.055677,...,0,0,0,0,0,0,0,0,1,1
20301,Q69YL0,MVLRRLLAALLHSPQLVERLSESRPIRRAAQLTAFALLQAQLRGQD...,99,10890.4720,0.141414,0.010101,0.030303,0.040404,0.030303,0.090909,...,0,0,0,0,0,0,0,0,0,-1


In [47]:
# drop physicochemical features
# NOTE: the drop is done in place, so re-running this cell without rebuilding df
# raises a KeyError because the columns no longer exist
df.drop(['Polar', 'Neutral', 'Hydrophobic', 'Volume_small', 'Volume_medium', 'Volume_large', 'Polarity_low', 
    'Polarity_medium', 'Polarity_large', 'Polarizability_low', 'Polarizability_medium', 'Polarizability_large',
    'Charge_positive', 'Charge_neutral', 'Charge_negative', 'Buried', 'Exposed', 'Intermediate'], 
    axis=1, inplace=True)

# Mass spectrometry-detectable human proteome

In [48]:
# collect the proteins with evidence on ProteomicsDB: all listed IDs, and the
# subset whose evidence value is greater than zero
pos = set(ProteomicsDB_evidence_positive)
pos_filtered_low = {
    protein
    for protein, evidence in ProteomicsDB_evidence_positive.items()
    if evidence > 0
}
print("Number of proteins in positive set:", len(pos))
print("Number of proteins in positive set with evidence score of 1 or higher:", len(pos_filtered_low))

Number of proteins in positive set: 18997
Number of proteins in positive set with evidence score of 1 or higher: 16791


In [49]:
# filter feature dataset for MS detected proteins
df_MS_filtered = df[df["Uniprot"].isin(pos_filtered_low)]
df_MS_filtered

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,PS00237,PS00027,PS00028,PS00022,PS01186,RNA_binding,DNA_binding,Ectodomain_shedding,EV,CSF
0,Q9NWW9,MALARPRPRLGDLIEISRFGYAHWAIYVGDGYVVHLAPASEIAGAG...,162,17393.6480,0.129630,0.006173,0.043210,0.049383,0.012346,0.086420,...,0,0,0,0,0,0,0,0,0,-1
1,Q99569,MPAPEQASLVEEGQPQTRQEAASTGPGMEPETTATTILASVKEQEL...,1192,131866.7406,0.064597,0.011745,0.046980,0.050336,0.015940,0.061242,...,0,0,0,0,0,0,0,0,1,-1
3,Q6UXB8,MHGSCSFLMLLLPLLLLLVATTGPVGALTDEEKRLMVELHNLYRAQ...,463,49470.5083,0.097192,0.025918,0.034557,0.084233,0.021598,0.071274,...,0,0,0,0,0,0,0,0,0,1
4,P62195,MALDGPEQMELEEGKAGSGLRQYYLSKIEELQLIVNDKSQNLRRLQ...,406,45625.5008,0.061576,0.007389,0.054187,0.093596,0.022167,0.071429,...,0,0,0,0,0,0,0,0,1,-1
5,Q13523,MAAAETQSLREQPEMEDANSEKSINEENGEVSEDQSQNKHSRHKKK...,1007,116985.4728,0.044687,0.005958,0.069513,0.076465,0.020854,0.042701,...,0,0,0,0,0,1,0,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20297,Q9NQS3,MARTLRPSPLCPGGGKAQLSSASLLGAGLLLQPPTPPPLLLLLFPL...,549,61001.7320,0.056466,0.016393,0.060109,0.051002,0.038251,0.065574,...,0,0,0,0,0,0,0,0,1,1
20298,Q96T66,MKSRIPVVLLACGSFNPITNMHLRMFEVARDHLHQTGMYQVIQGII...,252,28321.3452,0.071429,0.011905,0.043651,0.047619,0.019841,0.067460,...,0,0,0,0,0,0,0,0,0,-1
20299,Q96NY8,MPLSLGAEMWGPEAWLLLLLLLASFTGRCPAGELETSDVVTVVLGQ...,510,55453.6413,0.064706,0.017647,0.043137,0.084314,0.021569,0.086275,...,0,0,0,0,0,0,0,1,1,1
20300,P07197,MSYTLDSLGNPSAYRRVTETRSSFSRVSGSPSSGFRSQSWSRGSPS...,916,102470.7832,0.075328,0.001092,0.038210,0.201965,0.012009,0.055677,...,0,0,0,0,0,0,0,0,1,1


# Save feature dataframes

In [50]:
# save entire feature data set
df.to_csv(data_path + "Features/df_features.csv", index=False)
print("Number of proteins:", len(df))
print("Number of CSF proteins:", len(df[df["CSF"] == 1]))
print("Number of non-CSF proteins:", len(df[df["CSF"] == -1]))

Number of proteins: 20303
Number of CSF proteins: 5240
Number of non-CSF proteins: 15063


In [51]:
# save the MS-filtered feature data set (proteins with ProteomicsDB evidence > 0)
df_MS_filtered.to_csv(data_path + "Features/df_features_MS_filtered.csv", index=False)
print("Number of proteins:", len(df_MS_filtered))
print("Number of CSF proteins:", len(df_MS_filtered[df_MS_filtered["CSF"] == 1]))
print("Number of non-CSF proteins:", len(df_MS_filtered[df_MS_filtered["CSF"] == -1]))

Number of proteins: 16756
Number of CSF proteins: 4973
Number of non-CSF proteins: 11783


## Brain detected

In [52]:
df_brain_detected = df[df["Uniprot"].isin(brain_detected["Uniprot"])]
df_brain_detected.to_csv(data_path + "Features/df_features_brain_detected.csv", index=False)
print("Number of proteins:", len(df_brain_detected))

Number of proteins: 15990


In [53]:
df_brain_detected_MS_filtered = df_MS_filtered[df_MS_filtered["Uniprot"].isin(brain_detected["Uniprot"])]
df_brain_detected_MS_filtered.to_csv(data_path + "Features/df_features_brain_detected_MS_filtered.csv", index=False)
print("Number of proteins:", len(df_brain_detected_MS_filtered))

Number of proteins: 14662


## Brain elevated

In [54]:
# filter for brain elevated proteins (data set to be used for model training)
df_brain_elevated = df[df["Uniprot"].isin(brain_elevated["Uniprot"])]
df_brain_elevated.to_csv(data_path + "Features/df_features_brain_elevated.csv", index=False)
print("Number of proteins:", len(df_brain_elevated))
print("Number of CSF proteins:", len(df_brain_elevated[df_brain_elevated["CSF"] == 1]))
print("Number of non-CSF proteins:", len(df_brain_elevated[df_brain_elevated["CSF"] == -1]))

Number of proteins: 2542
Number of CSF proteins: 952
Number of non-CSF proteins: 1590


In [55]:
# filter for brain elevated proteins (data set to be used for model training)
df_brain_elevated_MS_filtered = df_MS_filtered[df_MS_filtered["Uniprot"].isin(brain_elevated["Uniprot"])]
df_brain_elevated_MS_filtered.to_csv(data_path + "Features/df_features_brain_elevated_MS_filtered.csv", index=False)
print("Number of proteins:", len(df_brain_elevated_MS_filtered))
print("Number of CSF proteins:", len(df_brain_elevated_MS_filtered[df_brain_elevated_MS_filtered["CSF"] == 1]))
print("Number of non-CSF proteins:", len(df_brain_elevated_MS_filtered[df_brain_elevated_MS_filtered["CSF"] == -1]))

Number of proteins: 2079
Number of CSF proteins: 892
Number of non-CSF proteins: 1187


## Brain detected without training & testing data

In [56]:
# filter for brain detected proteins not found in brain elevated set (to be used for biomarker discovery)
brain_detected_val = set(df_brain_detected["Uniprot"]) - set(df_brain_elevated["Uniprot"])

df_brain_detected_val = df[df["Uniprot"].isin(brain_detected_val)]
df_brain_detected_val.to_csv(data_path + "Features/df_features_brain_detected_val.csv", index=False)
print("Number of proteins:", len(df_brain_detected_val))

Number of proteins: 13448


In [57]:
# filter for brain detected proteins not found in brain elevated set (to be used for biomarker discovery)
brain_detected_val_MS_filtered = set(df_brain_detected_MS_filtered["Uniprot"]) - set(df_brain_elevated_MS_filtered["Uniprot"])

# subset the MS-filtered table, like the other *_MS_filtered frames; every ID in
# the set comes from df_MS_filtered, so the selected rows are the same as when
# subsetting df
df_brain_detected_val_MS_filtered = df_MS_filtered[df_MS_filtered["Uniprot"].isin(brain_detected_val_MS_filtered)]
df_brain_detected_val_MS_filtered.to_csv(data_path + "Features/df_features_brain_detected_val_MS_filtered.csv", index=False)
print("Number of proteins:", len(df_brain_detected_val_MS_filtered))

Number of proteins: 12583
