# Import libraries and data

In [1]:
# import libraries 
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

from Bio import SeqIO
from utils import keep_first_uniprot, protein_analysis, get_uniprot, get_value

In [2]:
# Root folder of all data sets, resolved relative to the current working directory
# (always ends with "/" so sub-paths can be appended directly).
data_path = f"{os.getcwd()}/Datasets/"

## Curated CSF 1000+ proteins data set

In [3]:
csf = pd.read_csv(data_path + "CSF/csf.csv")
csf

Unnamed: 0,Uniprot,#Peptides_Macron2018A,#Peptides_Macron2020,#Peptides_Zhang2015,#Peptides_Guldbrandsen2014,#Peptides_Macron2018B,#Peptides_Schutzer2010,#Peptides_Pan2007,#Studies
0,Q6K0P9,2.0,,,,,,,1
1,Q9GZZ8,1.0,3.0,,,,,,2
2,P09529,3.0,3.0,4.0,4.0,1.0,,,5
3,P61019,2.0,3.0,,,2.0,,,3
4,Q9GZX9,4.0,4.0,3.0,3.0,4.0,5.0,1.0,7
...,...,...,...,...,...,...,...,...,...
5719,Q9ULJ1,,,,,,,1.0,1
5720,Q9BTA9,,,,,,,1.0,1
5721,Q86VF7,,,,,,,1.0,1
5722,Q8NDV3,,,,,,,1.0,1


## Detected brain proteome Uniprot sequences

In [4]:
# Load the detected brain proteome; the file's own header row is replaced by the
# two column names given below.
df = pd.read_csv(
    f"{data_path}Brain/Human_brain_detected_Uniprot_seq.tab",
    sep="\t",
    header=0,
    names=["Uniprot", "Sequence"],
).dropna(subset=["Sequence"])  # entries without sequence are obsolete (3 entries dropped)

# drop entries whose sequence contains one of the residue codes B, U or X (1 entry dropped)
df = df[~df["Sequence"].str.contains("B|U|X")]
df

Unnamed: 0,Uniprot,Sequence
0,P01023,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...
1,A8K2U0,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...
2,Q9NPC4,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...
3,Q9NRG9,MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...
4,Q86V21,MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...
...,...,...
16046,Q2QGD7,MDLPALLPAPTARGGQHGGGPGPLRRAPAPLGASPARRRLLLVRGP...
16047,Q9C0D3,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...
16048,Q15942,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...
16049,O43149,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...


## Elevated brain proteome (HPA)

In [5]:
brain_elevated = pd.read_csv(data_path + "Brain/Brain_elevated.csv")
print("Number of brain elevated proteins:", len(brain_elevated))

Number of brain elevated proteins: 2546


# Feature generation

## Sequence length

In [6]:
df["Length"] = df["Sequence"].apply(len)

## Amino acid composition & attributes

In [7]:
# df = df.apply(protein_analysis, seq_col="Sequence", axis=1)

In [8]:
# save or load dataframe
# df.to_csv(data_path + "Features/df_features_PA_brain_detected.csv", index=False)
# The cached feature table replaces the `df` assembled in the cells above.
# NOTE(review): the rendered outputs show a last row index of 16049 for the freshly
# loaded proteome but 16019 for this cached table, so the cache may have been written
# from a differently filtered input — confirm before relying on a fresh re-computation.
df = pd.read_csv(f"{data_path}Features/df_features_PA_brain_detected.csv")
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,Polarity_large,Polarizability_low,Polarizability_medium,Polarizability_large,Charge_positive,Charge_neutral,Charge_negative,Buried,Exposed,Intermediate
0,P01023,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,1474,163289.0320,0.063094,0.016961,0.035957,0.070556,0.042062,0.061737,...,0.318182,0.312754,0.463365,0.223881,0.091588,0.801900,0.106513,0.421303,0.312754,0.287653
1,A8K2U0,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,1454,161104.6038,0.062586,0.017194,0.045392,0.055708,0.046080,0.066713,...,0.297799,0.328061,0.451169,0.220770,0.082531,0.816369,0.101100,0.426410,0.310867,0.292297
2,Q9NPC4,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...,353,40498.6979,0.065156,0.022663,0.033994,0.048159,0.059490,0.056657,...,0.286119,0.269122,0.441926,0.288952,0.107649,0.810198,0.082153,0.453258,0.271955,0.291785
3,Q9NRG9,MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...,546,59573.4321,0.069597,0.027473,0.032967,0.047619,0.038462,0.082418,...,0.250916,0.344322,0.448718,0.206960,0.080586,0.838828,0.080586,0.476190,0.260073,0.305861
4,Q86V21,MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...,672,75143.3063,0.066964,0.014881,0.043155,0.077381,0.041667,0.078869,...,0.316964,0.306548,0.436012,0.257440,0.102679,0.776786,0.120536,0.453869,0.296131,0.254464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16016,Q2QGD7,MDLPALLPAPTARGGQHGGGPGPLRRAPAPLGASPARRRLLLVRGP...,858,89987.2867,0.097902,0.029138,0.040793,0.047786,0.033800,0.102564,...,0.291375,0.399767,0.406760,0.193473,0.097902,0.813520,0.088578,0.428904,0.298368,0.317016
16017,Q9C0D3,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,744,83920.0626,0.075269,0.025538,0.048387,0.061828,0.043011,0.038978,...,0.349462,0.288978,0.479839,0.231183,0.098118,0.791667,0.110215,0.448925,0.297043,0.241935
16018,Q15942,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,572,61276.6742,0.089161,0.036713,0.036713,0.059441,0.036713,0.064685,...,0.309441,0.304196,0.513986,0.181818,0.085664,0.818182,0.096154,0.363636,0.424825,0.353147
16019,O43149,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,2961,331071.6366,0.076663,0.029044,0.055724,0.073961,0.038501,0.053023,...,0.340763,0.320162,0.449510,0.230328,0.103681,0.766633,0.129686,0.433637,0.310030,0.254306


## Structural features (NetSurfP-2.0)

In [9]:
# Keep the identifier plus the four structural columns and rename them to the
# feature names used in the rest of this notebook.
nsp_features = pd.read_csv(f"{data_path}Features/features_human_proteome_no_filtering.csv")[
    ["id", "disorder", "helix", "turn", "sheet"]
].rename(
    columns={
        "id": "Uniprot",
        "disorder": "Disorder_NSP",
        "helix": "Helix_NSP",
        "turn": "Turn_NSP",
        "sheet": "Sheet_NSP",
    }
)

In [10]:
# add structural features to feature dataframe
df = df.merge(nsp_features, on="Uniprot", how="inner")
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,Charge_positive,Charge_neutral,Charge_negative,Buried,Exposed,Intermediate,Disorder_NSP,Helix_NSP,Turn_NSP,Sheet_NSP
0,P01023,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,1474,163289.0320,0.063094,0.016961,0.035957,0.070556,0.042062,0.061737,...,0.091588,0.801900,0.106513,0.421303,0.312754,0.287653,0.021031,0.162144,0.435550,0.402307
1,A8K2U0,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,1454,161104.6038,0.062586,0.017194,0.045392,0.055708,0.046080,0.066713,...,0.082531,0.816369,0.101100,0.426410,0.310867,0.292297,0.015131,0.156809,0.440853,0.402338
2,Q9NPC4,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...,353,40498.6979,0.065156,0.022663,0.033994,0.048159,0.059490,0.056657,...,0.107649,0.810198,0.082153,0.453258,0.271955,0.291785,0.206799,0.345609,0.521246,0.133144
3,Q9NRG9,MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...,546,59573.4321,0.069597,0.027473,0.032967,0.047619,0.038462,0.082418,...,0.080586,0.838828,0.080586,0.476190,0.260073,0.305861,0.128205,0.100733,0.553114,0.346154
4,Q86V21,MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...,672,75143.3063,0.066964,0.014881,0.043155,0.077381,0.041667,0.078869,...,0.102679,0.776786,0.120536,0.453869,0.296131,0.254464,0.013393,0.325893,0.464286,0.209821
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16015,Q2QGD7,MDLPALLPAPTARGGQHGGGPGPLRRAPAPLGASPARRRLLLVRGP...,858,89987.2867,0.097902,0.029138,0.040793,0.047786,0.033800,0.102564,...,0.097902,0.813520,0.088578,0.428904,0.298368,0.317016,0.383450,0.129371,0.745921,0.124709
16016,Q9C0D3,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,744,83920.0626,0.075269,0.025538,0.048387,0.061828,0.043011,0.038978,...,0.098118,0.791667,0.110215,0.448925,0.297043,0.241935,0.038978,0.536290,0.389785,0.073925
16017,Q15942,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,572,61276.6742,0.089161,0.036713,0.036713,0.059441,0.036713,0.064685,...,0.085664,0.818182,0.096154,0.363636,0.424825,0.353147,0.622378,0.075175,0.821678,0.103147
16018,O43149,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,2961,331071.6366,0.076663,0.029044,0.055724,0.073961,0.038501,0.053023,...,0.103681,0.766633,0.129686,0.433637,0.310030,0.254306,0.107396,0.462682,0.431949,0.105370


## Solubility

In [11]:
# Per-residue weights averaged over a sequence by `sol` below
# ("U" is given the same weight as "C").
weights = {"A": 0.8356471476582918, "C": 0.5208088354857734, "U": 0.5208088354857734, "E": 0.9876987431418378,
           "D": 0.9079044671339564, "G": 0.7997168496420723, "F": 0.5849790194237692, "I": 0.6784124413866582,
           "H": 0.8947913996466419, "K": 0.9267104557513497, "L": 0.6554221515081433, "M": 0.6296623675420369, 
           "N": 0.8597433107431216, "Q": 0.789434648348208, "P": 0.8235328714705341, "S": 0.7440908318492778,
           "R": 0.7712466317693457, "T": 0.8096922697856334, "W": 0.6374678690957594, "V": 0.7357837119163659,
           "Y": 0.6112801822947587}

# Slope (A) and intercept (B) of the logistic transform applied to the mean weight.
A = 81.0581
B = -62.7775

def sol(seq):
    """Return a solubility score between 0 and 1 for an amino acid sequence.

    The score is the logistic function of ``A * SWI + B``, where ``SWI`` is the
    mean of the per-residue ``weights`` over all characters of ``seq``.

    Parameters
    ----------
    seq : str
        Amino acid sequence in one-letter code.

    Returns
    -------
    float
        ``1 / (1 + exp(-(A * SWI + B)))``.

    Raises
    ------
    KeyError
        If ``seq`` contains a character that has no entry in ``weights``.
    """
    swi = np.mean([weights[residue] for residue in seq])
    # use the named coefficients instead of repeating their numeric values
    return 1 / (1 + np.exp(-(A * swi + B)))

In [12]:
df["Solubility"] = df["Sequence"].apply(sol)

## Transmembrane prediction

In [13]:
# TMHMM results had to be split up as webserver only allows 10000 sequences at once;
# both parts are header-less, tab-separated and share the same column layout.
tmhmm_columns = ["Uniprot", "Length", "ExpAA", "First60ExpAA", "PredHel", "Topology"]
tmhmm_A, tmhmm_B = (
    pd.read_csv(
        data_path + f"Features/TMHMM_results_brain_detected_{part}.txt",
        header=None,
        sep="\t",
        names=tmhmm_columns,
    )
    for part in ("A", "B")
)

In [14]:
# Clean both TMHMM parts the same way: `get_uniprot` is applied to the identifier
# column and `get_value` to each of the four value columns (both helpers come from utils).
for tmhmm_part in (tmhmm_A, tmhmm_B):
    tmhmm_part["Uniprot"] = tmhmm_part["Uniprot"].apply(get_uniprot)
    for value_col in ("Length", "ExpAA", "First60ExpAA", "PredHel"):
        tmhmm_part[value_col] = tmhmm_part[value_col].apply(get_value)

In [15]:
# combine results into one dataframe
tmhmm = pd.concat([tmhmm_A, tmhmm_B], axis=0)

In [16]:
#### TO DO ####
# check why inner merge leads to missing entries
# why do we not have TMHMM predictions for all brain proteins?

# The left merge keeps every protein; proteins without a TMHMM entry get NaN in the
# three TMHMM columns. Only those columns are filled with 0, so that missing values
# in unrelated feature columns are not silently converted to 0 as well.
tmhmm_cols = ["ExpAA", "First60ExpAA", "PredHel"]
df = df.merge(tmhmm[["Uniprot"] + tmhmm_cols], on="Uniprot", how="left")
df[tmhmm_cols] = df[tmhmm_cols].fillna(0)
# 1 where PredHel is non-zero, else 0 (proteins without a TMHMM entry therefore get 0)
df["PredHel_binary"] = np.where(df["PredHel"] == 0, 0, 1)
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,Intermediate,Disorder_NSP,Helix_NSP,Turn_NSP,Sheet_NSP,Solubility,ExpAA,First60ExpAA,PredHel,PredHel_binary
0,P01023,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,1474,163289.0320,0.063094,0.016961,0.035957,0.070556,0.042062,0.061737,...,0.287653,0.021031,0.162144,0.435550,0.402307,0.571832,2.37,2.36,0.0,0
1,A8K2U0,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,1454,161104.6038,0.062586,0.017194,0.045392,0.055708,0.046080,0.066713,...,0.292297,0.015131,0.156809,0.440853,0.402338,0.434984,0.05,0.04,0.0,0
2,Q9NPC4,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...,353,40498.6979,0.065156,0.022663,0.033994,0.048159,0.059490,0.056657,...,0.291785,0.206799,0.345609,0.521246,0.133144,0.245223,22.85,22.81,1.0,1
3,Q9NRG9,MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...,546,59573.4321,0.069597,0.027473,0.032967,0.047619,0.038462,0.082418,...,0.305861,0.128205,0.100733,0.553114,0.346154,0.370674,0.03,0.00,0.0,0
4,Q86V21,MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...,672,75143.3063,0.066964,0.014881,0.043155,0.077381,0.041667,0.078869,...,0.254464,0.013393,0.325893,0.464286,0.209821,0.530103,2.07,0.01,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16015,Q2QGD7,MDLPALLPAPTARGGQHGGGPGPLRRAPAPLGASPARRRLLLVRGP...,858,89987.2867,0.097902,0.029138,0.040793,0.047786,0.033800,0.102564,...,0.317016,0.383450,0.129371,0.745921,0.124709,0.663723,0.00,0.00,0.0,0
16016,Q9C0D3,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,744,83920.0626,0.075269,0.025538,0.048387,0.061828,0.043011,0.038978,...,0.241935,0.038978,0.536290,0.389785,0.073925,0.447010,1.58,0.00,0.0,0
16017,Q15942,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,572,61276.6742,0.089161,0.036713,0.036713,0.059441,0.036713,0.064685,...,0.353147,0.622378,0.075175,0.821678,0.103147,0.749494,0.00,0.00,0.0,0
16018,O43149,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,2961,331071.6366,0.076663,0.029044,0.055724,0.073961,0.038501,0.053023,...,0.254306,0.107396,0.462682,0.431949,0.105370,0.528970,0.05,0.00,0.0,0


## Subcellular location prediction

In [17]:
# Load the DeepLoc predictions, rename the "ID" column to "Uniprot" and
# retrieve the Uniprot ID from it with the `get_uniprot` helper.
deeploc = (
    pd.read_csv(f"{data_path}Features/DeepLoc_results_human_proteome.txt", sep="\t")
    .rename(columns={"ID": "Uniprot"})
    .assign(Uniprot=lambda table: table["Uniprot"].apply(get_uniprot))
)
deeploc

Unnamed: 0,Uniprot,Location,Membrane,Nucleus,Cytoplasm,Extracellular,Mitochondrion,Cell_membrane,Endoplasmic_reticulum,Plastid,Golgi_apparatus,Lysosome/Vacuole,Peroxisome
0,Q8WZ42,Cytoplasm,0.0054,0.0528,0.9364,0.0001,0.0000,0.0090,0.0001,0.0001,0.0002,0.0008,0.0006
1,Q8WXI7,Cytoplasm,0.2836,0.1055,0.6656,0.0005,0.0002,0.2102,0.0101,0.0001,0.0041,0.0026,0.0012
2,Q8NF91,Cytoplasm,0.1989,0.0707,0.5495,0.0002,0.0008,0.0319,0.0229,0.0001,0.2861,0.0377,0.0001
3,Q7Z5P9,Extracellular,0.0550,0.0407,0.1452,0.7810,0.0033,0.0277,0.0007,0.0000,0.0002,0.0011,0.0000
4,Q5VST9,Cytoplasm,0.0108,0.0871,0.8976,0.0002,0.0000,0.0140,0.0001,0.0000,0.0002,0.0003,0.0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20371,P02729,Extracellular,0.0006,0.0031,0.0268,0.9364,0.0315,0.0000,0.0000,0.0022,0.0000,0.0001,0.0000
20372,P0DOY5,Extracellular,0.1216,0.0002,0.0047,0.7318,0.1119,0.0346,0.0005,0.0144,0.0011,0.1006,0.0002
20373,P01858,Extracellular,0.0399,0.2627,0.0592,0.6342,0.0264,0.0119,0.0008,0.0000,0.0047,0.0000,0.0000
20374,P0DPI4,Extracellular,0.3387,0.0396,0.0151,0.4207,0.1285,0.2067,0.0033,0.0027,0.0677,0.1158,0.0000


In [18]:
deeploc["Location"].value_counts(dropna=False)

Cytoplasm                5602
Nucleus                  5430
Cell_membrane            3538
Extracellular            2034
Mitochondrion            1563
Endoplasmic_reticulum    1342
Golgi_apparatus           454
Peroxisome                160
Lysosome/Vacuole          136
Plastid                   117
Name: Location, dtype: int64

In [19]:
# add subcellular locations as binary features: one column per distinct "Location"
# value, set to 1 for proteins that `deeploc` lists with that location and 0 otherwise
for location in deeploc["Location"].unique():
    ids_at_location = deeploc.loc[deeploc["Location"] == location, "Uniprot"]
    df[location] = np.where(df["Uniprot"].isin(ids_at_location), 1, 0)

##  Domains

In [20]:
def read_uniprot_list(file, base_dir=None):
    """Read a text file with one entry per line into a list of stripped strings.

    Parameters
    ----------
    file : str
        Name of the file, relative to ``base_dir``.
    base_dir : str, optional
        Directory that contains ``file``. When omitted, the ``Features/`` folder
        below the notebook-level ``data_path`` is used.

    Returns
    -------
    list of str
        One item per line, in file order, with leading/trailing whitespace
        (including the line break) removed. Blank lines are kept as ``""``.
    """
    if base_dir is None:
        path = data_path + "Features/" + file
    else:
        path = os.path.join(base_dir, file)

    # the context manager closes the handle even if reading fails
    with open(path, "r") as handle:
        return [line.strip() for line in handle]

### Cadherin-1 (PS00232)

In [21]:
PS00232 = read_uniprot_list("PS00232.txt")
df["PS00232"] = np.where(df["Uniprot"].isin(PS00232), 1, 0)
df["PS00232"].value_counts()

0    15970
1       50
Name: PS00232, dtype: int64

### G-protein receptor F1 (PS00237)

In [22]:
PS00237 = read_uniprot_list("PS00237.txt")
df["PS00237"] = np.where(df["Uniprot"].isin(PS00237), 1, 0)
df["PS00237"].value_counts()

0    15942
1       78
Name: PS00237, dtype: int64

### Homeobox (PS00027)

In [23]:
PS00027 = read_uniprot_list("PS00027.txt")
df["PS00027"] = np.where(df["Uniprot"].isin(PS00027), 1, 0)
df["PS00027"].value_counts()

0    15960
1       60
Name: PS00027, dtype: int64

### Zinc Finger C2H2 (PS00028)

In [24]:
PS00028 = read_uniprot_list("PS00028.txt")
df["PS00028"] = np.where(df["Uniprot"].isin(PS00028), 1, 0)
df["PS00028"].value_counts()

0    15965
1       55
Name: PS00028, dtype: int64

### EGF1 (PS00022)

In [25]:
PS00022 = read_uniprot_list("PS00022.txt")
df["PS00022"] = np.where(df["Uniprot"].isin(PS00022), 1, 0)
df["PS00022"].value_counts()

0    15976
1       44
Name: PS00022, dtype: int64

### EGF2 (PS01186)

In [26]:
PS01186 = read_uniprot_list("PS01186.txt")
df["PS01186"] = np.where(df["Uniprot"].isin(PS01186), 1, 0)
df["PS01186"].value_counts()

0    15978
1       42
Name: PS01186, dtype: int64

## Glycosylation prediction

### NetOglyc

In [27]:
#### TO DO ####
# oglyc = pd.read_csv(data_path + "Features/prediction_results.txt")

### NetNglyc

In [28]:
def netNglyc_filter(file, file_name, out_dir=None):
    """Copy the lines starting with ``"sp|"`` from a results file into a new text file.

    All other lines of the input are discarded. The input is read completely
    before the output file is opened.

    Parameters
    ----------
    file : str
        Path of the raw netNglyc results file.
    file_name : str
        Name of the filtered file, without the ``.txt`` extension.
    out_dir : str, optional
        Directory the filtered file is written to. When omitted, the ``Features/``
        folder below the notebook-level ``data_path`` is used.

    Returns
    -------
    None
    """
    if out_dir is None:
        out_path = data_path + "Features/" + file_name + ".txt"
    else:
        out_path = os.path.join(out_dir, file_name + ".txt")

    # open results file of netNglyc predictions and keep the relevant lines;
    # the context manager closes the handle even if reading fails
    with open(file, "r") as results:
        kept_lines = [line for line in results if line.startswith("sp|")]

    # save relevant lines to new file (an existing file is overwritten)
    with open(out_path, "w") as filtered_results:
        filtered_results.writelines(kept_lines)

    return None
    
def split_netNglyc(df):
    """Split the combined first field of one netNglyc result row into separate entries.

    Meant for row-wise use, e.g. ``frame.apply(split_netNglyc, axis=1)``, so ``df``
    is a single row (a ``pandas.Series``), not a whole dataframe.

    Parameters
    ----------
    df : pandas.Series
        Row with the entries ``0``, ``"Potential"``, ``"Jury agreement"`` and
        ``"Result"``. Entry ``0`` must hold exactly three whitespace-separated
        fields: name, position and sequence.

    Returns
    -------
    pandas.Series
        New row with the entries ``"Uniprot"``, ``"Position"``, ``"Sequence"``,
        ``"Potential"``, ``"Jury agreement"`` and ``"Result"`` (in that order).
        ``"Position"`` is kept as the string taken from entry ``0``.

    Raises
    ------
    ValueError
        If entry ``0`` does not split into exactly three fields.
    """
    # retrieve information from first column
    name, pos, seq = df[0].split()

    # Build a fresh row instead of modifying the one handed in by `apply`;
    # entry 0 is simply not carried over, so no explicit drop is needed.
    return pd.Series(
        {
            # retrieve Uniprot ID from description
            "Uniprot": get_uniprot(name),
            "Position": pos,
            "Sequence": seq,
            "Potential": df["Potential"],
            "Jury agreement": df["Jury agreement"],
            "Result": df["Result"],
        },
        name=df.name,
    )

In [29]:
# # filter netNglyc results file
# netNglyc_filter(data_path + "Features/NetNglyc_results_human_proteome.out", "NetNglyc_results_human_proteome_filtered")

In [30]:
# # # create clean dataframe of glycosylation prediction results
# netnglyc = pd.read_csv(data_path + "Features/NetNglyc_results_human_proteome_filtered.txt", sep="\t", header=None) 
# netnglyc.dropna(axis=1, how="all", inplace=True)
# netnglyc.columns = [0, "Potential", "Jury agreement", "Result"]
# netnglyc = netnglyc.apply(split_netNglyc, axis=1)
# netnglyc

In [31]:
# save or load dataframe
# netnglyc.to_csv(data_path + "Features/NetNglyc.csv", index=False)
netnglyc = pd.read_csv(data_path + "Features/NetNglyc.csv") 

In [32]:
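# keep only sites whose "Result" contains "+++" ("-" means predicted negative site); NOTE(review): weaker positive calls such as "+" or "++", if present in the file, are excluded as well — confirm this is intended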
netnglyc_pos = netnglyc[netnglyc["Result"].str.contains("+++", regex=False)]
netnglyc_pos

Unnamed: 0,Uniprot,Position,Sequence,Potential,Jury agreement,Result
35,Q9ULX7,213,NGSL,0.8029,(9/9),+++
42,Q01518,358,NTTL,0.7677,(9/9),+++
72,Q16787,142,NLTL,0.7580,(9/9),+++
92,P55268,248,NLTR,0.7910,(9/9),+++
102,Q9BYZ2,39,NGTW,0.7504,(9/9),+++
...,...,...,...,...,...,...
23163,P10912,156,NWTL,0.7660,(9/9),+++
23173,Q9NZ52,18,NPSN,0.7531,(9/9),+++
23176,Q9P109,286,NISK,0.7547,(9/9),+++
23199,Q9UGJ1,292,NLTR,0.7553,(9/9),+++


In [33]:
# Count the retained sites per protein and turn the counts into a two-column table.
glyc_sites = (
    netnglyc_pos["Uniprot"]
    .value_counts()
    .to_frame()
    .reset_index()
    .set_axis(["Uniprot", "Glycosylation"], axis="columns")
)
glyc_sites

Unnamed: 0,Uniprot,Glycosylation
0,P15144,5
1,A8MTY0,4
2,Q8N0Z9,4
3,Q86XK7,4
4,P24821,4
...,...,...
1131,P36897,1
1132,O95049,1
1133,P51786,1
1134,Q9UPU5,1


In [34]:
# # add glycosylation as a binary feature
# df["Glycosylation"] = np.where(df["Uniprot"].isin(glyc_sites["Uniprot"]), 1, 0)

# add number of glycosylation sites as a feature
df = df.merge(glyc_sites, on="Uniprot", how="left")
# proteins without a match in glyc_sites get a count of 0; only this column is
# filled so that missing values in other feature columns are not turned into 0
df["Glycosylation"] = df["Glycosylation"].fillna(0)
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,Golgi_apparatus,Peroxisome,Plastid,PS00232,PS00237,PS00027,PS00028,PS00022,PS01186,Glycosylation
0,P01023,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,1474,163289.0320,0.063094,0.016961,0.035957,0.070556,0.042062,0.061737,...,0,0,0,0,0,0,0,0,0,2.0
1,A8K2U0,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,1454,161104.6038,0.062586,0.017194,0.045392,0.055708,0.046080,0.066713,...,0,0,0,0,0,0,0,0,0,0.0
2,Q9NPC4,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...,353,40498.6979,0.065156,0.022663,0.033994,0.048159,0.059490,0.056657,...,1,0,0,0,0,0,0,0,0,0.0
3,Q9NRG9,MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...,546,59573.4321,0.069597,0.027473,0.032967,0.047619,0.038462,0.082418,...,0,0,0,0,0,0,0,0,0,0.0
4,Q86V21,MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...,672,75143.3063,0.066964,0.014881,0.043155,0.077381,0.041667,0.078869,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16015,Q2QGD7,MDLPALLPAPTARGGQHGGGPGPLRRAPAPLGASPARRRLLLVRGP...,858,89987.2867,0.097902,0.029138,0.040793,0.047786,0.033800,0.102564,...,0,0,0,0,0,0,0,0,0,0.0
16016,Q9C0D3,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,744,83920.0626,0.075269,0.025538,0.048387,0.061828,0.043011,0.038978,...,0,0,0,0,0,0,0,0,0,0.0
16017,Q15942,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,572,61276.6742,0.089161,0.036713,0.036713,0.059441,0.036713,0.064685,...,0,0,0,0,0,0,0,0,0,0.0
16018,O43149,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,2961,331071.6366,0.076663,0.029044,0.055724,0.073961,0.038501,0.053023,...,0,0,0,0,0,0,0,0,0,1.0


### GlycoMine

In [35]:
# datasets and thresholds taken from https://glycomine.erc.monash.edu/Lab/GlycoMine/
glycomine_n = pd.read_csv(data_path + "Features/GlycoMine_N_results.txt", sep=" ") 
glycomine_n_pos = glycomine_n[glycomine_n["Value"] > 0.5]

glycomine_o = pd.read_csv(data_path + "Features/GlycoMine_O_results.zip", sep=" ") # text file too big for GitHub
glycomine_o_pos = glycomine_o[glycomine_o["Value"] > 0.502]

glycomine_c = pd.read_csv(data_path + "Features/GlycoMine_C_results.txt", sep=" ") 
glycomine_c_pos = glycomine_c[glycomine_c["Value"] > 0.555]

In [36]:
df["GlycoMine_N"] = np.where(df["Uniprot"].isin(set(glycomine_n_pos["UniProtID"])), 1, 0)
df["GlycoMine_O"] = np.where(df["Uniprot"].isin(set(glycomine_o_pos["UniProtID"])), 1, 0)
df["GlycoMine_C"] = np.where(df["Uniprot"].isin(set(glycomine_c_pos["UniProtID"])), 1, 0)

## GPI-Anchor prediction (NetGPI)

In [37]:
# NetGPI results had to be split up as webserver only allows 5000 sequences at once;
# all four parts are tab-separated and read with the same replacement column names.
netgpi_columns = ["Uniprot", "Length", "Result", "Omega-site", "Likelihood", "Amino acid"]
netgpi_A, netgpi_B, netgpi_C, netgpi_D = (
    pd.read_csv(
        data_path + f"Features/NetGPI_results_brain_detected_{part}.txt",
        sep="\t",
        header=1,
        names=netgpi_columns,
    )
    for part in ("A", "B", "C", "D")
)

In [38]:
# combine results into one dataframe
netgpi = pd.concat([netgpi_A, netgpi_B, netgpi_C, netgpi_D], axis=0)
netgpi["Uniprot"] = netgpi["Uniprot"].apply(get_uniprot)

In [39]:
# keep only proteins predicted to have GPI anchor
netgpi_pos = netgpi[netgpi["Result"] == "GPI-Anchored"]
netgpi_pos

Unnamed: 0,Uniprot,Length,Result,Omega-site,Likelihood,Amino acid
2,P40199,344,GPI-Anchored,320,0.462,G
45,O95170,752,GPI-Anchored,731,0.591,G
135,P55290,713,GPI-Anchored,690,0.572,N
225,P02708,457,GPI-Anchored,414,0.403,N
299,Q14CN2,919,GPI-Anchored,892,0.664,N
...,...,...,...,...,...,...
4473,P61165,79,GPI-Anchored,14,0.253,N
4488,Q9NRX6,74,GPI-Anchored,48,0.428,A
4503,Q9Y6X1,66,GPI-Anchored,30,0.303,N
4518,A0A1B0GW54,56,GPI-Anchored,29,0.358,G


In [40]:
df["GPI-anchor"] = np.where(df["Uniprot"].isin(netgpi_pos["Uniprot"]), 1, 0)

## Signal peptide

In [41]:
signalp = pd.read_csv(data_path + "Features/SignalP_results_human_proteome.txt", sep="\t", index_col=False, header=None, 
    skiprows=2, names=["Uniprot", "Prediction", "Likelihood-Other", "Likelihood-SP", "CS Position"])

# retrieve Uniprot ID
signalp["Uniprot"] = signalp["Uniprot"].apply(get_uniprot)
signalp_pos = signalp[signalp["Prediction"] == "SP"]
signalp_pos

Unnamed: 0,Uniprot,Prediction,Likelihood-Other,Likelihood-SP,CS Position
4,P22223,SP,0.000224,0.999762,CS pos: 24-25. Pr: 0.7575
5,Q9BXJ4,SP,0.000201,0.999764,CS pos: 22-23. Pr: 0.9800
6,P09871,SP,0.000226,0.999694,CS pos: 15-16. Pr: 0.9805
7,Q9ULX7,SP,0.000220,0.999737,CS pos: 18-19. Pr: 0.9241
26,Q16787,SP,0.002463,0.997510,CS pos: 36-37. Pr: 0.7144
...,...,...,...,...,...
20348,Q9HBH1,SP,0.190636,0.809347,CS pos: 23-24. Pr: 0.5940
20358,P98172,SP,0.000280,0.999700,CS pos: 27-28. Pr: 0.9724
20363,P34910,SP,0.260518,0.739443,CS pos: 21-22. Pr: 0.4787
20365,Q96PL5,SP,0.048187,0.951777,CS pos: 29-30. Pr: 0.8838


In [42]:
df["Signal peptide"] = np.where(df["Uniprot"].isin(signalp_pos["Uniprot"]), 1, 0)

## Nucleotide-binding proteins

In [43]:
# downloaded from http://biomine.cs.vcu.edu/servers/DRNApred/
with open(data_path + "Features/DRNApred_RNA_Uniprot_human_proteome.txt") as f:  
    RNA_bind = f.read().splitlines() 
with open(data_path + "Features/DRNApred_DNA_Uniprot_human_proteome.txt") as f:  
    DNA_bind = f.read().splitlines() 

In [44]:
df["RNA_binding"] = np.where(df["Uniprot"].isin(RNA_bind), 1, 0)
df["DNA_binding"] = np.where(df["Uniprot"].isin(DNA_bind), 1, 0)

## Protein-binding prediction

In [45]:
def parse_SCRIBER_results(lines):
    """Extract identifiers and protein-binding fractions from SCRIBER result lines.

    The input is processed in records of four consecutive lines. In each record
    the first line is the header (``">"`` followed by the identifier) and the
    third line is a string of ``0``/``1`` flags (presumably one flag per residue —
    confirm against the SCRIBER output format). The second and fourth line of a
    record are not used.

    Parameters
    ----------
    lines : list of str
        Lines of the results file, e.g. from ``file.readlines()``.

    Returns
    -------
    uniprots : list of str
        Identifier of each record (header without ``">"`` and surrounding whitespace).
    preds : list of float
        Fraction of ``"1"`` characters in the flag line of each record.

    Raises
    ------
    ValueError
        If a record does not start with a ``">"`` header, ends before its flag
        line, or has a flag line that is empty or contains characters other
        than ``0`` and ``1``.
    """
    uniprots = []
    preds = []

    for start in range(0, len(lines), 4):
        # check that the record starts with a header line
        header = lines[start]
        if not header.startswith(">"):
            raise ValueError(
                "line {}: expected a header starting with '>', got {!r}".format(start + 1, header)
            )
        if start + 2 >= len(lines):
            raise ValueError(
                "record starting at line {} ends before its prediction line".format(start + 1)
            )

        # get protein binding predictions and check that they are a non-empty
        # binary string (an empty line would otherwise cause a division by zero)
        prot_bind = lines[start + 2].strip()
        if not prot_bind or any(flag not in "01" for flag in prot_bind):
            raise ValueError(
                "line {}: expected a non-empty string of 0/1, got {!r}".format(start + 3, prot_bind)
            )

        uniprots.append(header[1:].strip())
        # fraction of positions flagged as protein-binding
        preds.append(prot_bind.count("1") / len(prot_bind))

    return uniprots, preds

In [46]:
# downloaded from http://biomine.cs.vcu.edu/servers/SCRIBER/
with open(data_path + "Features/SCRIBER_results_human_proteome.txt") as f:  
    lines = f.readlines()

uniprots, preds = parse_SCRIBER_results(lines)

In [47]:
prot_bind_df = pd.DataFrame({"Uniprot":uniprots, "Prot_bind":preds})
prot_bind_df.sort_values("Prot_bind")

Unnamed: 0,Uniprot,Prot_bind
16249,P17039,0.0
4405,A0A0B4J234,0.0
4403,P48357,0.0
16769,Q9Y2I7,0.0
13019,A2RUB1,0.0
...,...,...
3030,Q9BYQ5,1.0
16219,Q8N4H5,1.0
7416,Q5VUM1,1.0
7410,Q5H9J7,1.0


In [48]:
# Left merge keeps every protein; proteins without a SCRIBER entry get NaN in
# "Prot_bind". Only that column is filled with 0, so that missing values in other
# feature columns are not silently converted to 0.
df = df.merge(prot_bind_df, on="Uniprot", how="left")
df["Prot_bind"] = df["Prot_bind"].fillna(0)
# 1 where the protein-binding fraction is non-zero, else 0
# (proteins without a SCRIBER entry therefore get 0)
df["Prot_bind_binary"] = np.where(df["Prot_bind"] == 0, 0, 1)

## Ectodomain

In [49]:
# add ectodomain annotation as a feature
# (data_path already ends with "/", so the sub-path must not start with another one)
shed = pd.read_csv(data_path + "Features/Shedding_proteins_UniProt.txt", header=None, names=["Uniprot"])
df["Ectodomain_shedding"] = np.where(df["Uniprot"].isin(shed["Uniprot"]), 1, 0)

# additionally flag proteins listed in the DeepSMP file; existing flags are kept
shed_DeepSMP = pd.read_csv(data_path + "Features/Known_shedding_proteins_DeepSMP.txt", sep="\t")
df["Ectodomain_shedding"] = np.where(df["Uniprot"].isin(shed_DeepSMP["Uniprot ID"]), 1, df["Ectodomain_shedding"])

## CSF presence (Label)

In [50]:
df["CSF"] = np.where(df["Uniprot"].isin(csf["Uniprot"]), 1, -1)
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,GlycoMine_O,GlycoMine_C,GPI-anchor,Signal peptide,RNA_binding,DNA_binding,Prot_bind,Prot_bind_binary,Ectodomain_shedding,CSF
0,P01023,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,1474,163289.0320,0.063094,0.016961,0.035957,0.070556,0.042062,0.061737,...,1,0,0,1,0,0,0.000000,0,1,1
1,A8K2U0,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,1454,161104.6038,0.062586,0.017194,0.045392,0.055708,0.046080,0.066713,...,0,0,0,1,0,0,0.000000,0,1,-1
2,Q9NPC4,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...,353,40498.6979,0.065156,0.022663,0.033994,0.048159,0.059490,0.056657,...,0,0,0,0,0,0,0.031161,1,0,1
3,Q9NRG9,MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...,546,59573.4321,0.069597,0.027473,0.032967,0.047619,0.038462,0.082418,...,1,0,0,0,0,0,0.027473,1,0,-1
4,Q86V21,MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...,672,75143.3063,0.066964,0.014881,0.043155,0.077381,0.041667,0.078869,...,0,0,0,0,0,0,0.004464,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16015,Q2QGD7,MDLPALLPAPTARGGQHGGGPGPLRRAPAPLGASPARRRLLLVRGP...,858,89987.2867,0.097902,0.029138,0.040793,0.047786,0.033800,0.102564,...,1,0,0,0,0,1,0.009324,1,0,-1
16016,Q9C0D3,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,744,83920.0626,0.075269,0.025538,0.048387,0.061828,0.043011,0.038978,...,0,0,0,0,0,0,0.000000,0,0,1
16017,Q15942,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,572,61276.6742,0.089161,0.036713,0.036713,0.059441,0.036713,0.064685,...,1,0,0,0,0,0,0.295455,1,0,1
16018,O43149,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,2961,331071.6366,0.076663,0.029044,0.055724,0.073961,0.038501,0.053023,...,0,0,0,0,0,0,0.000000,0,0,1


In [51]:
df["CSF"].value_counts()

-1    10944
 1     5076
Name: CSF, dtype: int64

In [52]:
df_pos = df[df["CSF"] == 1]
df_neg = df[df["CSF"] == -1]

# Save feature dataframes

## Brain detected

In [53]:
# save entire feature data set
df.to_csv(data_path + "Features/df_features_brain_detected.csv", index=False)
print("Number of proteins:", len(df))

Number of proteins: 16020


## Brain elevated

In [54]:
# filter for brain elevated proteins (data set to be used for model training)
df_brain_elevated = df[df["Uniprot"].isin(brain_elevated["Uniprot"])]
df_brain_elevated.to_csv(data_path + "Features/df_features_brain_elevated.csv", index=False)
print("Number of proteins:", len(df_brain_elevated))

Number of proteins: 2542


## Brain detected without training & testing data

In [55]:
# filter for brain detected proteins not found in brain elevated set (to be used for biomarker discovery)
brain_detected_val = set(df["Uniprot"]) - set(df_brain_elevated["Uniprot"])
df_brain_detected_val = df[df["Uniprot"].isin(brain_detected_val)]
df_brain_detected_val.to_csv(data_path + "Features/df_features_brain_detected_val.csv", index=False)
print("Number of proteins:", len(df_brain_detected_val))

Number of proteins: 13478
