# Import libraries and data

In [2]:
# import libraries 
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import re
import seaborn as sns

from matplotlib_venn import venn2
from scipy.stats import mannwhitneyu
from sklearn.metrics import ConfusionMatrixDisplay

import warnings

# SettingWithCopyWarning is exposed as pandas.errors.SettingWithCopyWarning on
# newer pandas releases and as pandas.core.common.SettingWithCopyWarning on older
# ones; try both so this cell imports on either.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # this pandas version exposes the warning at neither location: nothing to silence
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [3]:
# input data sets and generated figures live in sub-folders of the working directory
data_path, fig_path = (os.getcwd() + sub_dir for sub_dir in ("/Datasets/", "/Figures/"))

## Brain proteome (HPA)

In [4]:
def get_uniprot(string):
    """Extract the middle field of a three-field protein identifier.

    The identifier is first split on "|" (e.g. "sp|P05067|A4_HUMAN"). If that does
    not yield exactly three fields, it is split on its first two underscores instead.

    Returns the middle field. Raises ValueError if neither split yields three fields.
    """
    try:
        _, uniprot, _ = string.split("|")
    except ValueError:
        # not a three-field pipe-separated ID: fall back to "<prefix>_<ID>_<rest>".
        # Only the unpacking error is caught, so unrelated errors are not masked.
        _, uniprot, _ = string.split("_", maxsplit=2)
    return uniprot

def keep_first_uniprot(string):
    """Return the first entry of a comma-separated ID string.

    A string without a comma is returned unchanged.
    """
    return string.split(",")[0] if "," in string else string

def get_brain_expression(string):
    """Return the brain expression value from a tissue expression string.

    `string` is either a single "<label> <value>" entry or several such entries
    separated by ";". With several entries, the last one containing "brain" is used.

    Returns the value as a float.
    Raises ValueError if several tissues are listed but none mentions "brain", or
    if the selected entry is not of the form "<label> <number>".
    """
    # check if expression for multiple tissues is provided
    if ";" in string:
        # keep only information on brain expression
        brain_entries = [t for t in string.split(";") if "brain" in t]
        if not brain_entries:
            # previously this case surfaced as an UnboundLocalError further down
            raise ValueError("No brain expression found in: %r" % string)
        brain_string = brain_entries[-1]
    else:
        brain_string = string

    # extract expression value from string; strip() tolerates a space after ";"
    _, exp = brain_string.strip().split(" ")
    return float(exp)

def get_brain_expression_detected(string):
    """Return the brain expression value from a tissue expression string, if any.

    `string` holds one or more "<label> <value>" entries separated by ";". The last
    entry containing "brain" is used.

    Returns the value as a float, or None if `string` is not a str (e.g. a missing
    value) or if no entry mentions "brain".
    Raises ValueError if the brain entry is not of the form "<label> <number>".
    """
    # return None if no information is available
    if not isinstance(string, str):
        return None

    # scan all tissues; the previous version returned None as soon as it met any
    # non-brain tissue, so multi-tissue strings never produced a value
    brain_string = None
    for t in string.split(";"):
        # keep only information on brain expression
        if "brain" in t:
            brain_string = t

    if brain_string is None:
        return None

    # extract expression value from string; strip() tolerates a space after ";"
    _, exp = brain_string.strip().split(" ")
    return float(exp)

In [5]:
# brain_not_detected = pd.read_csv(data_path + "Brain/HPA_brain_not_detected_version21.tsv", sep="\t", low_memory=False)
# brain_not_detected.dropna(subset=["Uniprot"], inplace=True)
# brain_not_detected.drop_duplicates(subset=["Uniprot"], inplace=True)
# brain_not_detected["Uniprot"] = brain_not_detected["Uniprot"].apply(keep_first_uniprot)
# brain_not_detected

In [6]:
# table read from Brain/Brain_detected.csv
brain_detected_file = data_path + "Brain/Brain_detected.csv"
brain_detected = pd.read_csv(brain_detected_file)

In [7]:
# table read from Brain/Brain_elevated.csv
brain_elevated_file = data_path + "Brain/Brain_elevated.csv"
brain_elevated = pd.read_csv(brain_elevated_file)

## Feature data sets

In [8]:
# all feature tables are CSV files in the Features folder
features_dir = data_path + "/Features/"

# all human proteins
df = pd.read_csv(features_dir + "df_features.csv")
# all proteins detected in brain
df_detected = pd.read_csv(features_dir + "df_features_brain_detected.csv")
# all proteins detected in brain that were not part of the training and testing (i.e. brain elevated) set
df_val = pd.read_csv(features_dir + "df_features_brain_detected_val.csv")
# all proteins elevated in brain
df_elevated = pd.read_csv(features_dir + "df_features_brain_elevated.csv")

## CSF

In [9]:
# table read from CSF/csf.csv
csf_file = data_path + "CSF/csf.csv"
csf = pd.read_csv(csf_file)

## Marker proteins

### BIONDA

In [10]:
# both BIONDA exports are semicolon-separated files in the Biomarker_discovery folder
bionda_dir = data_path + "/Biomarker_discovery/"
dementia_markers_BIONDA = pd.read_csv(bionda_dir + "BIONDA_dementia_markers.csv", sep=";")
# neurodegenerative disease
ND_markers = pd.read_csv(bionda_dir + "BIONDA_ND_markers.csv", sep=";")

### DisGeNet

In [11]:
# both DisGeNet exports are tab-separated files in the Biomarker_discovery folder
disgenet_dir = data_path + "/Biomarker_discovery/"
AD_markers = pd.read_csv(disgenet_dir + "DisGeNet_AD_markers.tsv", sep="\t")
dementia_markers = pd.read_csv(disgenet_dir + "DisGeNet_dementia_markers.tsv", sep="\t")

### Bai et al. (2020)

Deep Multilayer Brain Proteomics Identifies Molecular Networks in Alzheimer’s Disease Progression: https://doi.org/10.1016/j.neuron.2019.12.015

In [12]:
def _read_bai2020_list(file_name):
    """Read a one-column Bai et al. (2020) protein list and reduce each ID with get_uniprot()."""
    proteins = pd.read_csv(data_path + "/Biomarker_discovery/" + file_name, header=None, names=["Uniprot"])
    proteins["Uniprot"] = proteins["Uniprot"].apply(get_uniprot)
    return proteins


Bai2020_brain = _read_bai2020_list("Bai2020_protein_list_brain_tissue.txt")
Bai2020_brain_biomarkers = _read_bai2020_list("Bai2020_protein_list_brain_tissue_biomarker_candidates.txt")
Bai2020_CSF = _read_bai2020_list("Bai2020_protein_list_CSF.txt")
Bai2020_CSF_biomarkers = _read_bai2020_list("Bai2020_protein_list_CSF_biomarker_candidates.txt")

### Higginbotham et al. (2020)

Integrated proteomics reveals brain-based cerebrospinal fluid biomarkers in asymptomatic and symptomatic Alzheimer’s disease: https://doi.org/10.1126/sciadv.aaz9360

In [13]:
def get_uniprot_alt(string):
    """Return the part after "|" of a two-field ID, without a "-<suffix>" if present.

    Raises ValueError if `string` does not contain exactly one "|" or if the
    second field contains more than one "-".
    """
    _, accession = string.split("|")

    if "-" not in accession:
        return accession

    # drop whatever follows the hyphen (e.g. "P05067-4" -> "P05067")
    base_accession, _ = accession.split("-")
    return base_accession

In [14]:
# one-column protein lists without a header row
higginbotham_dir = data_path + "/Biomarker_discovery/"
Higginbotham2020_brain = pd.read_csv(higginbotham_dir + "Higginbotham2020_protein_list_brain.txt",
    header=None, names=["Uniprot"])
Higginbotham2020_CSF = pd.read_csv(higginbotham_dir + "Higginbotham2020_protein_list_CSF.txt",
    header=None, names=["Uniprot"])

In [15]:
# reduce every raw identifier in both lists with get_uniprot_alt()
for protein_list in (Higginbotham2020_brain, Higginbotham2020_CSF):
    protein_list["Uniprot"] = protein_list["Uniprot"].apply(get_uniprot_alt)

## Model and scaler

In [16]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# NOTE(review): unpickling can execute arbitrary code; only load model files from a trusted source
models_dir = os.getcwd() + "/Models/"
lr_l2 = _load_pickle(models_dir + "LogisticClassifier_L2.pkl")
lr_l2_2plus = _load_pickle(models_dir + "LogisticClassifier_L2_2plus.pkl")
lr_l2_3plus = _load_pickle(models_dir + "LogisticClassifier_L2_3plus.pkl")

In [17]:
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source
scaler_file = os.getcwd() + "/Models/Standard_scaler.pkl"
with open(scaler_file, "rb") as f:
    scaler = pickle.load(f)

# Prediction

In [18]:
# define continuous variables (order kept exactly as in the original list)
cont = [
    'Length', 'Molecular weight',
    # one column per one-letter amino acid code
    *"ACDEFGHIKLMNPQRSTVWY",
    'Isoelectric point', 'Instability index',
    'Polar', 'Neutral', 'Hydrophobic',
    'Volume_small', 'Volume_medium', 'Volume_large',
    'Polarity_low', 'Polarity_medium', 'Polarity_large',
    'Polarizability_low', 'Polarizability_medium', 'Polarizability_large',
    'Charge_positive', 'Charge_neutral', 'Charge_negative',
    'Buried', 'Exposed', 'Intermediate',
    'Disorder_NSP', 'Helix_NSP', 'Turn_NSP', 'Sheet_NSP',
    'Solubility', 'ExpAA', 'First60ExpAA', 'PredHel', 'Glycosylation', 'Prot_bind',
]

In [19]:
# define variables and target
X = df.drop(["Uniprot", "Sequence", "CSF"], axis=1)
y = df["CSF"]

# fail early, and name the columns, if the feature table lacks any continuous
# feature listed in `cont` (a recorded run of this cell stopped on 'Prot_bind')
missing_cont = [col for col in cont if col not in X.columns]
if missing_cont:
    raise KeyError("Continuous features listed in `cont` are missing from `df`: %s" % missing_cont)

# preprocess data with same scaler as training data
X_scal = X.copy()
X_scal[cont] = scaler.transform(X_scal[cont])

# lr_l2.classes_ # -1 and 1

KeyError: "['Prot_bind'] not in index"

In [None]:
# calculate probabilities with each of the three trained models
proba, proba_2plus, proba_3plus = (
    model.predict_proba(X_scal) for model in (lr_l2, lr_l2_2plus, lr_l2_3plus)
)

# create dataframe of probabilities (first column: non-CSF, second column: CSF)
proba_columns = ["non_CSF_proba", "CSF_proba"]
preds, preds_2plus, preds_3plus = (
    pd.DataFrame(model_proba, columns=proba_columns) for model_proba in (proba, proba_2plus, proba_3plus)
)

In [None]:
# attach the predicted probabilities of all three models to a copy of the feature dataframe
df_pred = df.copy()
for suffix, model_preds in (("", preds), ("_2plus", preds_2plus), ("_3plus", preds_3plus)):
    df_pred[["non_CSF_proba" + suffix, "CSF_proba" + suffix]] = model_preds

# order by CSF probability (highest first); reset_index() then makes the new
# index the protein rank and keeps the previous row labels in an "index" column
df_pred = df_pred.sort_values(by="CSF_proba", ascending=False).reset_index()

df_pred

In [None]:
def _restrict_to(predictions, reference):
    """Keep the rows of `predictions` whose Uniprot ID also occurs in `reference`."""
    return predictions[predictions["Uniprot"].isin(reference["Uniprot"])]


df_detected_pred = _restrict_to(df_pred, df_detected)
print("Number of proteins in brain detected set:", len(df_detected_pred))
# the elevated and validation sets are taken from the brain detected predictions
df_elevated_pred = _restrict_to(df_detected_pred, df_elevated)
print("Number of proteins in brain elevated set:", len(df_elevated_pred))
df_val_pred = _restrict_to(df_detected_pred, df_val)
print("Number of proteins in brain detected validation set:", len(df_val_pred))

In [None]:
def _split_by_csf_class(frame):
    """Return (rows annotated CSF == 1, rows annotated CSF == -1) of `frame`."""
    return frame[frame["CSF"] == 1], frame[frame["CSF"] == -1]


# create subsets of datasets based on CSF class annotation
df_detected_CSF, df_detected_non_CSF = _split_by_csf_class(df_detected_pred)
df_val_CSF, df_val_non_CSF = _split_by_csf_class(df_val_pred)
df_elevated_CSF, df_elevated_non_CSF = _split_by_csf_class(df_elevated_pred)

# Examine prediction results

## Check distribution of probabilities across CSF classes

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 2, figsize=(10,4), sharey=True)
fig.subplots_adjust(wspace=0.1)

# (data, panel title, y-axis label); only the left panel labels the shared y-axis
panels = [
    (df_val_pred, "Brain detected validation dataset", "Probability scores"),
    (df_elevated_pred, "Brain elevated dataset", None),
]
for panel_ax, (panel_data, panel_title, panel_ylabel) in zip(ax, panels):
    sns.violinplot(x="CSF", y="CSF_proba", data=panel_data, palette=["firebrick", "skyblue"], ax=panel_ax)
    panel_ax.set(xlabel="CSF class", ylabel=panel_ylabel, title=panel_title)

plt.show()
fig.savefig(fig_path + "Probability_scores_brain_val_elevated.png", bbox_inches="tight")

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 2, figsize=(10,4), sharey=True)
fig.subplots_adjust(wspace=0.1)

# same layout as the previous figure, for the probabilities stored in "CSF_proba_2plus"
panels = [
    (df_val_pred, "Brain detected validation dataset", "Probability scores"),
    (df_elevated_pred, "Brain elevated dataset", None),
]
for panel_ax, (panel_data, panel_title, panel_ylabel) in zip(ax, panels):
    sns.violinplot(x="CSF", y="CSF_proba_2plus", data=panel_data, palette=["firebrick", "skyblue"], ax=panel_ax)
    panel_ax.set(xlabel="CSF class", ylabel=panel_ylabel, title=panel_title)

plt.show()

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 2, figsize=(10,4), sharey=True)
fig.subplots_adjust(wspace=0.1)

# same layout as the previous figures, for the probabilities stored in "CSF_proba_3plus"
panels = [
    (df_val_pred, "Brain detected validation dataset", "Probability scores"),
    (df_elevated_pred, "Brain elevated dataset", None),
]
for panel_ax, (panel_data, panel_title, panel_ylabel) in zip(ax, panels):
    sns.violinplot(x="CSF", y="CSF_proba_3plus", data=panel_data, palette=["firebrick", "skyblue"], ax=panel_ax)
    panel_ax.set(xlabel="CSF class", ylabel=panel_ylabel, title=panel_title)

plt.show()

## Prediction confidence across number of studies proteins were found in

In [None]:
# pair the study count of each CSF protein with the probabilities of all three models
study_counts = csf[["Uniprot", "#Studies"]]
proba_cols = ["Uniprot", "CSF_proba", "CSF_proba_2plus", "CSF_proba_3plus"]

studies_proba_val = study_counts.merge(df_val_CSF[proba_cols], on="Uniprot", how="inner")
studies_proba_elevated = study_counts.merge(df_elevated_CSF[proba_cols], on="Uniprot", how="inner")

In [None]:
# mean predicted probability per study count; numeric_only=True leaves out the
# string "Uniprot" column, which newer pandas versions refuse to average
studies_proba_elevated.groupby("#Studies").mean(numeric_only=True)

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 2, figsize=(12,5), sharey=True)
fig.subplots_adjust(wspace=0.1)

#### TO DO ####
# add significance indicators?

study_palette = ["skyblue", "steelblue", "darkcyan", "darkgreen", "mediumseagreen", "palegreen", "yellowgreen"]
# (data, panel title, y-axis label); only the left panel labels the shared y-axis
panels = [
    (studies_proba_val, "Brain detected validation dataset", "Predicted probability"),
    (studies_proba_elevated, "Brain elevated dataset", None),
]
for panel_ax, (panel_data, panel_title, panel_ylabel) in zip(ax, panels):
    sns.violinplot(x="#Studies", y="CSF_proba", data=panel_data, palette=study_palette, ax=panel_ax)
    panel_ax.set(xlabel="Minimum number of studies CSF protein was detected in", ylabel=panel_ylabel,
        title=panel_title)
    # dashed line at a probability of 0.5
    panel_ax.axhline(0.5, color="black", linestyle="--")

plt.show()
fig.savefig(fig_path + "Probability_scores_vs_study_number.png", bbox_inches="tight")

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 2, figsize=(12,5), sharey=True)
fig.subplots_adjust(wspace=0.1)

#### TO DO ####
# add significance indicators?

study_palette = ["skyblue", "steelblue", "darkcyan", "darkgreen", "mediumseagreen", "palegreen", "yellowgreen"]
# (data, panel title, y-axis label); only the left panel labels the shared y-axis
panels = [
    (studies_proba_val, "Brain detected validation dataset", "Predicted probability"),
    (studies_proba_elevated, "Brain elevated dataset", None),
]
for panel_ax, (panel_data, panel_title, panel_ylabel) in zip(ax, panels):
    sns.violinplot(x="#Studies", y="CSF_proba_2plus", data=panel_data, palette=study_palette, ax=panel_ax)
    panel_ax.set(xlabel="Minimum number of studies CSF protein was detected in", ylabel=panel_ylabel,
        title=panel_title)
    # vertical line between the first and second violin
    panel_ax.axvline(0.5)
    # dashed line at a probability of 0.5
    panel_ax.axhline(0.5, color="black", linestyle="--")

plt.show()

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 2, figsize=(12,5), sharey=True)
fig.subplots_adjust(wspace=0.1)

#### TO DO ####
# add significance indicators?

study_palette = ["skyblue", "steelblue", "darkcyan", "darkgreen", "mediumseagreen", "palegreen", "yellowgreen"]
# (data, panel title, y-axis label); only the left panel labels the shared y-axis
panels = [
    (studies_proba_val, "Brain detected validation dataset", "Predicted probability"),
    (studies_proba_elevated, "Brain elevated dataset", None),
]
for panel_ax, (panel_data, panel_title, panel_ylabel) in zip(ax, panels):
    sns.violinplot(x="#Studies", y="CSF_proba_3plus", data=panel_data, palette=study_palette, ax=panel_ax)
    panel_ax.set(xlabel="Minimum number of studies CSF protein was detected in", ylabel=panel_ylabel,
        title=panel_title)
    # vertical line between the second and third violin
    panel_ax.axvline(1.5)
    # dashed line at a probability of 0.5
    panel_ax.axhline(0.5, color="black", linestyle="--")

plt.show()

## Compare difference in brain expression levels between annotations and predictions

In [None]:
# work on an explicit copy so that adding a column below does not write to a
# slice of brain_elevated (the source of a SettingWithCopyWarning)
brain_exp = brain_elevated[["Uniprot", "Brain expression"]].copy()
brain_exp["log2(Brain expression)"] = np.log2(brain_exp["Brain expression"])
# add the CSF annotation and the predicted probability of each protein
brain_exp_CSF = brain_exp.merge(df_detected_pred[["Uniprot", "CSF", "CSF_proba"]], on="Uniprot", how="inner")
# binarise the prediction with the same labels as the annotation: 1 above a probability of 0.5, else -1
brain_exp_CSF["CSF_proba_binary"] = np.where(brain_exp_CSF["CSF_proba"] > 0.5, 1, -1)

In [None]:
def _expression_p_value(group_a, group_b):
    """Mann-Whitney U p-value for the "Brain expression" of two protein groups."""
    return mannwhitneyu(group_a["Brain expression"], group_b["Brain expression"]).pvalue


# annotations
is_annotated_CSF = brain_exp_CSF["CSF"] == 1
is_annotated_non_CSF = brain_exp_CSF["CSF"] == -1
CSF_annot, non_CSF_annot = brain_exp_CSF[is_annotated_CSF], brain_exp_CSF[is_annotated_non_CSF]

p_annot = _expression_p_value(CSF_annot, non_CSF_annot)
print("P-value for annotations:", p_annot)

# predictions
is_predicted_CSF = brain_exp_CSF["CSF_proba_binary"] == 1
is_predicted_non_CSF = brain_exp_CSF["CSF_proba_binary"] == -1
CSF_pred, non_CSF_pred = brain_exp_CSF[is_predicted_CSF], brain_exp_CSF[is_predicted_non_CSF]

p_pred = _expression_p_value(CSF_pred, non_CSF_pred)
print("P-value for predictions:", p_pred)

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 2, figsize=(10,4), sharey=True)
fig.subplots_adjust(wspace=0.1)

# the bracket starts 2 units above the highest expression value and is 1 unit tall
y, h = max(brain_exp_CSF["log2(Brain expression)"]) + 2, 1

# (class column, x-axis label, p-value written above the bracket, extra axis settings)
panels = [
    ("CSF", "Annotated CSF class", p_annot, {}),
    ("CSF_proba_binary", "Predicted CSF class", p_pred, {"ylabel": None, "ylim": (-2, 23)}),
]
for panel_ax, (class_col, class_label, p_value, axis_settings) in zip(ax, panels):
    sns.violinplot(x=class_col, y="log2(Brain expression)", data=brain_exp_CSF, palette=["firebrick", "skyblue"],
                   ax=panel_ax)
    panel_ax.set(xlabel=class_label, **axis_settings)
    # bracket spanning both violins, with the p-value centred above it
    panel_ax.plot([0, 0, 1, 1], [y, y+h, y+h, y], lw=1.5, color="k")
    panel_ax.text(0.5, y+1.2, p_value, ha="center", va="bottom", color="k")

plt.show()
fig.savefig(fig_path + "Violin_brain_expression_vs_CSF_class_pred_annot.png", bbox_inches="tight")

In [None]:
# split proteins by annotated class ("CSF") and predicted class ("CSF_proba_binary")
TN = brain_exp_CSF[(brain_exp_CSF["CSF"] == -1) & (brain_exp_CSF["CSF_proba_binary"] == -1)]
N = brain_exp_CSF[(brain_exp_CSF["CSF_proba_binary"] == -1)]
FP = brain_exp_CSF[(brain_exp_CSF["CSF"] == -1) & (brain_exp_CSF["CSF_proba_binary"] == 1)]
TP = brain_exp_CSF[(brain_exp_CSF["CSF"] == 1) & (brain_exp_CSF["CSF_proba_binary"] == 1)]
P = brain_exp_CSF[(brain_exp_CSF["CSF_proba_binary"] == 1)]
FN = brain_exp_CSF[(brain_exp_CSF["CSF"] == 1) & (brain_exp_CSF["CSF_proba_binary"] == -1)]

# group order used by the bar chart in the next cell
l = [TN, N, FP, TP, P, FN]

# the two tests used to print the same label; name the groups each one compares
_, p_neg = mannwhitneyu(TN["Brain expression"], FP["Brain expression"])
print("P-value for annotated non-CSF proteins (TN vs. FP):", p_neg)

_, p_pos = mannwhitneyu(TP["Brain expression"], FN["Brain expression"])
print("P-value for annotated CSF proteins (TP vs. FN):", p_pos)

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 1, figsize=(5, 4))

# one bar per group in `l`, labelled in the same order
group_labels = ["Annotated: non-CSF \n Predicted: non-CSF", "Predicted: non-CSF", "Annotated: non-CSF \n Predicted: CSF", 
    "Annotated: CSF \n Predicted: CSF", "Predicted: CSF", "Annotated: CSF \n Predicted: non-CSF"]
# bar height is the median log2 expression of the group
median_log2_expression = [np.median(group["log2(Brain expression)"]) for group in l]

bars = ax.bar(x=group_labels, height=median_log2_expression, color=["firebrick", "darkcyan", "skyblue"],
    edgecolor="black")
ax.bar_label(bars, fmt="%.4f")
ax.set(xlabel="", ylabel="", title=None, ylim=(0,6))
ax.tick_params(axis="x", labelrotation=90)

plt.show()
fig.savefig(fig_path + "Bar_brain_expression_vs_CSF_class_pred_annot.png", bbox_inches="tight")

In [None]:
sns.set(style=("ticks"), font_scale=1)
fig, ax = plt.subplots(1, 1, figsize=(5, 4))

# draw on the axes created above explicitly rather than on the "current" axes
sns.boxplot(y="log2(Brain expression)", x="CSF", hue="CSF_proba_binary", data=brain_exp_CSF, 
    palette=["firebrick", "skyblue"], ax=ax)
ax.legend(title="CSF prediction", loc="upper right")#, bbox_to_anchor=(1.2, 1))
ax.set(xlabel="CSF annotation", ylabel="log2(Brain expression)", 
    title="Protein expression across CSF \n annotation and prediction classes", ylim=(-1,27))

# the brackets are labelled with the Mann-Whitney U p-values computed in the
# previous cell (p_neg: TN vs. FP, p_pos: TP vs. FN) instead of hard-coded
# numbers, so the figure cannot drift from the data
y, h = max(TN["log2(Brain expression)"]) + 1, 1
ax.plot([-0.2, -0.2, 0.2, 0.2], [y, y+h, y+h, y], lw=1.5, color="k")
ax.text(0, y+1.2, "p=%.4f" % p_neg, ha="center", va="bottom", color="k")
y, h = max(FN["log2(Brain expression)"]) + 1, 1
ax.plot([0.8, 0.8, 1.2, 1.2], [y, y+h, y+h, y], lw=1.5, color="k")
ax.text(1, y+1.2, "p=%.4f" % p_pos, ha="center", va="bottom", color="k")

plt.show()
# fig.save

# Biomarkers

## Overlap of biomarker ranking with known disease markers

### Dementia  (BIONDA)

In [None]:
# keep only significant associated markers
dementia_markers_sig = dementia_markers_BIONDA[dementia_markers_BIONDA["Score"] < 0.05]
# keep only markers
df_pred_dementia_markers = df_detected_pred[df_detected_pred["Uniprot"].isin(dementia_markers_sig["MarkerID"])] 
print("Number of associated markers:", len(df_pred_dementia_markers))
# the mean of a boolean mask is the fraction of True values; it is NaN, rather
# than a ZeroDivisionError, when no marker is found. Scaled by 100 to match the label.
print("Percent predicted as CSF secreted: %.1f%%" 
    % (100 * (df_pred_dementia_markers["CSF_proba"] > 0.5).mean()))
print("Percent predicted as non-CSF secreted: %.1f%%" 
    % (100 * (df_pred_dementia_markers["CSF_proba"] < 0.5).mean()))

# position of each marker in the probability ranking (the dataframe index)
fig, ax = plt.subplots(1, 1, figsize=(30,10))
plt.plot(df_pred_dementia_markers.index, np.zeros(len(df_pred_dementia_markers.index)), "--bo")
plt.show()

### Neurodegenerative diseases (BIONDA)

In [None]:
# keep only markers with a score below 0.05
ND_markers_sig = ND_markers[ND_markers["Score"] < 0.05]
df_pred_ND_markers = df_detected_pred[df_detected_pred["Uniprot"].isin(ND_markers_sig["MarkerID"])] 
print("Number of associated markers:", len(df_pred_ND_markers))
# the mean of a boolean mask is the fraction of True values; it is NaN, rather
# than a ZeroDivisionError, when no marker is found. Scaled by 100 to match the label.
print("Percent predicted as CSF secreted: %.1f%%" 
    % (100 * (df_pred_ND_markers["CSF_proba"] > 0.5).mean()))
print("Percent predicted as non-CSF secreted: %.1f%%" 
    % (100 * (df_pred_ND_markers["CSF_proba"] < 0.5).mean()))

# position of each marker in the probability ranking (the dataframe index)
fig, ax = plt.subplots(1, 1, figsize=(30,10))
plt.plot(df_pred_ND_markers.index, np.zeros(len(df_pred_ND_markers.index)), "--bo")
plt.show()

### Alzheimer's Disease (DisGeNet)

In [None]:
# NOTE(review): strict ">" here, while the DisGeNet dementia cell below uses ">="
# for the same 0.2 cut-off; confirm which one is intended
AD_markers_sig = AD_markers[AD_markers["Score_gda"] > 0.2]
df_pred_AD_markers = df_detected_pred[df_detected_pred["Uniprot"].isin(AD_markers_sig["UniProt"])] 
print("Number of associated markers:", len(df_pred_AD_markers))
# the mean of a boolean mask is the fraction of True values; it is NaN, rather
# than a ZeroDivisionError, when no marker is found. Scaled by 100 to match the label.
print("Percent predicted as CSF secreted: %.1f%%" 
    % (100 * (df_pred_AD_markers["CSF_proba"] > 0.5).mean()))
print("Percent predicted as non-CSF secreted: %.1f%%" 
    % (100 * (df_pred_AD_markers["CSF_proba"] < 0.5).mean()))

# position of each marker in the probability ranking (the dataframe index)
fig, ax = plt.subplots(1, 1, figsize=(30,10))
plt.plot(df_pred_AD_markers.index, np.zeros(len(df_pred_AD_markers.index)), "--bo")
plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(30,10))
# predicted probability of each AD marker against its position in the ranking (the dataframe index)
ax.plot(df_pred_AD_markers.index, df_pred_AD_markers["CSF_proba"], "--bo")
# red line: probability of 0.5
ax.axhline(y=0.5, color="r", linestyle="-")
plt.show() # 0.5 threshold at rank 5943

### Dementia (DisGeNet)

In [None]:
# NOTE(review): ">=" here, while the DisGeNet AD cell above uses a strict ">"
# for the same 0.2 cut-off; confirm which one is intended
dementia_markers_sig = dementia_markers[dementia_markers["Score_gda"] >= 0.2]
df_pred_dementia_markers = df_detected_pred[df_detected_pred["Uniprot"].isin(dementia_markers_sig["UniProt"])] 
print("Number of dementia markers:", len(df_pred_dementia_markers))
# the mean of a boolean mask is the fraction of True values; it is NaN, rather
# than a ZeroDivisionError, when no marker is found. Scaled by 100 to match the label.
print("Percent predicted as CSF secreted: %.1f%%" 
    % (100 * (df_pred_dementia_markers["CSF_proba"] > 0.5).mean()))
print("Percent predicted as non-CSF secreted: %.1f%%" 
    % (100 * (df_pred_dementia_markers["CSF_proba"] < 0.5).mean()))

fig, ax = plt.subplots(1, 1, figsize=(30,10))

# predicted probability of each marker against its position in the ranking (the dataframe index)
plt.plot(df_pred_dementia_markers.index, df_pred_dementia_markers["CSF_proba"], "--bo")
ax.axhline(y=0.5, color="r", linestyle="-")
plt.show() # 0.5 threshold at rank 5943

## Probability scores of known & potential AD CSF biomarkers

Used literature:
- Olsson et al. (2016) CSF and blood biomarkers for the diagnosis of Alzheimer's disease: a systematic review and meta-analysis. https://doi.org/10.1016/S1474-4422(16)00070-3
- Molinuevo et al. (2018) Current state of Alzheimer’s fluid biomarkers. https://doi.org/10.1007/s00401-018-1932-x



In [None]:
# UniProt accessions of the known and potential AD CSF biomarkers.
# A comma was missing after "P13500": Python silently joined it with the next
# literal into "P13500P05413", so both proteins were left out.
biomarkers_uniprots = ["P05067", "P10636", "Q92686", "Q9NZC2", "P07196", "P14136", "P36222", "P09104", "P62760", "P13500",
                      "P05413", "P56817", "P02778", "P60880", "P21579", "Q13148", "P37840"]

fig, ax = plt.subplots(1, 1, figsize=(5,3))

# predicted probabilities of the listed biomarkers that occur in the brain detected predictions
is_biomarker = df_detected_pred["Uniprot"].isin(biomarkers_uniprots)
biomarker_proba = df_detected_pred[is_biomarker]["CSF_proba"]

sns.swarmplot(y=biomarker_proba, dodge=True)
ax.set(title="Predicted probability of known and potential AD CSF biomarkers", ylabel="Predicted probability")

plt.show()
fig.savefig(fig_path + "Probability_AD_CSF_biomarkers.png", bbox_inches="tight")

In [None]:
# display name -> UniProt accession, in the order the scores are printed
biomarker_names = {
    "APP": "P05067",
    "MAPT": "P10636",
    "Neurogranin": "Q92686",
    "TREM2": "Q9NZC2",
    "NFL": "P07196",
    "GFAP": "P14136",
    "YKL-40": "P36222",
    "NSE": "P09104",
    "VLP-1": "P62760",
    "MCP-1": "P13500",
    "HFABP": "P05413",
    "BACE1": "P56817",
    "IP-10": "P02778",
    "SNAP25": "P60880",
    "Synaptotagmin": "P21579",
    "TDP-43": "Q13148",  # No studies on CSF according to reference
    "Alpha-synuclein": "P37840",
}

print("Probability score for")
for name, uniprot in biomarker_names.items():
    scores = df_detected_pred.loc[df_detected_pred["Uniprot"] == uniprot, "CSF_proba"]
    if scores.empty:
        # report a missing protein instead of stopping the whole cell with an IndexError
        print("\t%s: not in brain detected set" % name)
    else:
        print("\t%s: %.04f" % (name, scores.values[0]))

### Candidates from MIRIADE consortium

In [None]:
print("Rank of probability for")

# (display name, dataframe searched, UniProt accession); the printed rank is the
# index label of the matching row.
# NOTE(review): CRH is looked up in df_pred, all other candidates in df_detected_pred; confirm this is intended
miriade_candidates = [
    ("CRH", df_pred, "P06850"),
    ("MMP1", df_detected_pred, "P03956"),
    ("GBA", df_detected_pred, "P04062"),
    ("PEBP1", df_detected_pred, "P30086"),
    ("SPON1", df_detected_pred, "Q9HCB6"),
    ("TNFSF13", df_detected_pred, "O75888"),
    ("VAMP2", df_detected_pred, "P63027"),
    ("GluR4", df_detected_pred, "P48058"),
    ("NPTX2", df_detected_pred, "P47972"),
    ("GAP43", df_detected_pred, "P17677"),
]
for candidate_name, candidate_frame, candidate_uniprot in miriade_candidates:
    print("\t%s:" % candidate_name, candidate_frame[candidate_frame["Uniprot"] == candidate_uniprot].index[0])

# Investigate false positives

These proteins were predicted to be secreted to the CSF but are annotated not to be. One explanation would be that these proteins are secreted to CSF but in such low concentrations that it is not (easily) possible to detect them in exploratory mass spectrometry studies.

In [None]:
# check brain elevated proteins that have not been found in CSF but are predicted to be CSF secreted
df_elevated_FP = df_elevated_non_CSF[df_elevated_non_CSF["CSF_proba"] > 0.5]
# these are false positives; the message previously called them false negatives
print("Number of false positive proteins in brain elevated dataset:", len(df_elevated_FP))

In [None]:
brain_exp_FP = brain_exp.merge(df_elevated_FP[["Uniprot", "CSF_proba"]], on="Uniprot", how="inner")
# stored under its own name: this frame used to overwrite brain_exp_CSF, which
# an earlier cell defines with different columns ("CSF", "CSF_proba_binary")
brain_exp_elevated_CSF = brain_exp.merge(df_elevated_CSF[["Uniprot", "CSF_proba"]], on="Uniprot", how="inner")

# the values are medians of log2(Brain expression), so the labels say so
print("Median brain expression of false positive proteins in brain elevated dataset: %.04f"
    % np.median(brain_exp_FP["log2(Brain expression)"]))
print("Median brain expression of CSF proteins in brain elevated dataset: %.04f" 
    % np.median(brain_exp_elevated_CSF["log2(Brain expression)"]))

In [None]:
# copy before adding a column: both frames are row selections of other
# dataframes, and writing to such a selection triggers a SettingWithCopyWarning
df_elevated_FP = df_elevated_FP.copy()
df_elevated_CSF = df_elevated_CSF.copy()
df_elevated_FP["log2(Molecular weight)"] = np.log2(df_elevated_FP["Molecular weight"])
df_elevated_CSF["log2(Molecular weight)"] = np.log2(df_elevated_CSF["Molecular weight"])

# the values are medians of log2(Molecular weight), so the labels say so
print("Median false positive protein size in brain elevated dataset: %.04f"
    % np.median(df_elevated_FP["log2(Molecular weight)"]))
print("Median CSF protein size in brain elevated dataset: %.04f" 
    % np.median(df_elevated_CSF["log2(Molecular weight)"]))

## Check false positively predicted proteins for potential biomarkers

### Low brain expression

In [None]:
# how many proteins are confidently predicted to be in CSF and have a lower than average brain expression?
is_confident = brain_exp_FP["CSF_proba"] > 0.75
# NOTE(review): 5.5329 looks like a value printed by an earlier cell; confirm and derive it from the data
is_lowly_expressed = brain_exp_FP["log2(Brain expression)"] < 5.5329

low_exp_biomarker_candidates = brain_exp_FP[is_confident & is_lowly_expressed]
print("Number of lowly expressed biomarker candidates:", len(low_exp_biomarker_candidates))

In [None]:
# write one Uniprot ID per line
low_exp_file = data_path + "Biomarker_discovery/Biomarker_candidates_low_exp.txt"
with open(low_exp_file, "w") as f:
    f.writelines("%s\n" % item for item in low_exp_biomarker_candidates["Uniprot"])

In [None]:
# set which points to colour based on brain expression and probability cut-off
col = np.where((brain_exp_FP["CSF_proba"] > 0.75) & (brain_exp_FP["log2(Brain expression)"] < 5.5329), "lightseagreen", "grey")

fig, ax = plt.subplots(1, 1, figsize=(5,5))

ax.scatter(x="log2(Brain expression)", y="CSF_proba", data=brain_exp_FP, color=col)
# label the axes so the saved figure can be read on its own
ax.set(xlabel="log2(Brain expression)", ylabel="Predicted probability")

plt.show()
fig.savefig(fig_path + "Scatter_false_positives_brain_expression_probability.png", bbox_inches="tight")

### Low molecular weight

In [None]:
# how many proteins are confidently predicted to be in CSF and have a low molecular weight?
is_confident = df_elevated_FP["CSF_proba"] > 0.75
# NOTE(review): 16.0460 looks like a value printed by an earlier cell; confirm and derive it from the data
is_low_weight = df_elevated_FP["log2(Molecular weight)"] < 16.0460

low_mw_biomarker_candidates = df_elevated_FP[is_confident & is_low_weight]
print("Number of biomarker candidates:", len(low_mw_biomarker_candidates))

In [None]:
# write one Uniprot ID per line
low_mw_file = data_path + "Biomarker_discovery/Biomarker_candidates_low_mw.txt"
with open(low_mw_file, "w") as f:
    f.writelines("%s\n" % item for item in low_mw_biomarker_candidates["Uniprot"])

In [None]:
# set which points to colour based on molecular weight and probability cut-off
col = np.where((df_elevated_FP["CSF_proba"] > 0.75) & (df_elevated_FP["log2(Molecular weight)"] < 16.0460), 
    "lightseagreen", "grey")

fig, ax = plt.subplots(1, 1, figsize=(5,5))

ax.scatter(x="log2(Molecular weight)", y="CSF_proba", data=df_elevated_FP, color=col)
# label the axes so the saved figure can be read on its own
ax.set(xlabel="log2(Molecular weight)", ylabel="Predicted probability")

plt.show()
fig.savefig(fig_path + "Scatter_false_positives_molecular_weight_probability.png", bbox_inches="tight")

### Compare protein groups

In [None]:
# overlap between the two candidate lists, compared by Uniprot ID
low_exp_ids = set(low_exp_biomarker_candidates["Uniprot"])
low_mw_ids = set(low_mw_biomarker_candidates["Uniprot"])

venn2([low_exp_ids, low_mw_ids], ["Low brain expression", "Low molecular weight"])
plt.show()

# Investigate false negatives

These proteins were predicted not to be secreted to the CSF but are annotated as CSF proteins. One explanation would be that these proteins are actually false positives in the CSF mass spectrometry studies. False positive proteins are likely to be found in only one study and with only one peptide.

In [None]:
# annotated CSF proteins of the brain elevated set with a predicted probability below 0.5
is_false_negative = df_elevated_CSF["CSF_proba"] < 0.5
df_elevated_FN = df_elevated_CSF[is_false_negative]
print("Number of false negative proteins in brain elevated dataset:", len(df_elevated_FN))

## Check maximum peptide count of false negatives vs. all positive CSF proteins 

In [None]:
# highest peptide count of each protein across the per-study "#Peptides_<study>" columns
peptide_count_cols = ["#Peptides_" + study for study in ("Macron2018A", "Macron2020", "Zhang2015",
    "Guldbrandsen2014", "Macron2018B", "Schutzer2010", "Pan2007")]
csf["#Peptides_max"] = csf[peptide_count_cols].max(axis=1)

# proteins found in at least two / at least three studies
csf_2plus, csf_3plus = (csf[csf["#Studies"] >= min_studies] for min_studies in (2, 3))

### All CSF proteins

In [None]:
df_elevated_FN_peptides = df_elevated_FN[["Uniprot"]].merge(csf[["Uniprot", "#Peptides_max"]], on="Uniprot", how="inner")
df_elevated_CSF_peptides = df_elevated_CSF[["Uniprot"]].merge(csf[["Uniprot", "#Peptides_max"]], on="Uniprot", how="inner")

# the mean of a boolean mask is the fraction of True values; it is NaN, rather
# than a ZeroDivisionError, when a merge returns no rows
print("Fraction of proteins identified with only one matching peptide in false negatives: %.04f" 
    % ((df_elevated_FN_peptides["#Peptides_max"] == 1).mean()))
print("Fraction of proteins identified with only one matching peptide in all CSF proteins: %.04f" 
    % ((df_elevated_CSF_peptides["#Peptides_max"] == 1).mean()))

fig, ax = plt.subplots(1, 2, figsize=(10,5))
plt.subplots_adjust(wspace=0.3)

# cumulative histograms of the maximum peptide count: false negatives (left) vs. all CSF proteins (right)
sns.histplot(data=df_elevated_FN_peptides, x="#Peptides_max", discrete=True, cumulative=True, ax=ax[0])
ax[0].set(title="False negative CSF proteins", xlabel="Maximum peptide number", ylabel="Cumulative count")
sns.histplot(data=df_elevated_CSF_peptides, x="#Peptides_max", discrete=True, cumulative=True, ax=ax[1])
ax[1].set(title="All CSF proteins", xlabel="Maximum peptide number", ylabel="Cumulative count")

plt.show()
fig.savefig(fig_path + "Hist_cumulative_count_max_peptides.png", bbox_inches="tight")

### 2+ studies CSF proteins

In [None]:
# inner merges keep only the proteins that are also in csf_2plus
csf_2plus_FN_peptides = df_elevated_FN[["Uniprot"]].merge(csf_2plus[["Uniprot", "#Peptides_max"]], on="Uniprot", how="inner")
csf_2plus_CSF_peptides = df_elevated_CSF[["Uniprot"]].merge(csf_2plus[["Uniprot", "#Peptides_max"]], on="Uniprot", how="inner")

# the mean of a boolean mask is the fraction of True values; it is NaN, rather
# than a ZeroDivisionError, when a merge returns no rows
print("Fraction of proteins identified with only one matching peptide in false negatives: %.04f" 
    % ((csf_2plus_FN_peptides["#Peptides_max"] == 1).mean()))
print("Fraction of proteins identified with only one matching peptide in all CSF proteins: %.04f" 
    % ((csf_2plus_CSF_peptides["#Peptides_max"] == 1).mean()))

fig, ax = plt.subplots(1, 2, figsize=(10,5))
plt.subplots_adjust(wspace=0.3)

# cumulative histograms of the maximum peptide count: false negatives (left) vs. all CSF proteins (right)
sns.histplot(data=csf_2plus_FN_peptides, x="#Peptides_max", discrete=True, cumulative=True, ax=ax[0])
ax[0].set(title="False negative CSF proteins", xlabel="Maximum peptide number", ylabel="Cumulative count")
sns.histplot(data=csf_2plus_CSF_peptides, x="#Peptides_max", discrete=True, cumulative=True, ax=ax[1])
ax[1].set(title="All CSF proteins", xlabel="Maximum peptide number", ylabel="Cumulative count")

plt.show()
fig.savefig(fig_path + "Hist_cumulative_count_max_peptides_2plus.png", bbox_inches="tight")

### 3+ studies CSF proteins

In [None]:
# inner merges keep only the proteins that are also in csf_3plus
csf_3plus_FN_peptides = df_elevated_FN[["Uniprot"]].merge(csf_3plus[["Uniprot", "#Peptides_max"]], on="Uniprot", how="inner")
csf_3plus_CSF_peptides = df_elevated_CSF[["Uniprot"]].merge(csf_3plus[["Uniprot", "#Peptides_max"]], on="Uniprot", how="inner")

# the mean of a boolean mask is the fraction of True values; it is NaN, rather
# than a ZeroDivisionError, when a merge returns no rows
print("Fraction of proteins identified with only one matching peptide in false negatives: %.04f" 
    % ((csf_3plus_FN_peptides["#Peptides_max"] == 1).mean()))
print("Fraction of proteins identified with only one matching peptide in all CSF proteins: %.04f" 
    % ((csf_3plus_CSF_peptides["#Peptides_max"] == 1).mean()))

fig, ax = plt.subplots(1, 2, figsize=(10,5))
plt.subplots_adjust(wspace=0.3)

# cumulative histograms of the maximum peptide count: false negatives (left) vs. all CSF proteins (right)
sns.histplot(data=csf_3plus_FN_peptides, x="#Peptides_max", discrete=True, cumulative=True, ax=ax[0])
ax[0].set(title="False negative CSF proteins", xlabel="Maximum peptide number", ylabel="Cumulative count")
sns.histplot(data=csf_3plus_CSF_peptides, x="#Peptides_max", discrete=True, cumulative=True, ax=ax[1])
ax[1].set(title="All CSF proteins", xlabel="Maximum peptide number", ylabel="Cumulative count")

plt.show()
fig.savefig(fig_path + "Hist_cumulative_count_max_peptides_3plus.png", bbox_inches="tight")

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,5))

# predicted probability and maximum peptide count per protein, joined on the
# only column the two tables share
elevated_proba = df_elevated_CSF[["Uniprot", "CSF_proba"]]
max_peptides = csf[["Uniprot", "#Peptides_max"]]
CSF_proba_peptides = elevated_proba.merge(max_peptides, on="Uniprot")

sns.scatterplot(y="CSF_proba", x="#Peptides_max", data=CSF_proba_peptides, ax=ax)
ax.set(title="Correlation between probability score and maximum peptide count", 
          xlabel="Maximum number of identified peptides per study", ylabel="Predicted probability")

plt.show()
fig.savefig(fig_path + "Scatter_max_peptides_probability.png", bbox_inches="tight")