# Import libraries and data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from utils import keep_first_uniprot

pd.options.mode.chained_assignment = None

In [2]:
data_path = os.getcwd() + "/Datasets/CSF/raw_AD/"

In [3]:
def get_uniprot(string):
    
    try:
        _, uniprot = string.split("|")
    except ValueError:
        _, uniprot, _ = string.split("|")
        
    if "-" in uniprot:
        uniprot, _ = uniprot.split("-")
    
    return uniprot

# Alzheimer's Disease CSF

## Higginbotham2020

Integrated proteomics reveals brain-based cerebrospinal fluid biomarkers in asymptomatic and symptomatic Alzheimer’s disease
https://doi.org/10.1126/sciadv.aaz9360

In [4]:
Higginbotham2020 = pd.read_csv(data_path + "Higginbotham2020.csv", header=None, sep=";", skiprows=6, 
    names=["Uniprot", "Gene Symbol", "Brain Module", "p Value", "BH FDR", "Log2 Difference"])
Higginbotham2020["Uniprot"] = Higginbotham2020["Uniprot"].apply(get_uniprot)
Higginbotham2020.drop_duplicates(subset=["Uniprot"], inplace=True)
print("Number of unique proteins:", len(set(Higginbotham2020["Uniprot"])))

Number of unique proteins: 2828


## Sathe2019
Quantitative Proteomic Profiling of Cerebrospinal Fluid to Identify Candidate Biomarkers for Alzheimer’s Disease 
https://doi.org/10.1002/prca.201800105

In [5]:
Sathe2019 = pd.read_csv(data_path + "Sathe2019.csv", header=2, sep=";", usecols=["Gene Symbol"])
Sathe2019.columns = ["Gene Names"]
Sathe2019.drop_duplicates(subset=["Gene Names"], inplace=True)

In [6]:
# gene name to Uniprot ID mapping for human proteome
mapping = pd.read_csv(os.getcwd() + "/Datasets/Uniprot/Human_proteome_Uniprot_Gene_name.tsv", sep="\t", header=0, names=["Uniprot", "Gene Names"])

# extract and keep first gene name
mapping["Gene Names"] = mapping["Gene Names"].str.split(" ", expand=True)[0]

In [7]:
Sathe2019_mapped = Sathe2019.merge(mapping, on="Gene Names", how="inner")
Sathe2019_mapped
Sathe2019_mapped.drop_duplicates(subset=["Uniprot"], inplace=True)
print("Number of unique proteins:", len(set(Sathe2019_mapped["Uniprot"])))

Number of unique proteins: 2310


## Bader2020
Proteome profiling in cerebrospinal fluid reveals novel biomarkers of Alzheimer’s disease https://doi.org/10.15252/msb.20199356

In [8]:
Bader2020 = pd.read_csv(data_path + "Bader2020.csv", header=0, sep=";", skiprows=[1], 
    usecols=["PG.ProteinAccessions (uniprot protein accessions)"])
Bader2020.columns = ["Uniprot"]
Bader2020["Uniprot"] = Bader2020["Uniprot"].apply(keep_first_uniprot, delim=";")
Bader2020.drop_duplicates(inplace=True)
print("Number of unique proteins:", len(set(Bader2020["Uniprot"])))

Number of unique proteins: 1484


## Create AD CSF data set

In [9]:
AD_csf_uniprots = set(Higginbotham2020["Uniprot"]).union(set(Sathe2019_mapped["Uniprot"]), set(Bader2020["Uniprot"]))
AD_csf = pd.DataFrame(AD_csf_uniprots, columns=["Uniprot"])

In [10]:
AD_csf["Higginbotham2020"] = np.where(AD_csf["Uniprot"].isin(set(Higginbotham2020["Uniprot"])), 1, 0)
AD_csf["Sathe2019"] = np.where(AD_csf["Uniprot"].isin(set(Sathe2019_mapped["Uniprot"])), 1, 0)
AD_csf["Bader2020"] = np.where(AD_csf["Uniprot"].isin(set(Bader2020["Uniprot"])), 1, 0)
AD_csf["#Studies"] = AD_csf[["Higginbotham2020", "Sathe2019", "Bader2020"]].sum(axis=1)
AD_csf["#Studies"].value_counts(dropna=False)
AD_csf

Unnamed: 0,Uniprot,Higginbotham2020,Sathe2019,Bader2020,#Studies
0,B1AL88,1,0,0,1
1,P31153,1,1,0,2
2,Q9NPZ5,0,1,0,1
3,P02538,1,1,1,3
4,A6NNC1,0,1,0,1
...,...,...,...,...,...
3816,P45974,1,0,0,1
3817,P55290,1,1,1,3
3818,P02818,1,1,0,2
3819,Q01974,0,1,0,1


In [11]:
print("All Uniprot IDs are unique:", AD_csf["Uniprot"].is_unique)

All Uniprot IDs are unique: True


# Save final data set

In [12]:
AD_csf.to_csv(os.getcwd() + "/Datasets/CSF/AD_CSF.csv", index=False)

with open(os.getcwd() + "/Datasets/CSF/AD_CSF_Uniprot.txt", "w") as f:
    for item in AD_csf["Uniprot"]:
        f.write("%s\n" % item)