# Import libraries and data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

# Turn off pandas' chained-assignment check for the whole notebook
# (None disables both the warning and the error mode).
pd.set_option("mode.chained_assignment", None)

In [2]:
# Raw study exports are read from folders below the current working directory.
project_root = os.getcwd()
data_path_AD = project_root + "/Datasets/CSF/raw_AD/"
data_path_MS = project_root + "/Datasets/CSF/raw_MS/"

In [3]:
def get_uniprot(string):
    """Extract the accession from a pipe-delimited protein identifier.

    The identifier must consist of two (``<a>|<accession>``) or three
    (``<a>|<accession>|<b>``) pipe-separated fields; the accession is always
    the second field. Everything from the first hyphen onwards is stripped
    from the accession, e.g. ``P05067-2`` becomes ``P05067``.

    Parameters
    ----------
    string : str
        Pipe-delimited identifier.

    Returns
    -------
    str
        The second field, truncated at its first hyphen (if any).

    Raises
    ------
    ValueError
        If ``string`` does not contain exactly two or three
        pipe-separated fields.
    """
    fields = string.split("|")
    if len(fields) not in (2, 3):
        raise ValueError(
            "expected 2 or 3 '|'-separated fields, got %d: %r" % (len(fields), string)
        )

    # maxsplit=1 keeps this working for accessions with more than one hyphen,
    # where unpacking the result of a plain split("-") would raise.
    return fields[1].split("-", maxsplit=1)[0]

def get_first_uniprot(string):
    """Return the first entry of a semicolon-separated list.

    If ``string`` contains no semicolon it is returned unchanged; otherwise
    only the text before the first semicolon is returned.
    """
    return string.split(";", maxsplit=1)[0] if ";" in string else string

# Alzheimer's Disease CSF

## Higginbotham2020

In [4]:
# The file is read without a header row (the first six lines are skipped),
# so the column labels are supplied explicitly.
higginbotham_columns = ["Uniprot", "Gene Symbol", "Brain Module", "p Value", "BH FDR", "Log2 Difference"]
Higginbotham2020 = pd.read_csv(
    data_path_AD + "Higginbotham2020.csv",
    sep=";",
    header=None,
    skiprows=6,
    names=higginbotham_columns,
)

# Reduce every identifier to its bare accession, then keep the first row per accession.
Higginbotham2020["Uniprot"] = Higginbotham2020["Uniprot"].apply(get_uniprot)
Higginbotham2020 = Higginbotham2020.drop_duplicates(subset=["Uniprot"])

n_unique = len(set(Higginbotham2020["Uniprot"]))
print("Number of unique proteins:", n_unique)

Number of unique proteins: 2828


## Sathe2019

In [5]:
# Only the gene symbol column is loaded; it is renamed so that it can be
# joined on "Gene Names", and repeated symbols are collapsed to one row.
Sathe2019 = (
    pd.read_csv(data_path_AD + "Sathe2019.csv", sep=";", header=2, usecols=["Gene Symbol"])
    .rename(columns={"Gene Symbol": "Gene Names"})
    .drop_duplicates(subset=["Gene Names"])
)

In [6]:
# Lookup table between Uniprot IDs and gene names for the human proteome
mapping = pd.read_csv(
    os.getcwd() + "/Datasets/Uniprot/Human_proteome_Uniprot_Gene_name.tsv",
    sep="\t",
    header=0,
    names=["Uniprot", "Gene Names"],
)

# An entry may list several space-separated gene names; only the first is kept
gene_name_parts = mapping["Gene Names"].str.split(" ", expand=True)
mapping["Gene Names"] = gene_name_parts[0]

In [7]:
# Translate gene names into Uniprot IDs. The inner join drops gene names that
# have no entry in the mapping table; afterwards one row per Uniprot ID is kept.
# NOTE(review): pandas merges match missing (NaN) keys with each other --
# confirm that neither frame contains missing gene names.
Sathe2019_mapped = (
    Sathe2019
    .merge(mapping, on="Gene Names", how="inner")
    .drop_duplicates(subset=["Uniprot"])
)
print("Number of unique proteins:", len(set(Sathe2019_mapped["Uniprot"])))

Number of unique proteins: 2310


## Bader2020

In [8]:
# Only the accession column is needed from this export.
bader_accession_column = "PG.ProteinAccessions (uniprot protein accessions)"
Bader2020 = pd.read_csv(
    data_path_AD + "Bader2020.csv",
    sep=";",
    header=0,
    skiprows=[1],
    usecols=[bader_accession_column],
)
Bader2020 = Bader2020.rename(columns={bader_accession_column: "Uniprot"})

# Entries listing several semicolon-separated accessions are reduced to the first one.
Bader2020["Uniprot"] = Bader2020["Uniprot"].apply(get_first_uniprot)
Bader2020 = Bader2020.drop_duplicates()

n_unique = len(set(Bader2020["Uniprot"]))
print("Number of unique proteins:", n_unique)

Number of unique proteins: 1484


## Create AD CSF data set

In [9]:
# Build the union of all Uniprot IDs reported by the AD studies.
# DataFrame.merge returns a new frame and leaves its operands untouched, so
# each merge result has to be assigned back -- otherwise IDs that occur only
# in the last study would be missing from the final data set.
ad_studies = [Higginbotham2020, Sathe2019_mapped, Bader2020]
AD_csf = ad_studies[0][["Uniprot"]]
for study in ad_studies[1:]:
    AD_csf = AD_csf.merge(study[["Uniprot"]], how="outer", on="Uniprot")

# "#Studies" = number of studies in which each Uniprot ID was found.
AD_csf["#Studies"] = sum(
    AD_csf["Uniprot"].isin(study["Uniprot"]).astype(int) for study in ad_studies
)

In [10]:
# Sanity check: every Uniprot ID should occur exactly once in the merged table.
ad_ids_unique = AD_csf["Uniprot"].is_unique
print("All Uniprot IDs are unique:", ad_ids_unique)

All Uniprot IDs are unique: True


# Multiple Sclerosis CSF

## Mosleth2021

In [11]:
# Only the "Association " column (note the trailing space in the file's
# header) is loaded and then relabelled as "Uniprot".
Mosleth2021 = pd.read_csv(
    data_path_MS + "Mosleth2021.csv",
    sep=";",
    header=0,
    skiprows=[0, 1, 3, 4],
    usecols=["Association "],
)
Mosleth2021 = Mosleth2021.rename(columns={"Association ": "Uniprot"}).drop_duplicates()

n_unique = len(set(Mosleth2021["Uniprot"]))
print("Number of unique proteins:", n_unique)

Number of unique proteins: 779


## Kroksveen2013

In [12]:
Kroksveen2013 = pd.read_csv(data_path_MS + "Kroksveen2013_IPI_to_Uniprot.txt", sep="\t")

# Rows whose accession is the placeholder "-" are discarded. The selection is
# made with .loc and de-duplicated by chaining, so a new frame is produced
# instead of mutating a filtered slice of Kroksveen2013 in place.
has_accession = Kroksveen2013["UniProt Accession"] != "-"
Kroksveen2013_filtered = (
    Kroksveen2013.loc[has_accession, ["UniProt Accession"]]
    .rename(columns={"UniProt Accession": "Uniprot"})
    .drop_duplicates()
)
print("Number of unique proteins:", len(set(Kroksveen2013_filtered["Uniprot"])))

Number of unique proteins: 1034


## Stoop2013

In [13]:
# Load the "Protein" column only, relabel it as "Uniprot" and drop repeated rows.
Stoop2013 = (
    pd.read_csv(data_path_MS + "Stoop2013.csv", sep=";", header=2, usecols=["Protein"])
    .rename(columns={"Protein": "Uniprot"})
    .drop_duplicates()
)

## Create MS CSF data set

In [14]:
# Build the union of all Uniprot IDs reported by the MS studies.
# DataFrame.merge returns a new frame and leaves its operands untouched, so
# each merge result has to be assigned back -- otherwise IDs that occur only
# in the last study would be missing from the final data set.
ms_studies = [Mosleth2021, Kroksveen2013_filtered, Stoop2013]
MS_csf = ms_studies[0][["Uniprot"]]
for study in ms_studies[1:]:
    MS_csf = MS_csf.merge(study[["Uniprot"]], how="outer", on="Uniprot")

# "#Studies" = number of studies in which each Uniprot ID was found.
MS_csf["#Studies"] = sum(
    MS_csf["Uniprot"].isin(study["Uniprot"]).astype(int) for study in ms_studies
)

In [15]:
# Sanity check: every Uniprot ID should occur exactly once in the merged table.
ms_ids_unique = MS_csf["Uniprot"].is_unique
print("All Uniprot IDs are unique:", ms_ids_unique)

All Uniprot IDs are unique: True


# Save final data sets

In [16]:
# Full table (Uniprot ID and study count), written without the index column
ad_table_path = os.getcwd() + "/Datasets/CSF/AD_CSF.csv"
AD_csf.to_csv(ad_table_path, index=False)

# Plain list of Uniprot IDs, one per line
ad_id_list_path = os.getcwd() + "/Datasets/CSF/AD_CSF_Uniprot.txt"
with open(ad_id_list_path, "w") as id_file:
    id_file.writelines("%s\n" % uniprot_id for uniprot_id in AD_csf["Uniprot"])

In [17]:
# Full table (Uniprot ID and study count), written without the index column
ms_table_path = os.getcwd() + "/Datasets/CSF/MS_CSF.csv"
MS_csf.to_csv(ms_table_path, index=False)

# Plain list of Uniprot IDs, one per line
ms_id_list_path = os.getcwd() + "/Datasets/CSF/MS_CSF_Uniprot.txt"
with open(ms_id_list_path, "w") as id_file:
    id_file.writelines("%s\n" % uniprot_id for uniprot_id in MS_csf["Uniprot"])