# Import libraries and data

In [1]:
# import libraries 
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
# import re
import seaborn as sns

from utils import keep_first_uniprot

pd.options.mode.chained_assignment = None

In [2]:
data_path = os.getcwd() + "/Datasets/CSF/raw/"

## Macron2018A
Deep Dive on the Proteome of Human Cerebrospinal Fluid: A Valuable Data Resource for Biomarker Discovery and Missing Protein Identification
https://doi.org/10.1021/acs.jproteome.8b00300

In [3]:
Macron2018A = pd.read_csv(data_path + "Macron2018A.csv", sep=";", header=1)

In [4]:
Macron2018A = Macron2018A[["Protein Accession Number"]]
Macron2018A.columns = ["Uniprot"]
# count number of peptides per Uniprot ID
Macron2018A["#Peptides_Macron2018A"] = Macron2018A.groupby("Uniprot")["Uniprot"].transform("count")
Macron2018A.drop_duplicates(subset=["Uniprot"], inplace=True)
# keep first entry for proteins with more than one associated Uniprot ID
Macron2018A["Uniprot"] = Macron2018A["Uniprot"].apply(keep_first_uniprot)
Macron2018A[:5]

Unnamed: 0,Uniprot,#Peptides_Macron2018A
0,Q6K0P9,2
2,Q9GZZ8,1
3,P09529,3
6,P61019,2
8,Q9GZX9,4


In [5]:
print("Number of unique proteins with at least 1 peptide:", len(Macron2018A[Macron2018A["#Peptides_Macron2018A"] > 0]))
print("Number of unique proteins with at least 2 peptides:", len(Macron2018A[Macron2018A["#Peptides_Macron2018A"] > 1]))

Number of unique proteins with at least 1 peptide: 3379
Number of unique proteins with at least 2 peptides: 2305


## Macron2020
Exploration of human cerebrospinal fluid: A large proteome dataset revealed by trapped ion mobility time-of-flight mass spectrometry 
https://doi.org/10.1016/j.dib.2020.105704

In [6]:
Macron2020 = pd.read_csv(data_path + "Macron2020.csv", sep=";", header=1, low_memory=False)

In [7]:
Macron2020 = Macron2020[["Protein Accession Number"]]
Macron2020.columns = ["Uniprot"]
# count number of peptides per Uniprot ID
Macron2020["#Peptides_Macron2020"] = Macron2020.groupby("Uniprot")["Uniprot"].transform("count")
Macron2020.drop_duplicates(subset=["Uniprot"], inplace=True)
Macron2020.dropna(inplace=True)
# keep first entry for proteins with more than one associated Uniprot ID
Macron2020["Uniprot"] = Macron2020["Uniprot"].apply(keep_first_uniprot)
Macron2020[:5]

Unnamed: 0,Uniprot,#Peptides_Macron2020
0,P61604,7.0
7,Q9HC56,20.0
27,Q13822,52.0
79,Q9HC38,8.0
87,Q9HC36,2.0


In [8]:
print("Number of unique proteins with at least 1 peptide:", len(Macron2020[Macron2020["#Peptides_Macron2020"] > 0]))
print("Number of unique proteins with at least 2 peptides:", len(Macron2020[Macron2020["#Peptides_Macron2020"] > 1]))

Number of unique proteins with at least 1 peptide: 3174
Number of unique proteins with at least 2 peptides: 3174


## Zhang2015
(Data for) a comprehensive map and functional annotation of the human cerebrospinal fluid proteome
https://doi.org/10.1016/j.dib.2015.02.004

In [9]:
Zhang2015 = pd.read_csv(data_path + "Zhang2015.csv", sep=";") # contains only proteins with at least two peptides identified

In [10]:
Zhang2015 = Zhang2015[["Accession", "Flow-through Proteins", "Original Proteins", "Bound Proteins"]]
# find maximum number of identified peptides
Zhang2015["#Peptides_Zhang2015"] = Zhang2015[["Flow-through Proteins", "Original Proteins", "Bound Proteins"]].max(axis=1)
Zhang2015.drop(labels=["Flow-through Proteins", "Original Proteins", "Bound Proteins"], axis=1, inplace=True)
Zhang2015.columns = ["Uniprot", "#Peptides_Zhang2015"]
Zhang2015[:5]

Unnamed: 0,Uniprot,#Peptides_Zhang2015
0,A1L4H1,17.0
1,A2RU67,8.0
2,A4D0S4,2.0
3,A4D0V7,6.0
4,A6BM72,2.0


In [11]:
print("Number of unique proteins with at least 1 peptide:", len(Zhang2015[Zhang2015["#Peptides_Zhang2015"] > 0]))
print("Number of unique proteins with at least 2 peptides:", len(Zhang2015[Zhang2015["#Peptides_Zhang2015"] > 1]))

Number of unique proteins with at least 1 peptide: 2513
Number of unique proteins with at least 2 peptides: 2513


## Guldbrandsen2014

In-depth characterization of the cerebrospinal fluid (CSF) proteome displayed through the CSF proteome resource (CSF-PR)
https://doi.org/10.1074/mcp.m114.038554

In [12]:
Guldbrandsen2014a = pd.read_csv(data_path + "Guldbrandsen2014_MM_depleted.csv", sep=",", header=1, index_col=0) 
# mixed mode depleted fraction
Guldbrandsen2014b = pd.read_csv(data_path + "Guldbrandsen2014_MM_bound.csv", sep=",", header=1, index_col=0) 
# mixed mode bound fraction
Guldbrandsen2014c = pd.read_csv(data_path + "Guldbrandsen2014_Gel_depleted.csv", sep=",", header=1, index_col=0) 
# gel depleted fraction
Guldbrandsen2014d = pd.read_csv(data_path + "Guldbrandsen2014_Gel_bound.csv", sep=",", header=1, index_col=0) 
# gel bound fraction
Guldbrandsen2014e = pd.read_csv(data_path + "Guldbrandsen2014_Glyco_MM.csv", sep=",", header=1, index_col=0) 
# glyco mixed mode

In [13]:
def curate_Guldbrandsen(dataset):
    # keep and rename relevant columns
    dataset = dataset[["Accession", "#Peptides", "Validated"]]
    dataset.columns = ["Uniprot", "#Peptides", "Validated"]
    # convert peptide count to integer
    dataset["#Peptides"] = dataset["#Peptides"].astype(int)
    # remove datasets with no associated peptides and non-validated proteins
    dataset = dataset[dataset["#Peptides"] > 0]
    dataset = dataset[dataset["Validated"] == True]
    # remove duplicated Uniprots
    dataset.drop_duplicates(subset=["Uniprot"], inplace=True)
    
    return dataset[["Uniprot", "#Peptides"]]

datasets_Guldbrandsen = [Guldbrandsen2014a, Guldbrandsen2014b, Guldbrandsen2014c, Guldbrandsen2014d, Guldbrandsen2014e]
desc_Guldbrandsen = ["Mixed mode depleted fraction", "Mixed mode bound fraction", "Gel depleted fraction",
    "Gel bound fraction", "Glyco mixed mode fraction"]

Guldbrandsen2014a = curate_Guldbrandsen(Guldbrandsen2014a)
Guldbrandsen2014b = curate_Guldbrandsen(Guldbrandsen2014b)
Guldbrandsen2014c = curate_Guldbrandsen(Guldbrandsen2014c)
Guldbrandsen2014d = curate_Guldbrandsen(Guldbrandsen2014d)
Guldbrandsen2014e = curate_Guldbrandsen(Guldbrandsen2014e)

In [14]:
for i, d in enumerate(datasets_Guldbrandsen):
    print(desc_Guldbrandsen[i])
    print("Number of unique proteins with at least 1 peptide:", len(d[d["#Peptides"] > 0]))
    print("Number of unique proteins with at least 2 peptides:", len(d[d["#Peptides"] > 1]))

Mixed mode depleted fraction
Number of unique proteins with at least 1 peptide: 3086
Number of unique proteins with at least 2 peptides: 2011
Mixed mode bound fraction
Number of unique proteins with at least 1 peptide: 445
Number of unique proteins with at least 2 peptides: 226
Gel depleted fraction
Number of unique proteins with at least 1 peptide: 2282
Number of unique proteins with at least 2 peptides: 1531
Gel bound fraction
Number of unique proteins with at least 1 peptide: 539
Number of unique proteins with at least 2 peptides: 358
Glyco mixed mode fraction
Number of unique proteins with at least 1 peptide: 735
Number of unique proteins with at least 2 peptides: 364


In [15]:
# merge peptide counts of all Guldbrandsen datasets
Guldbrandsen2014_all = Guldbrandsen2014a.merge(Guldbrandsen2014b, how="outer", on="Uniprot", suffixes=("_a", "_b"))
Guldbrandsen2014_all = Guldbrandsen2014_all.merge(Guldbrandsen2014c, how="outer", on="Uniprot")
Guldbrandsen2014_all = Guldbrandsen2014_all.merge(Guldbrandsen2014d, how="outer", on="Uniprot")
Guldbrandsen2014_all = Guldbrandsen2014_all.merge(Guldbrandsen2014e, how="outer", on="Uniprot")
Guldbrandsen2014_all[:5]

Unnamed: 0,Uniprot,#Peptides_a,#Peptides_b,#Peptides_x,#Peptides_y,#Peptides
0,P0C0L5,200.0,,243.0,79.0,
1,P0C0L4,199.0,69.0,242.0,79.0,48.0
2,P00450,174.0,36.0,128.0,19.0,62.0
3,P02751,159.0,19.0,164.0,70.0,26.0
4,P02790,138.0,28.0,112.0,21.0,34.0


In [16]:
# keep only maximum peptide number across all 4 datasets
Guldbrandsen2014_all["#Peptides_Guldbrandsen2014"] = Guldbrandsen2014_all[["#Peptides_a", "#Peptides_b", "#Peptides_x", 
    "#Peptides_y", "#Peptides"]].max(axis=1)
Guldbrandsen2014_all.drop(labels=["#Peptides_a", "#Peptides_b", "#Peptides_x", "#Peptides_y", "#Peptides"], axis=1, 
    inplace=True)

In [17]:
print("Number of unique proteins with at least 1 peptide:", 
    len(Guldbrandsen2014_all[Guldbrandsen2014_all["#Peptides_Guldbrandsen2014"] > 0]))
print("Number of unique proteins with at least 2 peptides:", 
    len(Guldbrandsen2014_all[Guldbrandsen2014_all["#Peptides_Guldbrandsen2014"] > 1]))

Number of unique proteins with at least 1 peptide: 2484
Number of unique proteins with at least 2 peptides: 1944


## Macron2018B
Identification of Missing Proteins in Normal Human Cerebrospinal Fluid https://doi.org/10.1021/acs.jproteome.8b00194

In [18]:
Macron2018B = pd.read_csv(data_path + "Macron2018B.csv", sep=";", header=1)

In [19]:
Macron2018B = Macron2018B[["Protein accession number"]]
Macron2018B.columns = ["Uniprot"]
# count number of peptides per Uniprot ID
Macron2018B["#Peptides_Macron2018B"] = Macron2018B.groupby("Uniprot")["Uniprot"].transform("count")
Macron2018B.drop_duplicates(subset=["Uniprot"], inplace=True)
Macron2018B.dropna(inplace=True)
# keep first entry for proteins with more than one associated Uniprot ID
Macron2018B["Uniprot"] = Macron2018B["Uniprot"].apply(keep_first_uniprot)
Macron2018B[:5]

Unnamed: 0,Uniprot,#Peptides_Macron2018B
0,P09529,1.0
1,Q96RQ9,1.0
2,P61019,2.0
4,Q9GZX9,4.0
8,Q9GZX3,1.0


In [20]:
print("Number of unique proteins with at least 1 peptide:", len(Macron2018B[Macron2018B["#Peptides_Macron2018B"] > 0]))
print("Number of unique proteins with at least 2 peptides:", len(Macron2018B[Macron2018B["#Peptides_Macron2018B"] > 1]))

Number of unique proteins with at least 1 peptide: 2281
Number of unique proteins with at least 2 peptides: 1484


## Schutzer2010
Establishing the Proteome of Normal Human Cerebrospinal Fluid
https://doi.org/10.1371/journal.pone.0010980

In [21]:
Schutzer2010 = pd.read_csv(data_path + "Schutzer2010.csv", sep=";", header=None, names=["IPI"])
# copied out of PDF, not correctly modified

In [22]:
# count number of peptides per Uniprot ID
Schutzer2010["IPI"] = Schutzer2010["IPI"].apply(lambda x: x.split(" ")[0])
# remove rows that do not contain IPI IDs
Schutzer2010 = Schutzer2010[Schutzer2010["IPI"].str.contains("IPI")]
Schutzer2010["#Peptides_Schutzer2010"] = Schutzer2010.groupby("IPI")["IPI"].transform("count")
Schutzer2010.drop_duplicates(subset=["IPI"], inplace=True)
Schutzer2010[:5]

Unnamed: 0,IPI,#Peptides_Schutzer2010
0,IPI00000027,1
3,IPI00000044,5
8,IPI00000070,1
9,IPI00000076,1
10,IPI00000087,2


In [23]:
# save IPI IDs to text file for IPI to Uniprot mapping 
with open(data_path + "Schutzer2010_IPI.txt", "w") as f:
    for item in Schutzer2010["IPI"]:
        f.write("%s\n" % item)

Conversion tool: https://biodbnet-abcc.ncifcrf.gov/db/db2db.php

In [24]:
Schutzer2010_mapping = pd.read_csv(data_path + "Schutzer2010_IPI_to_Uniprot.csv", sep=";", header=0, names=["IPI", "Uniprot"])
# drop rows without successful mapping
Schutzer2010_mapping = Schutzer2010_mapping[Schutzer2010_mapping["Uniprot"].str.contains("-") == False]
Schutzer2010_mapping[:5]

Unnamed: 0,IPI,Uniprot
1,IPI00000044,P01127
2,IPI00000070,P01130
3,IPI00000076,P01138
4,IPI00000087,O60939
5,IPI00000104,O60942


In [25]:
# merge mapping table with protein list
Schutzer2010 = Schutzer2010_mapping.merge(Schutzer2010, how="inner", on="IPI")
Schutzer2010.drop(labels="IPI", axis=1, inplace=True)
Schutzer2010.drop_duplicates(subset=["Uniprot"], inplace=True)
Schutzer2010[:5]

Unnamed: 0,Uniprot,#Peptides_Schutzer2010
0,P01127,5
1,P01130,1
2,P01138,1
3,O60939,2
4,O60942,1


In [26]:
print("Number of proteins in data set with at least 1 peptide:", len(Schutzer2010[Schutzer2010["#Peptides_Schutzer2010"] > 0]))
print("Number of proteins in data set with at least 2 peptides:", len(Schutzer2010[Schutzer2010["#Peptides_Schutzer2010"] > 1]))

Number of proteins in data set with at least 1 peptide: 2067
Number of proteins in data set with at least 2 peptides: 1217


## Smaller CSF studies (not included)

### NunezGalindo2015
Proteomics of Cerebrospinal Fluid: Throughput and Robustness Using a Scalable Automated Analysis Pipeline for Biomarker Discovery 
https://doi.org/10.1021/acs.analchem.5b02748

In [27]:
NunezGalindo2015 = pd.read_csv(data_path + "NunezGalindo2015.csv", sep=";", header=143, low_memory=False)

In [28]:
NunezGalindo2015 = NunezGalindo2015[["Protein accession numbers", "Exclusive unique peptide count"]]
NunezGalindo2015.columns = ["Name", "#Peptides_NunezGalindo2015"]
NunezGalindo2015.drop_duplicates(subset=["Name"], inplace=True)
NunezGalindo2015.dropna(inplace=True)

In [29]:
NunezGalindo2015_mapping = pd.read_csv(data_path + "NunezGalindo2015_Name_to_Uniprot.tab", sep="\t", header=0, 
    names=["Name", "Uniprot"])
NunezGalindo2015_mapping[:5]

Unnamed: 0,Name,Uniprot
0,TIMP2_HUMAN,P16035
1,ANGT_HUMAN,P01019
2,PGAM1_HUMAN,P18669
3,SULF2_HUMAN,Q8IWU5
4,ROA1_HUMAN,P09651


In [30]:
# save IPI IDs to text file for IPI to Uniprot mapping 
with open(data_path + "NunezGalindo2015_Uniprot_name.txt", "w") as f:
    for item in NunezGalindo2015["Name"]:
        f.write("%s\n" % item)

In [31]:
# merge mapping table with protein list
NunezGalindo2015 = NunezGalindo2015_mapping.merge(NunezGalindo2015, how="inner", on="Name")
NunezGalindo2015.drop(labels="Name", axis=1, inplace=True)
NunezGalindo2015[:5]

Unnamed: 0,Uniprot,#Peptides_NunezGalindo2015
0,P16035,14.0
1,P01019,10.0
2,P18669,1.0
3,Q8IWU5,1.0
4,P09651,2.0


In [32]:
print("Number of unique proteins with at least 1 peptide:", 
      len(NunezGalindo2015[NunezGalindo2015["#Peptides_NunezGalindo2015"] > 0]))
print("Number of unique proteins with at least 2 peptides:", 
      len(NunezGalindo2015[NunezGalindo2015["#Peptides_NunezGalindo2015"] > 1]))

Number of unique proteins with at least 1 peptide: 742
Number of unique proteins with at least 2 peptides: 526


### Zougmann2007

In [33]:
Zougman2007 = pd.read_csv(data_path + "Zougman2007.csv", sep=";", header=0, names=["IPI"])

In [34]:
# count number of peptides per Uniprot ID
Zougman2007["IPI"] = Zougman2007["IPI"].apply(lambda x: x.split(" ")[0])
# remove rows that do not contain IPI IDs
Zougman2007 = Zougman2007[Zougman2007["IPI"].str.contains("IPI")]
Zougman2007["#Peptides_Zougman2007"] = Zougman2007.groupby("IPI")["IPI"].transform("count")
Zougman2007.drop_duplicates(subset=["IPI"], inplace=True)
Zougman2007[:5]

Unnamed: 0,IPI,#Peptides_Zougman2007
0,IPI00000024,2
2,IPI00000137,6
8,IPI00000138,4
12,IPI00000190,2
14,IPI00000265,2


In [35]:
# save IPI IDs to text file for IPI to Uniprot mapping 
with open(data_path + "Zougman2007_IPI.txt", "w") as f:
    for item in Zougman2007["IPI"]:
        f.write("%s\n" % item)

Conversion tool: https://biodbnet-abcc.ncifcrf.gov/db/db2db.php

In [36]:
Zougman2007_mapping = pd.read_csv(data_path + "Zougman2007_IPI_to_Uniprot.csv", sep=";", header=0, names=["IPI", "Uniprot"])
# drop rows without successful mapping
Zougman2007_mapping = Zougman2007_mapping[Zougman2007_mapping["Uniprot"].str.contains("-") == False]
Zougman2007_mapping[:5]

Unnamed: 0,IPI,Uniprot
0,IPI00000024,Q08174
1,IPI00000137,Q9UJJ9
2,IPI00000138,P26572
3,IPI00000190,P60033
4,IPI00000265,Q5VUB5


In [37]:
# merge mapping table with protein list
Zougman2007 = Zougman2007_mapping.merge(Zougman2007, how="inner", on="IPI")
Zougman2007.drop(labels="IPI", axis=1, inplace=True)
Zougman2007.drop_duplicates(subset=["Uniprot"], inplace=True)
Zougman2007[:5]

Unnamed: 0,Uniprot,#Peptides_Zougman2007
0,Q08174,2
1,Q9UJJ9,6
2,P26572,4
3,P60033,2
4,Q5VUB5,2


In [38]:
print("Number of unique proteins with at least 1 peptide:", len(Zougman2007[Zougman2007["#Peptides_Zougman2007"] > 0]))
print("Number of unique proteins with at least 2 peptides:", len(Zougman2007[Zougman2007["#Peptides_Zougman2007"] > 1]))

Number of unique proteins with at least 1 peptide: 623
Number of unique proteins with at least 2 peptides: 572


### Schilde2018
Protein variability in cerebrospinal fluid and its possible implications for neurological protein biomarker research
https://doi.org/10.1371/journal.pone.0206478

In [39]:
Schilde2018 = pd.read_csv(data_path + "Schilde2018.csv", sep=";") 

In [40]:
Schilde2018 = Schilde2018[["Protein IDs", "Unique peptides K1.1", "Unique peptides K1.2", "Unique peptides K1.3", 
                           "Unique peptides K2.1", "Unique peptides K2.2", "Unique peptides K2.3", "Unique peptides K5.1",
                           "Unique peptides K5.2", "Unique peptides K5.3", "Unique peptides K8.1", "Unique peptides K8.2", 
                           "Unique peptides K8.3", "Unique peptides K13.1", "Unique peptides K13.2", "Unique peptides K13.3",
                           "Unique peptides K14.1", "Unique peptides K14.2", "Unique peptides K14.3", "Unique peptides K26.1",
                           "Unique peptides K26.2", "Unique peptides K26.3", "Unique peptides K33.1", "Unique peptides K33.2",
                           "Unique peptides K33.3", "Unique peptides K36.1", "Unique peptides K36.2", "Unique peptides K36.3",
                           "Unique peptides K38.1", "Unique peptides K38.2", "Unique peptides K38.3", "Unique peptides K39.1",
                           "Unique peptides K39.2", "Unique peptides K39.3", "Unique peptides K40.1", "Unique peptides K40.2",
                           "Unique peptides K40.3"]]
# find maximum number of identified peptides
Schilde2018["#Peptides_Schilde2018"] = Schilde2018[["Unique peptides K1.1", "Unique peptides K1.2", "Unique peptides K1.3", 
                           "Unique peptides K2.1", "Unique peptides K2.2", "Unique peptides K2.3", "Unique peptides K5.1",
                           "Unique peptides K5.2", "Unique peptides K5.3", "Unique peptides K8.1", "Unique peptides K8.2", 
                           "Unique peptides K8.3", "Unique peptides K13.1", "Unique peptides K13.2", "Unique peptides K13.3",
                           "Unique peptides K14.1", "Unique peptides K14.2", "Unique peptides K14.3", "Unique peptides K26.1",
                           "Unique peptides K26.2", "Unique peptides K26.3", "Unique peptides K33.1", "Unique peptides K33.2",
                           "Unique peptides K33.3", "Unique peptides K36.1", "Unique peptides K36.2", "Unique peptides K36.3",
                           "Unique peptides K38.1", "Unique peptides K38.2", "Unique peptides K38.3", "Unique peptides K39.1",
                           "Unique peptides K39.2", "Unique peptides K39.3", "Unique peptides K40.1", "Unique peptides K40.2",
                           "Unique peptides K40.3"]].max(axis=1)

Schilde2018 = Schilde2018[["Protein IDs", "#Peptides_Schilde2018"]]
Schilde2018.columns = ["Uniprot", "#Peptides_Schilde2018"]
Schilde2018[:5]

Unnamed: 0,Uniprot,#Peptides_Schilde2018
0,A0A075B6S6,2.0
1,A0A0A0MS15,2.0
2,A0A0B4J1U7,3.0
3,A0A0C4DH38,3.0
4,A0A0C4DH68,2.0


In [41]:
print("Number of unique proteins with at least 1 peptide:", len(Schilde2018[Schilde2018["#Peptides_Schilde2018"] > 0]))
print("Number of unique proteins with at least 2 peptides:", len(Schilde2018[Schilde2018["#Peptides_Schilde2018"] > 1]))

Number of unique proteins with at least 1 peptide: 610
Number of unique proteins with at least 2 peptides: 610


### Guo2015
A Proteomic Analysis of Individual and Gender Variations in Normal Human Urine and Cerebrospinal Fluid Using iTRAQ Quantification
https://doi.org/10.1371/journal.pone.0133270

In [42]:
# peptide data sets
Guo2015_male = pd.read_csv(data_path + "Guo2015_D.csv", sep=";")
Guo2015_female = pd.read_csv(data_path + "Guo2015_E.csv", sep=";")

In [43]:
def curate_Guo(dataset):
    dataset = dataset[["Accession Numbers", "Sequence"]]
    dataset.columns = ["Uniprot", "Sequence"]
    # count number of peptides per Uniprot ID
    dataset["#Peptides"] = dataset.groupby("Uniprot")["Uniprot"].transform("count")
    dataset.drop(labels="Sequence", axis=1, inplace=True)
    dataset.drop_duplicates(subset=["Uniprot"], inplace=True)
    # drop entries with more than one associated Uniprot ID
    dataset = dataset[dataset["Uniprot"].str.contains(",") == False]
    return dataset

Guo2015_male = curate_Guo(Guo2015_male)
Guo2015_female = curate_Guo(Guo2015_female)

In [44]:
print("Number of unique proteins with at least 1 peptide (male):", len(Guo2015_male[Guo2015_male["#Peptides"] > 0]))
print("Number of unique proteins with at least 1 peptide (female):", len(Guo2015_female[Guo2015_female["#Peptides"] > 0]))

Number of unique proteins with at least 1 peptide (male): 441
Number of unique proteins with at least 1 peptide (female): 429


In [45]:
Guo2015_all = Guo2015_male.merge(Guo2015_female, how="outer", on="Uniprot")
Guo2015_all["#Peptides_Guo2015"] = Guo2015_all[["#Peptides_x", "#Peptides_y"]].max(axis=1)
Guo2015_all.drop(labels=["#Peptides_x", "#Peptides_y"], axis=1, inplace=True)
Guo2015_all[:5]

Unnamed: 0,Uniprot,#Peptides_Guo2015
0,P02768,194.0
1,P02787,158.0
2,P01024,254.0
3,P41222,32.0
4,P01009,76.0


In [46]:
print("Number of unique proteins with at least 1 peptide:", len(Guo2015_all[Guo2015_all["#Peptides_Guo2015"] > 0]))
print("Number of unique proteins with at least 2 peptides:", len(Guo2015_all[Guo2015_all["#Peptides_Guo2015"] > 1]))

Number of unique proteins with at least 1 peptide: 505
Number of unique proteins with at least 2 peptides: 497


### Stoop2010
Quantitative Proteomics and Metabolomics Analysis of Normal Human Cerebrospinal Fluid Samples
https://doi.org/10.1074/mcp.m900877-mcp200

In [47]:
Stoop2010 = pd.read_csv(data_path + "Stoop2010.csv", sep=";", header=2) 

In [48]:
Stoop2010 = Stoop2010[["Primary accession number", "number of unique peptides"]]
Stoop2010.columns = ["Uniprot", "#Peptides_Stoop2010"]
Stoop2010.dropna(inplace=True)
Stoop2010[:5]

Unnamed: 0,Uniprot,#Peptides_Stoop2010
0,P02768,63.0
1,P01024,62.0
2,P02787,58.0
3,P0C0L4,38.0
4,P05060,28.0


In [49]:
print("Number of unique proteins with at least 1 peptide:", len(Stoop2010[Stoop2010["#Peptides_Stoop2010"] > 0]))
print("Number of unique proteins with at least 2 peptides:", len(Stoop2010[Stoop2010["#Peptides_Stoop2010"] > 1]))

Number of unique proteins with at least 1 peptide: 178
Number of unique proteins with at least 2 peptides: 178


# Create CSF data set

## All CSF studies

In [50]:
all_csf = Macron2018A.merge(Macron2020, how="outer", on="Uniprot")
datasets = [Zhang2015, 
            Guldbrandsen2014_all, 
            Macron2018B, 
            Schutzer2010, 
            NunezGalindo2015, 
            Zougman2007, 
            Schilde2018, 
            Guo2015_all,
            Stoop2010]

for dataset in datasets:
    all_csf = all_csf.merge(dataset, how="outer", on="Uniprot")

all_csf

Unnamed: 0,Uniprot,#Peptides_Macron2018A,#Peptides_Macron2020,#Peptides_Zhang2015,#Peptides_Guldbrandsen2014,#Peptides_Macron2018B,#Peptides_Schutzer2010,#Peptides_NunezGalindo2015,#Peptides_Zougman2007,#Peptides_Schilde2018,#Peptides_Guo2015,#Peptides_Stoop2010
0,Q6K0P9,2.0,,,,,,,,,,
1,Q9GZZ8,1.0,3.0,,,,,,,,,
2,P09529,3.0,3.0,4.0,4.0,1.0,,,,,,
3,P61019,2.0,3.0,,,2.0,,,,,,
4,Q9GZX9,4.0,4.0,3.0,3.0,4.0,5.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
5398,Q15195,,,,,,,,,,,2.0
5399,Q08EQ4,,,,,,,,,,,2.0
5400,P62988,,,,,,,,,,,2.0
5401,A4D1P6,,,,,,,,,,,2.0


In [51]:
print("All Uniprot IDs are unique:", all_csf["Uniprot"].is_unique)

All Uniprot IDs are unique: True


## CSF studies with 1000+ proteins

In [52]:
# keep subset of studies with 1000+ proteins identified (7 studies)
csf = all_csf.loc[:, ["Uniprot", "#Peptides_Macron2018A", "#Peptides_Macron2020", "#Peptides_Zhang2015",
                   "#Peptides_Guldbrandsen2014", "#Peptides_Macron2018B", "#Peptides_Schutzer2010"]]
csf.dropna(how="all", subset=["#Peptides_Macron2018A", "#Peptides_Macron2020", "#Peptides_Zhang2015",
                   "#Peptides_Guldbrandsen2014", "#Peptides_Macron2018B", "#Peptides_Schutzer2010"], 
                    inplace=True)

csf

Unnamed: 0,Uniprot,#Peptides_Macron2018A,#Peptides_Macron2020,#Peptides_Zhang2015,#Peptides_Guldbrandsen2014,#Peptides_Macron2018B,#Peptides_Schutzer2010
0,Q6K0P9,2.0,,,,,
1,Q9GZZ8,1.0,3.0,,,,
2,P09529,3.0,3.0,4.0,4.0,1.0,
3,P61019,2.0,3.0,,,2.0,
4,Q9GZX9,4.0,4.0,3.0,3.0,4.0,5.0
...,...,...,...,...,...,...,...
5339,Q9HC84,,,,,,1.0
5340,A8MPX8,,,,,,1.0
5341,Q6ZRF7,,,,,,1.0
5342,Q8IYA2,,,,,,1.0


## Count studies per protein

In [53]:
all_csf["#Studies"] = all_csf[["#Peptides_Macron2018A", "#Peptides_Macron2020", "#Peptides_Zhang2015",
    "#Peptides_Guldbrandsen2014", "#Peptides_Macron2018B", "#Peptides_Schutzer2010", "#Peptides_NunezGalindo2015", 
    "#Peptides_Zougman2007", "#Peptides_Schilde2018", "#Peptides_Guo2015", "#Peptides_Stoop2010"]].count(axis=1)

In [54]:
csf["#Studies"] = csf[["#Peptides_Macron2018A", "#Peptides_Macron2020", "#Peptides_Zhang2015",
    "#Peptides_Guldbrandsen2014", "#Peptides_Macron2018B", "#Peptides_Schutzer2010"]].count(axis=1)

# Save final data sets

In [55]:
all_csf.to_csv(os.getcwd() + "/Datasets/CSF/all_csf.csv", index=False)

with open(os.getcwd() + "/Datasets/CSF/all_csf_Uniprot.txt", "w") as f:
    for item in all_csf["Uniprot"]:
        f.write("%s\n" % item)

In [56]:
csf.to_csv(os.getcwd() + "/Datasets/CSF/csf.csv", index=False)

with open(os.getcwd() + "/Datasets/CSF/csf_Uniprot.txt", "w") as f:
    for item in csf["Uniprot"]:
        f.write("%s\n" % item)