# Import libraries and data

In [1]:
# import libraries 
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import tabula 

from PyPDF2 import PdfFileWriter, PdfFileReader

pd.options.mode.chained_assignment = None

In [2]:
# folder holding the raw CSF study files, resolved against the current working directory
data_path = "".join((os.getcwd(), "/Datasets/CSF/raw/"))

## Guldbrandsen et al. (2014)

In-depth characterization of the cerebrospinal fluid (CSF) proteome displayed through the CSF proteome resource (CSF-PR)
https://doi.org/10.1074/mcp.m114.038554

In [3]:
# All four fraction files are parsed with identical settings: column names on the
# second line of the file, first column used as index, unparsable lines skipped.
# some lines are dropped because they raise an error
# some proteins have 0 associated peptides
Guldbrandsen2014a, Guldbrandsen2014b, Guldbrandsen2014c, Guldbrandsen2014d = (
    pd.read_csv(data_path + file_name, sep=",", header=1, index_col=0, on_bad_lines="skip")
    for file_name in (
        "Guldbrandsen2014_MM_depleted.csv",   # mixed mode depleted fraction
        "Guldbrandsen2014_MM_bound.csv",      # mixed mode bound fraction
        "Guldbrandsen2014_Gel_depleted.csv",  # gel depleted fraction
        "Guldbrandsen2014_Gel_bound.csv",     # gel bound fraction
    )
)

In [4]:
# drop unnecessary columns and convert peptide count to integer, remove proteins without associated peptides
def curate_Guldbrandsen(dataset):
    """Reduce one Guldbrandsen fraction table to (Uniprot, #Peptides).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing at least the columns "Accession" and "#Peptides".
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with columns "Uniprot" and "#Peptides" (integer), limited to
        rows with a peptide count above zero and one row per Uniprot ID.
        The original row index labels are kept.

    Raises
    ------
    KeyError
        If one of the two required columns is missing.
    ValueError
        If a peptide count cannot be converted to an integer (e.g. NaN).
    """
    # rename() hands back an independent frame, so nothing below writes into the
    # caller's data or depends on pandas' chained-assignment warning being silenced
    curated = dataset[["Accession", "#Peptides"]].rename(columns={"Accession": "Uniprot"})
    curated = curated.astype({"#Peptides": int})
    # remove proteins with no associated peptides
    curated = curated[curated["#Peptides"] > 0]
    # remove duplicated Uniprots; the first occurrence is the one kept, which is
    # not necessarily the row with the highest peptide count
    return curated.drop_duplicates(subset=["Uniprot"])

# curate the four fractions in place of their raw tables (same order: a, b, c, d)
Guldbrandsen2014a, Guldbrandsen2014b, Guldbrandsen2014c, Guldbrandsen2014d = map(
    curate_Guldbrandsen,
    (Guldbrandsen2014a, Guldbrandsen2014b, Guldbrandsen2014c, Guldbrandsen2014d),
)

In [5]:
# per fraction: proteins with a peptide count above 0 and above 1
print("Mixed mode depleted fraction")
print("Number of proteins in data set with at least 1 peptide:", int((Guldbrandsen2014a["#Peptides"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Guldbrandsen2014a["#Peptides"] > 1).sum()))
print("Mixed mode bound fraction")
print("Number of proteins in data set with at least 1 peptide:", int((Guldbrandsen2014b["#Peptides"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Guldbrandsen2014b["#Peptides"] > 1).sum()))
print("Gel depleted fraction")
print("Number of proteins in data set with at least 1 peptide:", int((Guldbrandsen2014c["#Peptides"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Guldbrandsen2014c["#Peptides"] > 1).sum()))
print("Gel bound fraction")
print("Number of proteins in data set with at least 1 peptide:", int((Guldbrandsen2014d["#Peptides"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Guldbrandsen2014d["#Peptides"] > 1).sum()))
# number of proteins with at least two peptides reported in HBFP = 2637

Mixed mode depleted fraction
Number of proteins in data set with at least 1 peptide: 2101
Number of proteins in data set with at least 2 peptides: 1654
Mixed mode bound fraction
Number of proteins in data set with at least 1 peptide: 110
Number of proteins in data set with at least 2 peptides: 108
Gel depleted fraction
Number of proteins in data set with at least 1 peptide: 1482
Number of proteins in data set with at least 2 peptides: 1256
Gel bound fraction
Number of proteins in data set with at least 1 peptide: 379
Number of proteins in data set with at least 2 peptides: 274


In [6]:
# Outer-join the peptide counts of all 4 datasets on Uniprot.
# Resulting count columns: "_a"/"_b" from the first join; the third table joins
# without a name clash, so pandas' default "_x"/"_y" suffixes appear in the last join.
Guldbrandsen2014_all = (
    Guldbrandsen2014a
    .merge(Guldbrandsen2014b, how="outer", on="Uniprot", suffixes=("_a", "_b"))
    .merge(Guldbrandsen2014c, how="outer", on="Uniprot")
    .merge(Guldbrandsen2014d, how="outer", on="Uniprot")
)
Guldbrandsen2014_all

Unnamed: 0,Uniprot,#Peptides_a,#Peptides_b,#Peptides_x,#Peptides_y
0,P0C0L5,200.0,,243.0,79.0
1,P0C0L4,199.0,69.0,242.0,79.0
2,P00450,174.0,36.0,128.0,19.0
3,P02751,159.0,19.0,164.0,70.0
4,P02790,138.0,28.0,112.0,21.0
...,...,...,...,...,...
2277,B9A064,,,,20.0
2278,Q8NE71,,,,1.0
2279,Q86VX9,,,,1.0
2280,P83593,,,,1.0


In [7]:
# collapse the four per-fraction counts into their row-wise maximum and discard the originals
Guldbrandsen2014_all = Guldbrandsen2014_all.assign(
    **{
        "#Peptides_Guldbrandsen2014": Guldbrandsen2014_all[
            ["#Peptides_a", "#Peptides_b", "#Peptides_x", "#Peptides_y"]
        ].max(axis=1)
    }
).drop(columns=["#Peptides_a", "#Peptides_b", "#Peptides_x", "#Peptides_y"])
Guldbrandsen2014_all

Unnamed: 0,Uniprot,#Peptides_Guldbrandsen2014
0,P0C0L5,243.0
1,P0C0L4,242.0
2,P00450,174.0
3,P02751,164.0
4,P02790,138.0
...,...,...
2277,B9A064,20.0
2278,Q8NE71,1.0
2279,Q86VX9,1.0
2280,P83593,1.0


In [8]:
# proteins with a merged peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", 
      int((Guldbrandsen2014_all["#Peptides_Guldbrandsen2014"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", 
      int((Guldbrandsen2014_all["#Peptides_Guldbrandsen2014"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 2282
Number of proteins in data set with at least 2 peptides: 1810


## Guo et al. (2015)
A Proteomic Analysis of Individual and Gender Variations in Normal Human Urine and Cerebrospinal Fluid Using iTRAQ Quantification
https://doi.org/10.1371/journal.pone.0133270

In [9]:
# peptide data sets (semicolon-separated), read in the order male, female
Guo2015_male, Guo2015_female = (
    pd.read_csv(data_path + file_name, sep=";")
    for file_name in ("Guo2015_D.csv", "Guo2015_E.csv")
)

In [10]:
def curate_Guo(dataset):
    """Turn a peptide-level Guo table into one row per protein with its peptide count.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Peptide-level table containing at least the columns "Accession Numbers"
        and "Sequence". It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with columns "Uniprot" and "#Peptides", one row per accession
        (the index label of its first peptide row is kept). Rows whose accession
        field contains a comma, i.e. lists more than one ID, are removed, as are
        rows with a missing accession.

    Raises
    ------
    KeyError
        If one of the two required columns is missing.
    """
    # rename() returns an independent frame, so the caller's table stays untouched
    peptides = dataset[["Accession Numbers", "Sequence"]].rename(columns={"Accession Numbers": "Uniprot"})
    # count number of peptides per Uniprot ID
    # NOTE(review): this counts rows, not distinct values of "Sequence" - a sequence
    # listed twice for one accession is counted twice; confirm that this is intended
    rows_per_accession = peptides.groupby("Uniprot")["Uniprot"].transform("count")
    proteins = (
        peptides
        .assign(**{"#Peptides": rows_per_accession})
        .drop(columns="Sequence")
        .drop_duplicates(subset=["Uniprot"])
    )
    # drop entries with more than one associated Uniprot ID; the comma is matched
    # literally and a missing accession is treated as "drop"
    has_single_accession = ~proteins["Uniprot"].str.contains(",", regex=False, na=True)
    return proteins[has_single_accession]

# replace both raw peptide tables by their curated per-protein versions
Guo2015_male, Guo2015_female = [curate_Guo(peptide_table) for peptide_table in (Guo2015_male, Guo2015_female)]

In [11]:
# The two cohorts are reported separately, so each line names the table it refers to
# (previously both lines were indistinguishable and the second read "1 peptides").
print("Male data set - number of proteins with at least 1 peptide:", len(Guo2015_male[Guo2015_male["#Peptides"] > 0]))
print("Female data set - number of proteins with at least 1 peptide:", len(Guo2015_female[Guo2015_female["#Peptides"] > 0]))

Number of proteins in data set with at least 1 peptide: 441
Number of proteins in data set with at least 1 peptides: 429


In [12]:
# outer-join both cohorts on Uniprot and keep the larger of the two peptide counts
Guo2015_all = pd.merge(Guo2015_male, Guo2015_female, how="outer", on="Uniprot")
Guo2015_all = Guo2015_all.assign(
    **{"#Peptides_Guo2015": Guo2015_all[["#Peptides_x", "#Peptides_y"]].max(axis=1)}
).drop(columns=["#Peptides_x", "#Peptides_y"])
Guo2015_all

Unnamed: 0,Uniprot,#Peptides_Guo2015
0,P02768,194.0
1,P02787,158.0
2,P01024,254.0
3,P41222,32.0
4,P01009,76.0
...,...,...
500,P53634,4.0
501,Q14767,4.0
502,Q14697,4.0
503,P01617,4.0


In [13]:
# proteins with a merged peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Guo2015_all["#Peptides_Guo2015"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Guo2015_all["#Peptides_Guo2015"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 505
Number of proteins in data set with at least 2 peptides: 497


## Macron et al. (2018) A
Deep Dive on the Proteome of Human Cerebrospinal Fluid: A Valuable Data Resource for Biomarker Discovery and Missing Protein Identification
https://doi.org/10.1021/acs.jproteome.8b00300

In [14]:
# semicolon-separated table; header=1 takes the column names from the second line of the file
Macron2018A = pd.read_csv(
    data_path + "Macron2018A.csv",
    sep=";",
    header=1,
)
Macron2018A

Unnamed: 0,Protein Name,Protein Accession Number,Protein Molecular Weight (Da),Peptide Sequence,Previous Amino Acid,Next Amino Acid
0,Pyrin and HIN domain-containing protein 1 OS=H...,Q6K0P9,"55.067,50",GLEVINDYHFR,K,I
1,Pyrin and HIN domain-containing protein 1 OS=H...,Q6K0P9,"55.067,50",PKDIIRRAKKIPK,V,I
2,Extracellular glycoprotein lacritin OS=Homo sa...,Q9GZZ8,"14.246,40",SILLTEQALAK,K,A
3,Inhibin beta B chain OS=Homo sapiens GN=INHBB ...,P09529,"45.121,60",VDGDFLEAVKR,R,H
4,Inhibin beta B chain OS=Homo sapiens GN=INHBB ...,P09529,"45.121,60",VSEIISFAETDGLASSR,R,V
...,...,...,...,...,...,...
21552,Protein shisa-6 homolog OS=Homo sapiens GN=SHI...,Q6ZSJ9,"55.764,70",TLSAGGAAVGGR,R,R
21553,Protein shisa-6 homolog OS=Homo sapiens GN=SHI...,Q6ZSJ9,"55.764,70",VVSPGPENK,K,Y
21554,Glutamate receptor 3 OS=Homo sapiens GN=GRIA3 ...,P42263,"101.159,00",GFSILQAIMEAAVQNNWQVTAR,R,S
21555,Glutamate receptor 3 OS=Homo sapiens GN=GRIA3 ...,P42263,"101.159,00",NTVQEHSAFR,R,F


In [15]:
# keep only the accession column, under the shared key name "Uniprot"
Macron2018A = Macron2018A[["Protein Accession Number"]].rename(columns={"Protein Accession Number": "Uniprot"})
# count number of peptides (rows) per Uniprot ID, then keep the first row of every ID
Macron2018A = Macron2018A.assign(
    **{"#Peptides_Macron2018A": Macron2018A.groupby("Uniprot")["Uniprot"].transform("count")}
).drop_duplicates(subset=["Uniprot"])
# drop entries with more than one associated Uniprot ID (comma-separated lists)
Macron2018A = Macron2018A[Macron2018A["Uniprot"].str.contains(",").eq(False)]
Macron2018A

Unnamed: 0,Uniprot,#Peptides_Macron2018A
0,Q6K0P9,2
2,Q9GZZ8,1
3,P09529,3
6,P61019,2
8,Q9GZX9,4
...,...,...
21539,P14207,7
21546,P02818,2
21548,Q9BUJ0,3
21551,Q6ZSJ9,3


In [16]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Macron2018A["#Peptides_Macron2018A"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Macron2018A["#Peptides_Macron2018A"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 3349
Number of proteins in data set with at least 2 peptides: 2291


## Pan et al. (2007)
A combined dataset of human cerebrospinal fluid proteins identified by multi-dimensional chromatography and tandem mass spectrometry 
https://doi.org/10.1002/pmic.200600756

In [17]:
# the file has no header line; every line is loaded into a single text column named "IPI"
Pan2007 = pd.read_csv(
    data_path + "Pan2007.csv",
    sep=";",
    header=None,
    names=["IPI"],
)
Pan2007

Unnamed: 0,IPI
0,IPI00000024 Splice Isoform 1 Of Protocadherin ...
1,IPI00000024 Splice Isoform 1 Of Protocadherin ...
2,IPI00000024 Splice Isoform 1 Of Protocadherin ...
3,IPI00000024 Splice Isoform 1 Of Protocadherin ...
4,IPI00000130 Somatostatin precursor NFFWK 1 102...
...,...
15536,IPI00479911 APG7L protein TLMGWGVR 1 1063.58 0.00
15537,IPI00479977 59 kDa protein ELANWIR 1 1045.58 -...
15538,IPI00479983 Hypothetical protein FLJ46675 QELL...
15539,IPI00480036 Hypothetical protein FLJ32842 DPQS...


In [18]:
# reduce every line to its first space-separated token (the IPI ID for regular rows);
# the vectorised .str accessor turns a missing line into NaN instead of raising
Pan2007["IPI"] = Pan2007["IPI"].str.split(" ").str[0]
# remove rows that do not contain IPI IDs (a missing value counts as "no IPI ID");
# .copy() makes the filtered table independent so the column added below is not
# written into a slice of the raw table
Pan2007 = Pan2007[Pan2007["IPI"].str.contains("IPI", na=False)].copy()
# count number of peptide rows per IPI ID (mapping to Uniprot happens in a later step)
Pan2007["#Peptides_Pan2007"] = Pan2007.groupby("IPI")["IPI"].transform("count")
Pan2007 = Pan2007.drop_duplicates(subset=["IPI"])
Pan2007

Unnamed: 0,IPI,#Peptides_Pan2007
0,IPI00000024,4
4,IPI00000130,5
9,IPI00000137,5
14,IPI00000138,2
16,IPI00000459,3
...,...,...
15536,IPI00479911,1
15537,IPI00479977,1
15538,IPI00479983,1
15539,IPI00480036,1


In [19]:
# save IPI IDs to text file (one ID per line) for IPI to Uniprot mapping 
with open(data_path + "Pan2007_IPI.txt", "w") as f:
    f.writelines("%s\n" % item for item in Pan2007["IPI"])

Conversion tool: https://biodbnet-abcc.ncifcrf.gov/db/db2db.php

In [20]:
# mapping table from the conversion tool; its own header line is replaced by the names below
Pan2007_mapping = pd.read_csv(
    data_path + "Pan2007_IPI_to_Uniprot.csv",
    sep=";",
    header=0,
    names=["IPI", "Uniprot"],
)
# drop rows without successful mapping (also drops rows with a missing Uniprot value)
# NOTE(review): this removes every Uniprot value containing a hyphen, not only a bare "-";
# confirm no hyphenated accessions are meant to be kept
Pan2007_mapping = Pan2007_mapping[~Pan2007_mapping["Uniprot"].str.contains("-", na=True)]
Pan2007_mapping

Unnamed: 0,IPI,Uniprot
0,IPI00000024,Q08174
1,IPI00000130,P61278
2,IPI00000137,Q9UJJ9
3,IPI00000138,P26572
4,IPI00000459,O14668
...,...,...
2582,IPI00479483,Q9NQB0
2586,IPI00479643,Q9C0H9
2587,IPI00479786,Q92945
2589,IPI00479911,O95352


In [21]:
# attach Uniprot IDs to the counted IPI entries, then keep one row per Uniprot ID
# (the first one when several IPI IDs map to the same accession)
Pan2007 = (
    Pan2007_mapping
    .merge(Pan2007, how="inner", on="IPI")
    .drop(columns="IPI")
    .drop_duplicates(subset=["Uniprot"])
)
Pan2007

Unnamed: 0,Uniprot,#Peptides_Pan2007
0,Q08174,4
1,P61278,5
2,Q9UJJ9,5
3,P26572,2
4,O14668,3
...,...,...
1695,O94911,1
1697,Q9C0H9,1
1698,Q92945,1
1699,O95352,1


In [22]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Pan2007["#Peptides_Pan2007"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Pan2007["#Peptides_Pan2007"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 1436
Number of proteins in data set with at least 2 peptides: 817


## Schutzer et al. (2010)
Establishing the Proteome of Normal Human Cerebrospinal Fluid
https://doi.org/10.1371/journal.pone.0010980

In [23]:
# the file has no header line; every line is loaded into a single text column named "IPI"
Schutzer2010 = pd.read_csv(
    data_path + "Schutzer2010.csv",
    sep=";",
    header=None,
    names=["IPI"],
)
Schutzer2010

Unnamed: 0,IPI
0,IPI00000027
1,Pituitary adenylate cyclase-activating polypep...
2,precursor R.FPGIRPEEEAYGEDGNPLPDFDGSEPPGAGSPAS...
3,IPI00000044 Platelet-derived growth factor B c...
4,IPI00000044 Platelet-derived growth factor B c...
...,...
33512,IPI00884092 Anti-HER3 scFv (Fragment) S.DIQM*T...
33513,IPI00884353 Ets-1 transcript variant ets-1 del...
33514,IPI00884389 Similar to Immunglobulin heavy cha...
33515,IPI00884389 Similar to Immunglobulin heavy cha...


In [24]:
# keep only the first space-separated token of every line (the IPI ID for regular rows)
Schutzer2010["IPI"] = [line.split(" ")[0] for line in Schutzer2010["IPI"]]
# remove rows that do not contain IPI IDs
Schutzer2010 = Schutzer2010.loc[Schutzer2010["IPI"].str.contains("IPI")]
# count number of peptide rows per IPI ID and keep the first row of every ID
Schutzer2010 = Schutzer2010.assign(
    **{"#Peptides_Schutzer2010": Schutzer2010.groupby("IPI")["IPI"].transform("count")}
).drop_duplicates(subset=["IPI"])
Schutzer2010

Unnamed: 0,IPI,#Peptides_Schutzer2010
0,IPI00000027,1
3,IPI00000044,5
8,IPI00000070,1
9,IPI00000076,1
10,IPI00000087,2
...,...,...
33504,IPI00884004,1
33505,IPI00884080,5
33510,IPI00884092,3
33513,IPI00884353,1


In [25]:
# save IPI IDs to text file (one ID per line) for IPI to Uniprot mapping 
with open(data_path + "Schutzer2010_IPI.txt", "w") as f:
    f.writelines("%s\n" % item for item in Schutzer2010["IPI"])

Conversion tool: https://biodbnet-abcc.ncifcrf.gov/db/db2db.php

In [26]:
# mapping table from the conversion tool; its own header line is replaced by the names below
Schutzer2010_mapping = pd.read_csv(
    data_path + "Schutzer2010_IPI_to_Uniprot.csv",
    sep=";",
    header=0,
    names=["IPI", "Uniprot"],
)
# drop rows without successful mapping (also drops rows with a missing Uniprot value)
# NOTE(review): this removes every Uniprot value containing a hyphen, not only a bare "-";
# confirm no hyphenated accessions are meant to be kept
Schutzer2010_mapping = Schutzer2010_mapping[Schutzer2010_mapping["Uniprot"].str.contains("-").eq(False)]
Schutzer2010_mapping

Unnamed: 0,IPI,Uniprot
1,IPI00000044,P01127
2,IPI00000070,P01130
3,IPI00000076,P01138
4,IPI00000087,O60939
5,IPI00000104,O60942
...,...,...
2608,IPI00878576,Q13822
2615,IPI00879665,Q9BYH1
2618,IPI00880120,Q9BUJ0
2620,IPI00883753,Q92823


In [27]:
# attach Uniprot IDs to the counted IPI entries, then keep one row per Uniprot ID
# (the first one when several IPI IDs map to the same accession)
Schutzer2010 = pd.merge(Schutzer2010_mapping, Schutzer2010, how="inner", on="IPI")
Schutzer2010 = Schutzer2010.drop(columns="IPI").drop_duplicates(subset=["Uniprot"])
Schutzer2010

Unnamed: 0,Uniprot,#Peptides_Schutzer2010
0,P01127,5
1,P01130,1
2,P01138,1
3,O60939,2
4,O60942,1
...,...,...
2097,P55327,2
2098,Q6ZRF7,1
2099,Q8IYA2,1
2102,Q9BUJ0,5


In [28]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Schutzer2010["#Peptides_Schutzer2010"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Schutzer2010["#Peptides_Schutzer2010"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 2067
Number of proteins in data set with at least 2 peptides: 1217


## Macron et al. (2018) B
Identification of Missing Proteins in Normal Human Cerebrospinal Fluid
https://doi.org/10.1021/acs.jproteome.8b00194

In [29]:
# semicolon-separated table; header=1 takes the column names from the second line of the file
Macron2018B = pd.read_csv(
    data_path + "Macron2018B.csv",
    sep=";",
    header=1,
)
Macron2018B

Unnamed: 0,Protein name,Protein accession number,Protein molecular weight (Da),Peptide sequence,Previous amino acid,Next amino acid,Mascot ion score,X! Tandem Hyper Score
0,Inhibin beta B chain OS=Homo sapiens GN=INHBB ...,P09529,"45.121,60",VYFQEQGHGDR,K,W,442,359
1,L-amino-acid oxidase OS=Homo sapiens GN=IL4I1 ...,Q96RQ9,"62.881,30",VTILEADNR,K,I,482,296
2,Ras-related protein Rab-2A OS=Homo sapiens GN=...,P61019,"23.546,20",KEEGEAFAR,K,E,704,345
3,Ras-related protein Rab-2A OS=Homo sapiens GN=...,P61019,"23.546,20",TASNVEEAFINTAK,K,E,134,308
4,Twisted gastrulation protein homolog 1 OS=Homo...,Q9GZX9,"25.015,90",ALCASDVSK,K,C,396,524
...,...,...,...,...,...,...,...,...
12835,,,,,,,,
12836,,,,,,,,
12837,,,,,,,,
12838,,,,,,,,


In [30]:
# keep only the accession column, under the shared key name "Uniprot"
Macron2018B = Macron2018B[["Protein accession number"]].rename(columns={"Protein accession number": "Uniprot"})
# count number of peptides (rows) per Uniprot ID, keep the first row of every ID,
# then discard rows left without an accession or count
Macron2018B = (
    Macron2018B
    .assign(**{"#Peptides_Macron2018B": Macron2018B.groupby("Uniprot")["Uniprot"].transform("count")})
    .drop_duplicates(subset=["Uniprot"])
    .dropna()
)
# drop entries with more than one associated Uniprot ID (comma-separated lists)
Macron2018B = Macron2018B[Macron2018B["Uniprot"].str.contains(",").eq(False)]
Macron2018B

Unnamed: 0,Uniprot,#Peptides_Macron2018B
0,P09529,1.0
1,Q96RQ9,1.0
2,P61019,2.0
4,Q9GZX9,4.0
8,Q9GZX3,1.0
...,...,...
12806,P02818,4.0
12810,Q9BUJ0,3.0
12813,Q96RT1,1.0
12814,Q6ZSJ9,3.0


In [31]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Macron2018B["#Peptides_Macron2018B"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Macron2018B["#Peptides_Macron2018B"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 2245
Number of proteins in data set with at least 2 peptides: 1462


## Macron et al. (2020)
Exploration of human cerebrospinal fluid: A large proteome dataset revealed by trapped ion mobility time-of-flight mass spectrometry 
https://doi.org/10.1016/j.dib.2020.105704

In [32]:
# column names are on the second line of the file; low_memory=False makes pandas
# parse the file in one pass instead of in chunks
Macron2020 = pd.read_csv(
    data_path + "Macron2020.csv",
    sep=";",
    header=1,
    low_memory=False,
)

In [33]:
# keep only the accession column, under the shared key name "Uniprot"
Macron2020 = Macron2020[["Protein Accession Number"]].rename(columns={"Protein Accession Number": "Uniprot"})
# count number of peptides (rows) per Uniprot ID
peptides_per_accession = Macron2020.groupby("Uniprot")["Uniprot"].transform("count")
Macron2020 = Macron2020.assign(**{"#Peptides_Macron2020": peptides_per_accession})
# one row per accession; rows left without an accession or count are discarded
Macron2020 = Macron2020.drop_duplicates(subset=["Uniprot"]).dropna()
# drop entries with more than one associated Uniprot ID (comma-separated lists)
Macron2020 = Macron2020.loc[~Macron2020["Uniprot"].str.contains(",", na=True)]
Macron2020

Unnamed: 0,Uniprot,#Peptides_Macron2020
0,P61604,7.0
7,Q9HC56,20.0
27,Q13822,52.0
79,Q9HC38,8.0
87,Q9HC36,2.0
...,...,...
27216,Q9NUQ2,2.0
27218,P68366,8.0
27226,P61626,18.0
27244,Q13873,3.0


In [34]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Macron2020["#Peptides_Macron2020"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Macron2020["#Peptides_Macron2020"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 3174
Number of proteins in data set with at least 2 peptides: 3174


## Nunez-Galindo et al. (2015)
Proteomics of Cerebrospinal Fluid: Throughput and Robustness Using a Scalable Automated Analysis Pipeline for Biomarker Discovery 
https://doi.org/10.1021/acs.analchem.5b02748

In [35]:
# header=143: the column names are taken from row 143 (0-based) of the file and
# everything above that row is ignored
NunezGalindo2015 = pd.read_csv(
    data_path + "NunezGalindo2015.csv",
    sep=";",
    header=143,
    low_memory=False,
)

In [36]:
# Reduce the table to one row per protein entry name with its exclusive unique peptide count.
NunezGalindo2015 = (
    NunezGalindo2015[["Protein accession numbers", "Exclusive unique peptide count"]]
    .rename(columns={
        "Protein accession numbers": "Name",
        "Exclusive unique peptide count": "#Peptides_NunezGalindo2015",
    })
    # Incomplete rows are removed BEFORE de-duplicating: de-duplication keeps the
    # first row per name, so doing it first would lose a protein entirely whenever
    # its first row lacks a count even though a later row has one.
    .dropna()
    .drop_duplicates(subset=["Name"])
)
NunezGalindo2015

Unnamed: 0,Name,#Peptides_NunezGalindo2015
0,TIMP2_HUMAN,14.0
14,ANGT_HUMAN,10.0
24,PGAM1_HUMAN,1.0
25,SULF2_HUMAN,1.0
26,ROA1_HUMAN,2.0
...,...,...
88562,RN213_HUMAN,2.0
90305,OX2G_HUMAN,1.0
95458,CD109_HUMAN,2.0
118695,GCYB1_HUMAN,2.0


In [37]:
# save the protein entry names (e.g. TIMP2_HUMAN) to a text file, one per line,
# for entry name to Uniprot accession mapping
with open(data_path + "NunezGalindo2015_Uniprot_name.txt", "w") as f:
    f.writelines("%s\n" % item for item in NunezGalindo2015["Name"])

In [38]:
# tab-separated mapping table; its own header line is replaced by the names below
NunezGalindo2015_mapping = pd.read_csv(
    data_path + "NunezGalindo2015_Name_to_Uniprot.tab",
    sep="\t",
    header=0,
    names=["Name", "Uniprot"],
)
NunezGalindo2015_mapping

Unnamed: 0,Name,Uniprot
0,TIMP2_HUMAN,P16035
1,ANGT_HUMAN,P01019
2,PGAM1_HUMAN,P18669
3,SULF2_HUMAN,Q8IWU5
4,ROA1_HUMAN,P09651
...,...,...
754,RN213_HUMAN,Q63HN8
755,OX2G_HUMAN,P41217
756,CD109_HUMAN,Q6YHK3
757,GCYB1_HUMAN,Q02153


In [39]:
# attach Uniprot accessions to the protein list and discard the entry names
NunezGalindo2015 = (
    NunezGalindo2015_mapping
    .merge(NunezGalindo2015, how="inner", on="Name")
    .drop(columns="Name")
)
NunezGalindo2015

Unnamed: 0,Uniprot,#Peptides_NunezGalindo2015
0,P16035,14.0
1,P01019,10.0
2,P18669,1.0
3,Q8IWU5,1.0
4,P09651,2.0
...,...,...
737,Q63HN8,2.0
738,P41217,1.0
739,Q6YHK3,2.0
740,Q02153,2.0


In [40]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", 
      int((NunezGalindo2015["#Peptides_NunezGalindo2015"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", 
      int((NunezGalindo2015["#Peptides_NunezGalindo2015"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 742
Number of proteins in data set with at least 2 peptides: 526


## Zhang et al. (2015)
(Data for) a comprehensive map and functional annotation of the human cerebrospinal fluid proteome
https://doi.org/10.1016/j.dib.2015.02.004

In [41]:
# contains only proteins with at least two peptides identified
Zhang2015 = pd.read_csv(
    data_path + "Zhang2015.csv",
    sep=";",
)
Zhang2015

Unnamed: 0,Accession,Protein Name,Molecular Weight,Protein Grouping Ambiguity,Flow-through Proteins,Original Proteins,Bound Proteins,Commonly identifiable proteins in normal CSF
0,A1L4H1,Soluble scavenger receptor cysteine-rich domai...,166 kDa,,17.0,14.0,0.0,
1,A2RU67,Uncharacterized protein KIAA1467 OS=Homo sapie...,67 kDa,,8.0,2.0,,Yes
2,A4D0S4,Laminin subunit beta-4 OS=Homo sapiens GN=LAMB...,194 kDa,True,2.0,0.0,1.0,
3,A4D0V7,Cadherin-like and PC-esterase domain-containin...,117 kDa,,6.0,1.0,,
4,A6BM72,Multiple epidermal growth factor-like domains ...,111 kDa,,2.0,2.0,,
...,...,...,...,...,...,...,...,...
2508,Q9Y6N8,Cadherin-10 OS=Homo sapiens GN=CDH10 PE=1 SV=2,88 kDa,True,11.0,10.0,1.0,Yes
2509,Q9Y6R7,IgGFc-binding protein OS=Homo sapiens GN=FCGBP...,572 kDa,,90.0,74.0,36.0,Yes
2510,Q9Y6U3,Adseverin OS=Homo sapiens GN=SCIN PE=1 SV=4,80 kDa,,2.0,,0.0,
2511,Q9Y6V0,Protein piccolo OS=Homo sapiens GN=PCLO PE=1 SV=4,553 kDa,,2.0,0.0,1.0,


In [42]:
# one row per accession with the maximum number of identified peptides across the
# flow-through, original and bound columns
Zhang2015 = pd.DataFrame({
    "Uniprot": Zhang2015["Accession"],
    "#Peptides_Zhang2015": Zhang2015[
        ["Flow-through Proteins", "Original Proteins", "Bound Proteins"]
    ].max(axis=1),
})
Zhang2015

Unnamed: 0,Uniprot,#Peptides_Zhang2015
0,A1L4H1,17.0
1,A2RU67,8.0
2,A4D0S4,2.0
3,A4D0V7,6.0
4,A6BM72,2.0
...,...,...
2508,Q9Y6N8,11.0
2509,Q9Y6R7,90.0
2510,Q9Y6U3,2.0
2511,Q9Y6V0,2.0


In [43]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Zhang2015["#Peptides_Zhang2015"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Zhang2015["#Peptides_Zhang2015"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 2513
Number of proteins in data set with at least 2 peptides: 2513


## Schilde et al. (2018)
Protein variability in cerebrospinal fluid and its possible implications for neurological protein biomarker research
https://doi.org/10.1371/journal.pone.0206478

In [44]:
# semicolon-separated table with the column names on the first line
Schilde2018 = pd.read_csv(
    data_path + "Schilde2018.csv",
    sep=";",
)
Schilde2018

Unnamed: 0,Protein IDs,Gene name,Protein name,Unique peptides K1.1,Unique peptides K1.2,Unique peptides K1.3,Unique peptides K2.1,Unique peptides K2.2,Unique peptides K2.3,Unique peptides K5.1,...,LFQ intensity K36.3,LFQ intensity K38.1,LFQ intensity K38.2,LFQ intensity K38.3,LFQ intensity K39.1,LFQ intensity K39.2,LFQ intensity K39.3,LFQ intensity K40.1,LFQ intensity K40.2,LFQ intensity 40.3
0,A0A075B6S6,IGKV2D-30,Ig kappa chain V-II region RPMI 6410,1,1,1,2,2,2,2,...,0,0,123800000,0,0,0,0,0,0,0
1,A0A0A0MS15,IGHV3-49,,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,15237000,0,0
2,A0A0B4J1U7,IGHV6-1,,1,1,1,1,2,1,1,...,0,18752000,27609000,30311000,0,0,0,0,0,0
3,A0A0C4DH38,IGHV5-51,,2,2,2,3,3,2,2,...,21838000,24571000,15917000,10677000,14375000,42018000,40429000,39269000,26309000,21864000
4,A0A0C4DH68,IGKV2-24,,1,1,1,2,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,Q9Y646,CPQ,Carboxypeptidase Q,3,4,2,3,5,3,3,...,17758000,15153000,23284000,15860000,6475200,9381400,9038000,15989000,14767000,12986000
606,Q9Y696,CLIC4,Chloride intracellular channel protein 4,0,0,1,1,1,0,1,...,0,0,0,0,0,2587200,1913300,0,0,0
607,Q9Y6N7,ROBO1,Roundabout homolog 1,0,1,1,2,1,3,1,...,4011300,3720900,0,0,0,0,0,0,0,4033900
608,Q9Y6N8,CDH10,Cadherin-10,1,0,0,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# The 36 "Unique peptides K<sample>.<run>" columns (12 sample ids x runs 1-3) are
# generated from the sample ids instead of being spelled out twice, so the selected
# set cannot drift between the column selection and the maximum.
# find maximum number of identified peptides
Schilde2018 = pd.DataFrame({
    "Uniprot": Schilde2018["Protein IDs"],
    "#Peptides_Schilde2018": Schilde2018[
        [
            f"Unique peptides K{sample}.{run}"
            for sample in (1, 2, 5, 8, 13, 14, 26, 33, 36, 38, 39, 40)
            for run in (1, 2, 3)
        ]
    ].max(axis=1),
})

Schilde2018

Unnamed: 0,Uniprot,#Peptides_Schilde2018
0,A0A075B6S6,2.0
1,A0A0A0MS15,2.0
2,A0A0B4J1U7,3.0
3,A0A0C4DH38,3.0
4,A0A0C4DH68,2.0
...,...,...
605,Q9Y646,6.0
606,Q9Y696,2.0
607,Q9Y6N7,4.0
608,Q9Y6N8,2.0


In [46]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Schilde2018["#Peptides_Schilde2018"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Schilde2018["#Peptides_Schilde2018"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 610
Number of proteins in data set with at least 2 peptides: 610


## Stoop et al. (2010)
Quantitative Proteomics and Metabolomics Analysis of Normal Human Cerebrospinal Fluid Samples
https://doi.org/10.1074/mcp.m900877-mcp200

In [47]:
# header=2 takes the column names from the third line of the file
Stoop2010 = pd.read_csv(
    data_path + "Stoop2010.csv",
    sep=";",
    header=2,
) 
Stoop2010

Unnamed: 0,Primary accession number,Reference,number of unique peptides,Average sequence coverage per sample (%),Unnamed: 4,Unnamed: 5
0,P02768,ALBU_HUMAN Serum albumin,63.0,840,,
1,P01024,CO3_HUMAN Complement C3,62.0,387,,
2,P02787,TRFE_HUMAN Serotransferrin,58.0,714,,
3,P0C0L4,CO4A_HUMAN Complement C4-A,38.0,247,,
4,P05060,SCG1_HUMAN Secretogranin-1,28.0,450,,
...,...,...,...,...,...,...
1559,,,,,,
1560,,,,,,
1561,,,,,,
1562,,,,,,


In [48]:
# keep accession and unique peptide count under the shared column names and
# discard rows where either value is missing
Stoop2010 = (
    Stoop2010[["Primary accession number", "number of unique peptides"]]
    .rename(columns={
        "Primary accession number": "Uniprot",
        "number of unique peptides": "#Peptides_Stoop2010",
    })
    .dropna()
)
Stoop2010

Unnamed: 0,Uniprot,#Peptides_Stoop2010
0,P02768,63.0
1,P01024,62.0
2,P02787,58.0
3,P0C0L4,38.0
4,P05060,28.0
...,...,...
173,P62988,2.0
174,P30530,2.0
175,A6NLU5,2.0
176,A4D1P6,2.0


In [49]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Stoop2010["#Peptides_Stoop2010"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Stoop2010["#Peptides_Stoop2010"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 178
Number of proteins in data set with at least 2 peptides: 178


## Zougman et al. (2007)

In [50]:
# header=0 treats the first line of the file as a header line and replaces it by
# the single column name "IPI"
Zougman2007 = pd.read_csv(
    data_path + "Zougman2007.csv",
    sep=";",
    header=0,
    names=["IPI"],
)
Zougman2007

Unnamed: 0,IPI
0,IPI00000024 Isoform 1 of Protocadherin-1 precu...
1,IPI00000024 Isoform 1 of Protocadherin-1 precu...
2,IPI00000137 N-acetylglucosamine-1-phosphotrans...
3,IPI00000137 N-acetylglucosamine-1-phosphotrans...
4,IPI00000137 N-acetylglucosamine-1-phosphotrans...
...,...
6154,IPI00749009 Immunoglobulin-like domain contain...
6155,IPI00749009 Immunoglobulin-like domain contain...
6156,IPI00760824 IGHG1 protein
6157,IPI00760824 IGHG1 protein


In [51]:
# keep only the first space-separated token of every line (the IPI ID for regular rows)
Zougman2007["IPI"] = Zougman2007["IPI"].map(lambda line: line.split(" ")[0])
# remove rows that do not contain IPI IDs
has_ipi_id = Zougman2007["IPI"].str.contains("IPI")
Zougman2007 = Zougman2007[has_ipi_id]
# count number of peptide rows per IPI ID and keep the first row of every ID
Zougman2007 = Zougman2007.assign(
    **{"#Peptides_Zougman2007": Zougman2007.groupby("IPI")["IPI"].transform("count")}
).drop_duplicates(subset=["IPI"])
Zougman2007

Unnamed: 0,IPI,#Peptides_Zougman2007
0,IPI00000024,2
2,IPI00000137,6
8,IPI00000138,4
12,IPI00000190,2
14,IPI00000265,2
...,...,...
6141,IPI00746686,6
6147,IPI00746813,4
6151,IPI00748036,3
6154,IPI00749009,2


In [52]:
# save IPI IDs to text file (one ID per line) for IPI to Uniprot mapping 
with open(data_path + "Zougman2007_IPI.txt", "w") as f:
    f.writelines("%s\n" % item for item in Zougman2007["IPI"])

Conversion tool: https://biodbnet-abcc.ncifcrf.gov/db/db2db.php

In [53]:
# mapping table from the conversion tool; its own header line is replaced by the names below
Zougman2007_mapping = pd.read_csv(
    data_path + "Zougman2007_IPI_to_Uniprot.csv",
    sep=";",
    header=0,
    names=["IPI", "Uniprot"],
)
# drop rows without successful mapping (also drops rows with a missing Uniprot value)
# NOTE(review): this removes every Uniprot value containing a hyphen, not only a bare "-";
# confirm no hyphenated accessions are meant to be kept
Zougman2007_mapping = Zougman2007_mapping.loc[~Zougman2007_mapping["Uniprot"].str.contains("-", na=True)]
Zougman2007_mapping

Unnamed: 0,IPI,Uniprot
0,IPI00000024,Q08174
1,IPI00000137,Q9UJJ9
2,IPI00000138,P26572
3,IPI00000190,P60033
4,IPI00000265,Q5VUB5
...,...,...
775,IPI00654875,P0C0L5
776,IPI00654888,P03952
782,IPI00743517,Q13332
789,IPI00745313,Q8IUX7


In [54]:
# attach Uniprot IDs to the counted IPI entries, then keep one row per Uniprot ID
# (the first one when several IPI IDs map to the same accession)
Zougman2007 = (
    Zougman2007_mapping
    .merge(Zougman2007, how="inner", on="IPI")
    .drop(columns="IPI")
    .drop_duplicates(subset=["Uniprot"])
)
Zougman2007

Unnamed: 0,Uniprot,#Peptides_Zougman2007
0,Q08174,2
1,Q9UJJ9,6
2,P26572,4
3,P60033,2
4,Q5VUB5,2
...,...,...
637,P23284,9
638,P68871,7
639,P0C0L5,3
640,P03952,6


In [55]:
# proteins with a peptide count above 0 and above 1
print("Number of proteins in data set with at least 1 peptide:", int((Zougman2007["#Peptides_Zougman2007"] > 0).sum()))
print("Number of proteins in data set with at least 2 peptides:", int((Zougman2007["#Peptides_Zougman2007"] > 1).sum()))

Number of proteins in data set with at least 1 peptide: 623
Number of proteins in data set with at least 2 peptides: 572


# Create CSF data set

In [56]:
# Outer-join all study tables on Uniprot, one study after the other; the join order
# below fixes the order of the peptide-count columns in the result.
datasets = [
    Zhang2015, Guldbrandsen2014_all, Macron2018B, Schutzer2010, Pan2007,
    NunezGalindo2015, Zougman2007, Schilde2018, Guo2015_all, Stoop2010,
]

all_csf = pd.merge(Macron2018A, Macron2020, how="outer", on="Uniprot")
for dataset in datasets:
    all_csf = pd.merge(all_csf, dataset, how="outer", on="Uniprot")

all_csf

Unnamed: 0,Uniprot,#Peptides_Macron2018A,#Peptides_Macron2020,#Peptides_Zhang2015,#Peptides_Guldbrandsen2014,#Peptides_Macron2018B,#Peptides_Schutzer2010,#Peptides_Pan2007,#Peptides_NunezGalindo2015,#Peptides_Zougman2007,#Peptides_Schilde2018,#Peptides_Guo2015,#Peptides_Stoop2010
0,Q6K0P9,2.0,,,,,,,,,,,
1,Q9GZZ8,1.0,3.0,,,,,,,,,,
2,P09529,3.0,3.0,4.0,4.0,1.0,,,,,,,
3,P61019,2.0,3.0,,,2.0,,,,,,,
4,Q9GZX9,4.0,4.0,3.0,3.0,4.0,5.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5776,Q15195,,,,,,,,,,,,2.0
5777,Q08EQ4,,,,,,,,,,,,2.0
5778,P62988,,,,,,,,,,,,2.0
5779,A4D1P6,,,,,,,,,,,,2.0


In [57]:
# sanity check: no Uniprot ID may occur twice in the merged table
print("All Uniprot IDs are unique:", not all_csf["Uniprot"].duplicated().any())

All Uniprot IDs are unique: True


## CSF studies with 1000+ proteins

In [58]:
# keep subset of studies with 1000+ proteins identified (7 studies)
csf = all_csf.loc[:, ["Uniprot", "#Peptides_Macron2018A", "#Peptides_Macron2020", "#Peptides_Zhang2015",
                   "#Peptides_Guldbrandsen2014", "#Peptides_Macron2018B", "#Peptides_Schutzer2010", "#Peptides_Pan2007"]]
# discard proteins that none of these 7 studies reports (all 7 count columns empty)
csf = csf.dropna(how="all", subset=[column for column in csf.columns if column != "Uniprot"])

## Count studies per protein

In [59]:
# number of studies reporting each protein = number of non-missing peptide-count columns
all_csf["#Studies"] = (
    all_csf[["#Peptides_Macron2018A", "#Peptides_Macron2020", "#Peptides_Zhang2015",
             "#Peptides_Guldbrandsen2014", "#Peptides_Macron2018B", "#Peptides_Schutzer2010", "#Peptides_Pan2007", 
             "#Peptides_NunezGalindo2015", "#Peptides_Zougman2007", "#Peptides_Schilde2018", "#Peptides_Guo2015", 
             "#Peptides_Stoop2010"]]
    .notna()
    .sum(axis=1)
)

In [60]:
# number of the 7 large studies reporting each protein = non-missing peptide-count columns
csf["#Studies"] = (
    csf[["#Peptides_Macron2018A", "#Peptides_Macron2020", "#Peptides_Zhang2015",
         "#Peptides_Guldbrandsen2014", "#Peptides_Macron2018B", "#Peptides_Schutzer2010", "#Peptides_Pan2007"]]
    .notna()
    .sum(axis=1)
)

# Save final data sets

In [63]:
# full table as CSV (without the row index) ...
all_csf.to_csv(os.getcwd() + "/Datasets/CSF/all_csf.csv", index=False)

# ... and the bare list of Uniprot IDs, one per line
with open(os.getcwd() + "/Datasets/CSF/all_csf_Uniprot.txt", "w") as f:
    f.writelines("%s\n" % item for item in all_csf["Uniprot"])

In [64]:
# 7-study subset as CSV (without the row index) ...
csf.to_csv(os.getcwd() + "/Datasets/CSF/csf.csv", index=False)

# ... and the bare list of its Uniprot IDs, one per line
with open(os.getcwd() + "/Datasets/CSF/csf_Uniprot.txt", "w") as f:
    f.writelines("%s\n" % item for item in csf["Uniprot"])