This Python Jupyter notebook downloads the breast cancer gene expression (transcriptomics) dataset, together with the copy-number variation (CNV) and clinical data, from the National Cancer Institute's CPTAC program. The outputs are saved as .csv files, which can then be imported into R for further analysis.

In [1]:
!pip install cptac==1.1.2 > cptac.log
import cptac
import pandas as pd
import numpy as np

In [2]:
# Gene symbols of the PAM50 panel, in the order given by the source list:
# https://www.biostars.org/p/77590/
# Later cells use this list to select columns from the per-gene tables.
PAM50_genes = [
    'UBE2T', 'BIRC5', 'NUF2', 'CDC6', 'CCNB1',
    'TYMS', 'MYBL2', 'CEP55', 'MELK', 'NDC80',
    'RRM2', 'UBE2C', 'CENPF', 'PTTG1', 'EXO1',
    'ORC6', 'ANLN', 'CCNE1', 'CDC20', 'MKI67',
    'KIF2C', 'ACTR3B', 'MYC', 'EGFR', 'KRT5',
    'PHGDH', 'CDH3', 'MIA', 'KRT17', 'FOXC1',
    'SFRP1', 'KRT14', 'ESR1', 'SLC39A6', 'BAG1',
    'MAPT', 'PGR', 'CXXC5', 'MLPH', 'BCL2',
    'MDM2', 'NAT1', 'FOXA1', 'BLVRA', 'MMP11',
    'GPR160', 'FGFR4', 'GRB7', 'TMEM45B', 'ERBB2',
]

In [3]:
# Query the available datasets, download the "Brca" dataset, and build the
# dataset object that the following cells read their tables from.
cptac.list_datasets()
cptac.download(dataset="Brca")
brca = cptac.Brca()
# list_data() prints its own summary and returns None: wrapping it in print()
# appended a stray "None" after the listing, so it is called directly here.
brca.list_data()

Checking that brca index is up-to-date...



Below are the dataframes contained in this dataset and their dimensions:

acetylproteomics
	122 rows
	9868 columns
clinical
	122 rows
	18 columns
CNV
	122 rows
	23692 columns
derived_molecular
	122 rows
	36 columns
phosphoproteomics
	122 rows
	38775 columns
proteomics
	122 rows
	10107 columns
somatic_mutation
	24106 rows
	3 columns
transcriptomics
	122 rows
	23121 columns
None


In [4]:
# Full transcriptomics table: rows are indexed by Patient_ID, columns are genes.
transcriptomics = brca.get_transcriptomics()

# PAM50-only view of the table. The selection used to be computed and then
# discarded; it is now kept under its own name so later cells can use it.
# The lookup also fails with a KeyError if any PAM50 gene is missing.
transcriptomics_pam50 = transcriptomics[PAM50_genes].copy()

# The export still contains every gene, exactly as before, so existing
# consumers of this file are unaffected.
# NOTE(review): if only the PAM50 genes were meant to be exported (as is done
# for the CNV table), write `transcriptomics_pam50` instead - confirm intent.
transcriptomics.to_csv('breast_cancer_transcriptomics.csv', index=True)
transcriptomics

Name,A1BG,A1BG-AS1,A1CF,A2M,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT000814,1.9265,2.4267,,5.3718,3.6437,-4.5301,-5.2608,0.6260,,4.7000,...,3.2268,5.1482,-0.0202,0.9270,5.1699,3.1333,0.9835,5.9499,2.8499,3.6497
CPT001846,3.6578,2.6524,,5.3253,-5.1069,-0.7692,5.6915,3.2060,,4.3115,...,3.2532,3.9191,-1.4924,-0.3262,3.2163,,2.0909,7.0671,2.0780,3.2274
X01BR001,0.9896,2.6363,,5.4734,-1.5052,-2.0918,,1.8816,,3.3434,...,1.9491,2.2365,0.1694,1.3074,2.8806,0.5653,1.5682,4.3852,2.1573,2.1127
X01BR008,0.5535,2.2119,,5.8701,3.2708,-1.9902,-5.0414,0.6957,,4.3095,...,3.1575,5.7822,-1.3608,0.3015,3.2620,1.7578,0.2989,5.4697,3.1783,3.1939
X01BR009,2.8359,3.3449,,6.9307,2.6488,-3.1664,,1.9784,-5.7975,4.6341,...,2.7678,3.5378,-0.5147,0.2798,3.0986,-3.2737,1.3036,5.0219,3.6868,2.8524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,2.1736,2.1405,,6.1540,-5.0777,-1.2765,5.3046,1.6543,-5.9063,5.1404,...,2.1051,4.7150,-0.7072,0.2520,4.9094,,1.2552,5.5590,4.2292,2.2011
X21BR002,,,,9.0054,-7.1573,,,6.2542,,5.1665,...,2.1742,4.1401,-1.2578,0.2582,5.1380,,1.5275,6.1792,3.8373,2.3024
X21BR010,-0.3261,1.0329,,6.9516,,-2.5778,,-2.3352,,3.4699,...,2.4175,2.0950,1.2070,2.3922,3.2352,-3.0783,2.7830,3.9970,1.5065,4.2235
X22BR005,-1.2102,0.6457,,6.7257,-1.3602,-1.7237,,2.3115,,4.3077,...,2.7801,5.3049,0.5553,1.8906,2.4904,0.7488,1.6893,4.1113,1.7474,4.3724


In [5]:
# Copy-number table for the cohort.
CNV = brca.get_CNV()
# The columns arrive as a multi-level index; drop level 1 so that columns can
# be selected by the remaining label alone.
CNV.columns = CNV.columns.droplevel(level=1)

# Restrict to the PAM50 genes (independent copy), save it, and display it.
genomics = CNV.loc[:, PAM50_genes].copy()
genomics.to_csv(path_or_buf='breast_cancer_CNV.csv', index=True)
genomics

Name,UBE2T,BIRC5,NUF2,CDC6,CCNB1,TYMS,MYBL2,CEP55,MELK,NDC80,...,MDM2,NAT1,FOXA1,BLVRA,MMP11,GPR160,FGFR4,GRB7,TMEM45B,ERBB2
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT000814,0.553,0.097,0.553,-0.435,-0.451,-0.542,0.592,0.006,0.123,-0.542,...,0.154,-0.416,-0.433,-0.434,-0.404,0.819,-0.451,-0.435,-0.411,-0.435
CPT001846,-0.151,0.222,0.278,0.222,-0.171,0.241,0.719,0.000,0.022,0.241,...,-0.101,-0.162,0.208,1.169,0.725,0.307,-0.400,0.222,-0.087,0.222
X01BR001,0.866,0.593,0.866,-0.084,-0.322,0.546,0.014,-0.097,-0.057,0.546,...,0.017,-0.309,-0.426,0.397,0.089,-0.015,-0.322,-0.084,0.010,-0.084
X01BR008,0.738,0.256,0.738,-0.056,-0.015,-0.177,0.282,0.167,0.320,-0.177,...,0.000,0.455,0.095,0.173,-0.010,-0.048,0.011,-0.056,-0.059,-0.056
X01BR009,0.425,-0.147,1.025,-0.127,-0.140,0.354,0.628,-0.180,0.000,0.354,...,-0.089,-0.250,0.799,0.488,0.730,0.370,-0.085,-0.127,-0.163,-0.127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,0.145,-0.080,0.145,0.270,-0.107,-0.377,0.549,0.435,-0.085,-0.377,...,0.151,-0.334,0.372,0.373,-0.121,0.385,-0.107,2.411,-0.314,2.411
X21BR002,0.405,-0.014,0.405,-0.014,0.360,-0.101,0.024,-0.029,0.000,-0.101,...,0.032,-0.409,0.034,0.010,0.097,0.452,0.407,-0.014,0.042,-0.014
X21BR010,0.794,0.189,0.794,0.206,0.248,-0.365,-0.003,-0.047,-0.035,-0.365,...,0.024,-0.002,-0.046,0.256,-0.035,0.303,0.248,0.206,-0.267,0.206
X22BR005,0.667,0.097,0.046,0.542,0.091,-0.316,0.160,-0.003,0.293,-0.316,...,0.127,0.126,0.052,-0.080,-0.108,0.136,0.091,3.657,-0.222,3.657


In [6]:
# Clinical annotation table, indexed by Patient_ID; the stratification cell
# below reads its receptor-status columns.
clinical = brca.get_clinical()
# Write the table, including its index, to disk and display it.
clinical.to_csv(path_or_buf='breast_cancer_clinical_data.csv', index=True)
clinical

Name,Replicate_Measurement_IDs,Sample_Tumor_Normal,TMT.Plex,TMT.Channel,Stage,Ischemia.Time.in.Minutes,PAM50,NMF.Cluster,NMF.Cluster.Membership.Score,Age.in.Month,Gender,Ethnicity,ER.Updated.Clinical.Status,PR.Clinical.Status,ERBB2.Updated.Clinical.Status,TNBC.Updated.Clinical.Status,ERBB2.Proteogenomic.Status,TOP2A.Proteogenomic.Status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
CPT000814,CPT000814,Tumor,13,127C,Stage IIA,,Basal,Basal-I,1.000,,,black.or.african.american,negative,negative,,positive,negative,negative
CPT001846,CPT001846,Tumor,12,128C,Stage III,,Basal,Basal-I,0.672,,,white,negative,negative,,positive,negative,negative
X01BR001,X01BR001,Tumor,2,129N,Stage IIB,0.0,Basal,Basal-I,0.782,660.0,female,black.or.african.american,negative,negative,negative,positive,negative,negative
X01BR008,X01BR008,Tumor,16,127C,,,Basal,Basal-I,0.958,,,,,,,,negative,negative
X01BR009,X01BR009,Tumor,16,127N,,,Basal,Basal-I,0.825,,,,negative,negative,,positive,negative,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,X21BR001,Tumor,16,128N,,,LumB,LumB-I,0.536,,,white,negative,negative,,negative,positive,negative
X21BR002,X21BR002,Tumor,16,128C,,,LumA,LumB-I,0.443,,,white,positive,positive,,negative,negative,negative
X21BR010,X21BR010|X21BR010.REP1,Tumor,3|17,129C|128C,Stage IIA,18.0,LumA,LumA-I,0.559,852.0,female,white,positive,positive,negative,negative,negative,negative
X22BR005,X22BR005,Tumor,6,129N,Stage IIB,20.0,LumA,LumB-I,0.392,552.0,female,white,positive,positive,positive,negative,positive,negative


Dataset stratification according to Estrogen Receptor (ER) and Progesterone Receptor (PR) status

In [23]:
def select_by_clinical_status(data, clinical_data, column, status):
    """Return the rows of ``data`` whose clinical ``column`` equals ``status``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter. Rows are matched to ``clinical_data`` by index label.
    clinical_data : pandas.DataFrame
        Clinical table that contains ``column``.
    column : str
        Name of the clinical column holding the status values.
    status : str
        Status value to keep (e.g. ``'negative'`` or ``'positive'``).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``data``. Rows whose status is missing, or whose
        index label does not appear in ``clinical_data``, are not selected.
    """
    # Align the status values to the rows of `data` explicitly instead of
    # relying on implicit index alignment during boolean indexing.
    status_by_row = clinical_data[column].reindex(data.index)
    return data[status_by_row == status]


# Split the PAM50 table by Estrogen Receptor (ER) and Progesterone Receptor
# (PR) clinical status. Samples with no recorded status land in neither subset.
ER_negative = select_by_clinical_status(genomics, clinical, 'ER.Updated.Clinical.Status', 'negative')
ER_positive = select_by_clinical_status(genomics, clinical, 'ER.Updated.Clinical.Status', 'positive')
PR_negative = select_by_clinical_status(genomics, clinical, 'PR.Clinical.Status', 'negative')
PR_positive = select_by_clinical_status(genomics, clinical, 'PR.Clinical.Status', 'positive')

# Save every subset as "<name>.csv" and print its number of rows, in the same
# order as the original four copy-pasted stanzas.
for file_stem, subset in [
    ('ER_negative', ER_negative),
    ('ER_positive', ER_positive),
    ('PR_negative', PR_negative),
    ('PR_positive', PR_positive),
]:
    subset.to_csv(file_stem + '.csv', index=True)
    print(len(subset))

39
81
47
68
