# Tidying Data and Preliminary Exploration #

Load packages:

In [1]:
library("tidyverse")
library("stringr")
library("janitor")
library("tidymodels")

set.seed(1000)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.3     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test


── [1mAttaching packages[22m ────────────────────

# Tidying Data #

Read in proteome and clinical data:

In [2]:
# reading the proteome and clinical data CSVs, cleaning column names
proteome_data <- read_csv("Original_Datasets/77_cancer_proteomes_CPTAC_itraq.csv")
clinical_data <- read_csv("Original_Datasets/clinical_data_breast_cancer.csv") 
clinical_data <- clean_names(clinical_data)

[1mRows: [22m[34m12553[39m [1mColumns: [22m[34m86[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): RefSeq_accession_number, gene_symbol, gene_name
[32mdbl[39m (83): AO-A12D.01TCGA, C8-A131.01TCGA, AO-A12B.01TCGA, BH-A18Q.02TCGA, C8...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m105[39m [1mColumns: [22m[34m30[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (17): Complete TCGA ID, Gender, ER Status, PR Status, HER2 Final Status,...
[32mdbl[39m (13): Age at Initial Pathologic Diagnosis, Days to Date of Last Contact,...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Sp

Change complete_tcga_id in clinical and proteome data sets to uniform format.

In [3]:
# clinical data: change complete_tcga_id to uniform format
clinical_data[c("tcga", "tcga_code_1", "tcga_code_2")] <- str_split_fixed(clinical_data$complete_tcga_id, "-", 3)
clinical_data$tcga_id <- paste(clinical_data$tcga_code_1, clinical_data$tcga_code_2, sep = "-")
clinical_data_tidy <- clinical_data[-c(1, 31:33)] %>% relocate(tcga_id, .before = gender)

# proteome data: convert the donor columns into a single donor column
proteome_data_longer <- pivot_longer(proteome_data, 4:86, names_to = "complete_tcga_id", values_to = "protein_expression_log2_iTRAQ_ratios")
# proteome data: change complete_tcga_id to uniform format
proteome_data_longer[c("tcga_id", "tcga_id_2")] <- str_split_fixed(proteome_data_longer$complete_tcga_id, "\\.", 2)
proteome_data_longer_tidy <- proteome_data_longer[-c(4, 7)] %>% relocate(tcga_id, .before = RefSeq_accession_number)

Merge proteome and clinical data by tcga_id:

In [4]:
proteome_and_clinical_data_tidy <- merge(proteome_data_longer_tidy, clinical_data_tidy, by="tcga_id")

# validate that there are 77 donors and 12,553 genes per donor
donors_and_genes_per_donor <- proteome_and_clinical_data_tidy %>% group_by(tcga_id) %>% count()
donors_and_genes_per_donor
# 77 donors (tcga_id) and 12,553 genes (n) per donor

tcga_id,n
<chr>,<int>
A2-A0CM,12553
A2-A0D2,12553
A2-A0EQ,12553
A2-A0EV,12553
A2-A0EX,12553
A2-A0EY,12553
A2-A0SW,12553
A2-A0SX,12553
A2-A0T3,12553
A2-A0T6,12553


## Final Outputs: Tidying Data ##

In [5]:
#clinical_data_tidy 
#proteome_and_clinical_data_tidy

write.csv(proteome_and_clinical_data_tidy, "Data/proteome_and_clinical_data_tidy.csv")

# Finding Most Highly Expressed Proteins in Stage III+ Breast Cancer #

Determine top 10 most highly expressed proteins, when expression is averaged across all the III+ AJCC stages in the unsplit data set:

In [6]:
# filter for stage III+ cancer, average protein expression for each gene across all donors, and determine top 10 most highly expressed proteins
top_10_mean_protein_expression_genes_stage_III_plus <- proteome_and_clinical_data_tidy %>%
  filter(ajcc_stage == c("Stage III", "Stage IIIA", "Stage IIIB", "Stage IIIC", "Stage IV")) %>%
  group_by(RefSeq_accession_number) %>%
  summarize(mean_protein_expression_log2_iTRAQ_ratios = mean(protein_expression_log2_iTRAQ_ratios, na.rm = TRUE)) %>%
  arrange(desc(mean_protein_expression_log2_iTRAQ_ratios)) %>%
  slice(1:10)

# create gene_symbol and gene_name labels for the RefSeq_accession_number
labels <- proteome_and_clinical_data_tidy %>%
  filter(ajcc_stage == "Stage IV") %>%
  group_by(RefSeq_accession_number) %>%
  summarize(gene_symbol = gene_symbol,
            gene_name = gene_name)

# merge labels and top 10 proteins to label RefSeq_accession_number
top_10_mean_protein_expression_genes_stage_III_plus <- merge(labels, top_10_mean_protein_expression_genes_stage_III_plus, by="RefSeq_accession_number")
top_10_mean_protein_expression_genes_stage_III_plus

RefSeq_accession_number,gene_symbol,gene_name,mean_protein_expression_log2_iTRAQ_ratios
<chr>,<chr>,<chr>,<dbl>
NP_000430,PCSK1,neuroendocrine convertase 1 isoform 1 preproprotein,7.799745
NP_001096036,PCP4L1,Purkinje cell protein 4-like protein 1,6.458791
NP_001138582,NELL2,protein kinase C-binding protein NELL2 isoform d,6.602297
NP_001139334,MTMR11,myotubularin-related protein 11 isoform a,7.221338
NP_004354,CEACAM5,carcinoembryonic antigen-related cell adhesion molecule 5 preproprotein,7.994744
NP_060370,AURKAIP1,aurora kinase A-interacting protein,8.42215
NP_060774,PRR11,proline-rich protein 11,8.270051
NP_150375,KCNH7,potassium voltage-gated channel subfamily H member 7 isoform 1,6.770987
NP_803251,MAGEC3,melanoma-associated antigen C3 isoform 2,8.045737
NP_996805,CRIP3,cysteine-rich protein 3,6.384901


**These 10 proteins will potentially be used as parameters in our classifier.**

## Final Outputs: Finding Most Highly Expressed Proteins in Stage III+ Breast Cancer ##

In [7]:
#top_10_mean_protein_expression_genes_stage_III_plus

# Split Clinical Data into Training and Testing Sets #

In [8]:
clinical_data_tidy_split <- initial_split(data = clinical_data_tidy, prop = 0.75, strata = ajcc_stage)
clinical_data_tidy_training <- training(clinical_data_tidy_split)
clinical_data_tidy_testing <- testing(clinical_data_tidy_split)

clinical_data_tidy_training
clinical_data_tidy_testing

tcga_id,gender,age_at_initial_pathologic_diagnosis,er_status,pr_status,her2_final_status,tumor,tumor_t1_coded,node,node_coded,⋯,pam50_m_rna,sig_clust_unsupervised_m_rna,sig_clust_intrinsic_m_rna,mi_rna_clusters,methylation_clusters,rppa_clusters,cn_clusters,integrated_clusters_with_pam50,integrated_clusters_no_exp,integrated_clusters_unsup_exp
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
AR-A0U4,FEMALE,54,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,5,5,ReacII,1,2,2,2
BH-A18R,FEMALE,50,Indeterminate,Negative,Positive,T2,T_Other,N1,Positive,⋯,HER2-enriched,0,-2,4,4,Her2,3,1,1,1
C8-A12Z,FEMALE,45,Negative,Negative,Positive,T2,T_Other,N1,Positive,⋯,HER2-enriched,-5,-11,4,4,Her2,2,1,1,1
C8-A135,FEMALE,64,Negative,Negative,Positive,T2,T_Other,N1,Positive,⋯,HER2-enriched,-5,-11,7,1,Her2,1,1,2,2
AR-A0TR,FEMALE,68,Positive,Positive,Negative,T2,T_Other,N1,Positive,⋯,Luminal A,-3,-12,7,2,LumA/B,1,3,3,3
A2-A0YF,FEMALE,67,Positive,Negative,Negative,T1,T1,N0,Negative,⋯,Luminal A,-3,-12,1,2,ReacII,2,3,3,3
AO-A0J9,FEMALE,61,Positive,Positive,Negative,T2,T_Other,N3,Positive,⋯,Luminal A,-3,-2,2,4,ReacI,2,3,1,4
A8-A08Z,FEMALE,76,Positive,Positive,Negative,T4,T_Other,N3,Positive,⋯,Luminal A,-3,-5,4,2,LumA,2,3,3,3
AR-A1AP,FEMALE,80,Positive,Positive,Negative,T1,T1,N0,Negative,⋯,Luminal A,0,0,4,4,LumA/B,2,3,1,3
AR-A1AS,FEMALE,54,Positive,Positive,Negative,T2,T_Other,N1,Positive,⋯,Luminal A,-3,-7,4,2,LumA,3,3,3,3


tcga_id,gender,age_at_initial_pathologic_diagnosis,er_status,pr_status,her2_final_status,tumor,tumor_t1_coded,node,node_coded,⋯,pam50_m_rna,sig_clust_unsupervised_m_rna,sig_clust_intrinsic_m_rna,mi_rna_clusters,methylation_clusters,rppa_clusters,cn_clusters,integrated_clusters_with_pam50,integrated_clusters_no_exp,integrated_clusters_unsup_exp
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
A2-A0CM,FEMALE,40,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A7-A0CE,FEMALE,57,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,0,-13,5,5,Basal,1,2,2,2
AO-A0J6,FEMALE,61,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,2,5,Basal,1,2,2,2
A2-A0YM,FEMALE,67,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,5,5,Basal,3,2,2,2
A2-A0D2,FEMALE,45,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,4,5,Basal,3,2,2,2
A2-A0SX,FEMALE,48,Negative,Negative,Negative,T1,T1,N0,Negative,⋯,Basal-like,-12,-13,4,5,Basal,1,2,2,2
AR-A1AQ,FEMALE,49,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,3,5,Basal,1,2,2,2
BH-A0AV,FEMALE,52,Negative,Negative,Negative,T1,T1,N0,Negative,⋯,Basal-like,0,-13,4,5,ReacII,1,2,2,2
BH-A0EE,FEMALE,68,Negative,Negative,Positive,T3,T_Other,N0,Negative,⋯,HER2-enriched,-9,-11,7,1,Basal,5,1,2,3
AO-A12D,FEMALE,43,Negative,Negative,Positive,T1,T1,N1,Positive,⋯,HER2-enriched,-10,-11,3,5,Her2,3,1,2,2


Merge training clinical and proteome data, and testing clinical and proteome data.

In [9]:
# merge proteome and clinical training data
proteome_and_clinical_data_training_merged <- merge(proteome_data_longer_tidy, clinical_data_tidy_training, by="tcga_id")
proteome_and_clinical_data_training_tidy <- mutate(proteome_and_clinical_data_training_merged, ajcc_stage = as_factor(ajcc_stage))
proteome_and_clinical_data_training_tidy

# merge proteome and clinical testing data
proteome_and_clinical_data_testing_merged <- merge(proteome_data_longer_tidy, clinical_data_tidy_testing, by="tcga_id")
proteome_and_clinical_data_testing_tidy <- mutate(proteome_and_clinical_data_testing_merged, ajcc_stage = as_factor(ajcc_stage))
proteome_and_clinical_data_testing_tidy

tcga_id,RefSeq_accession_number,gene_symbol,gene_name,protein_expression_log2_iTRAQ_ratios,gender,age_at_initial_pathologic_diagnosis,er_status,pr_status,her2_final_status,⋯,pam50_m_rna,sig_clust_unsupervised_m_rna,sig_clust_intrinsic_m_rna,mi_rna_clusters,methylation_clusters,rppa_clusters,cn_clusters,integrated_clusters_with_pam50,integrated_clusters_no_exp,integrated_clusters_unsup_exp
<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
A2-A0EQ,NP_002416,MMP10,stromelysin-2 preproprotein,-8.394639661,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_001092315,,annexin A8-like protein 1 isoform 1,-5.554935680,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_036545,QPCT,glutaminyl-peptide cyclotransferase precursor,,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_077306,SLC27A3,long-chain fatty acid transport protein 3,1.444819819,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_001020424,TPD52,tumor protein D52 isoform 2,-1.172912692,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_001186072,,xin actin-binding repeat-containing protein 2 isoform 3,,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_009075,PCGF2,polycomb group RING finger protein 2,0.300518619,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_000285,PHKG2,"phosphorylase b kinase gamma catalytic chain, liver/testis isoform isoform 1",-1.348959030,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_945341,NAPEPLD,N-acyl-phosphatidylethanolamine-hydrolyzing phospholipase D,,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
A2-A0EQ,NP_055824,PDZRN3,E3 ubiquitin-protein ligase PDZRN3,-0.950941221,FEMALE,64,Negative,Negative,Positive,⋯,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1


tcga_id,RefSeq_accession_number,gene_symbol,gene_name,protein_expression_log2_iTRAQ_ratios,gender,age_at_initial_pathologic_diagnosis,er_status,pr_status,her2_final_status,⋯,pam50_m_rna,sig_clust_unsupervised_m_rna,sig_clust_intrinsic_m_rna,mi_rna_clusters,methylation_clusters,rppa_clusters,cn_clusters,integrated_clusters_with_pam50,integrated_clusters_no_exp,integrated_clusters_unsup_exp
<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
A2-A0CM,NP_006038,,RNA-binding protein 12,0.411563265,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_060272,HEATR2,HEAT repeat-containing protein 2,-0.345181275,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_006388,RNASEH2A,ribonuclease H2 subunit A,-0.301099069,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_115901,PDZD4,PDZ domain-containing protein 4,0.275643129,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_002429,MRC1,macrophage mannose receptor 1 precursor,-0.517836583,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_055706,PDAP1,28 kDa heat- and acid-stable phosphoprotein,-0.694165408,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_060200,QRICH1,glutamine-rich protein 1,0.529115815,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_003322,TYK2,non-receptor tyrosine-protein kinase TYK2,-0.859473681,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_001186384,,bromodomain-containing protein 2 isoform 2,0.036864512,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1
A2-A0CM,NP_057078,ATP6V1D,V-type proton ATPase subunit D,0.323398852,FEMALE,40,Negative,Negative,Negative,⋯,Basal-like,-12,-13,4,4,Basal,4,2,1,1


## Final Outputs: Splitting Data ##

In [10]:
#proteome_and_clinical_data_training_tidy
#proteome_and_clinical_data_testing_tidy

In [11]:
clinical_data_tidy_split <- initial_split(data = clinical_data_tidy, prop = 0.75, strata = ajcc_stage)
clinical_data_tidy_training <- training(clinical_data_tidy_split)
clinical_data_tidy_testing <- testing(clinical_data_tidy_split)

In [12]:
clinical_data_tidy_training
clinical_data_tidy_testing

tcga_id,gender,age_at_initial_pathologic_diagnosis,er_status,pr_status,her2_final_status,tumor,tumor_t1_coded,node,node_coded,⋯,pam50_m_rna,sig_clust_unsupervised_m_rna,sig_clust_intrinsic_m_rna,mi_rna_clusters,methylation_clusters,rppa_clusters,cn_clusters,integrated_clusters_with_pam50,integrated_clusters_no_exp,integrated_clusters_unsup_exp
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
AR-A0U4,FEMALE,54,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,5,5,ReacII,1,2,2,2
BH-A0AV,FEMALE,52,Negative,Negative,Negative,T1,T1,N0,Negative,⋯,Basal-like,0,-13,4,5,ReacII,1,2,2,2
A2-A0T1,FEMALE,55,Negative,Negative,Positive,T3,T_Other,N3,Positive,⋯,HER2-enriched,-5,-11,4,5,Her2,4,1,1,1
AR-A0TX,FEMALE,64,Positive,Positive,Positive,T1,T1,N1,Positive,⋯,HER2-enriched,-5,-11,1,1,ReacII,1,1,2,2
C8-A12Z,FEMALE,45,Negative,Negative,Positive,T2,T_Other,N1,Positive,⋯,HER2-enriched,-5,-11,4,4,Her2,2,1,1,1
C8-A135,FEMALE,64,Negative,Negative,Positive,T2,T_Other,N1,Positive,⋯,HER2-enriched,-5,-11,7,1,Her2,1,1,2,2
C8-A138,FEMALE,54,Positive,Negative,Positive,T2,T_Other,N2,Positive,⋯,HER2-enriched,-5,-2,1,2,Basal,2,1,3,2
AR-A0TR,FEMALE,68,Positive,Positive,Negative,T2,T_Other,N1,Positive,⋯,Luminal A,-3,-12,7,2,LumA/B,1,3,3,3
AO-A0J9,FEMALE,61,Positive,Positive,Negative,T2,T_Other,N3,Positive,⋯,Luminal A,-3,-2,2,4,ReacI,2,3,1,4
AR-A1AS,FEMALE,54,Positive,Positive,Negative,T2,T_Other,N1,Positive,⋯,Luminal A,-3,-7,4,2,LumA,3,3,3,3


tcga_id,gender,age_at_initial_pathologic_diagnosis,er_status,pr_status,her2_final_status,tumor,tumor_t1_coded,node,node_coded,⋯,pam50_m_rna,sig_clust_unsupervised_m_rna,sig_clust_intrinsic_m_rna,mi_rna_clusters,methylation_clusters,rppa_clusters,cn_clusters,integrated_clusters_with_pam50,integrated_clusters_no_exp,integrated_clusters_unsup_exp
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
A2-A0D0,FEMALE,60,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,5,5,Basal,1,2,2,2
AO-A0J6,FEMALE,61,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,2,5,Basal,1,2,2,2
AR-A1AQ,FEMALE,49,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-12,-13,3,5,Basal,1,2,2,2
C8-A131,FEMALE,82,Negative,Negative,Negative,T2,T_Other,N2,Positive,⋯,Basal-like,0,-13,5,1,ReacII,1,2,2,2
E2-A159,FEMALE,50,Negative,Negative,Negative,T2,T_Other,N0,Negative,⋯,Basal-like,-10,-13,3,5,Basal,1,2,2,2
BH-A18R,FEMALE,50,Indeterminate,Negative,Positive,T2,T_Other,N1,Positive,⋯,HER2-enriched,0,-2,4,4,Her2,3,1,1,1
BH-A0EE,FEMALE,68,Negative,Negative,Positive,T3,T_Other,N0,Negative,⋯,HER2-enriched,-9,-11,7,1,Basal,5,1,2,3
AO-A0JE,FEMALE,53,Negative,Negative,Positive,T2,T_Other,N2,Positive,⋯,HER2-enriched,-5,-11,4,4,Her2,5,1,1,1
C8-A12P,FEMALE,55,Negative,Negative,Positive,T2,T_Other,N1,Positive,⋯,HER2-enriched,-10,-11,4,1,Her2,3,1,1,1
A2-A0YF,FEMALE,67,Positive,Negative,Negative,T1,T1,N0,Negative,⋯,Luminal A,-3,-12,1,2,ReacII,2,3,3,3


In [None]:
clinical_data_tidy_training <- merge(clinical_data_tidy_training, proteome_and_clinical_data_tidy, by = "tcga_id")
clinical_data_tidy_testing <- merge(clinical_data_tidy_training, proteome_and_clinical_data_tidy, by = "tcga_id")


clinical_data_tidy_training
clinical_data_tidy_testing

# Exploration of Training Data #

In [None]:
# determine number of patients in each AJCC stage 
patients_per_stage <- proteome_and_clinical_data_training_tidy %>%
  group_by(ajcc_stage) %>%
  count() %>%
  mutate(patients = n / 12553)
patients_per_stage

- 11 stages, most patients in Stage II
- only 1 patient in stage IV: need to expand subset to patients in stage III+ (n=15)

**Clinical Data:**

In [None]:
patients_by_hormone_receptor_status <- clinical_data_tidy_training %>%
  group_by(er_status, pr_status) %>%
  summarise(count=n())
patients_by_hormone_receptor_status

patients_by_TNM_class <- clinical_data_tidy_training %>%
  group_by(tumor, node, metastasis) %>%
  summarise(count=n())
patients_by_TNM_class

## Final Outputs: Data Exploration ## 

In [None]:
#patients_per_stage
#patients_by_hormone_receptor_status
#patients_by_TNM_class