In [None]:
#About The Project
#Breast cancer is primarily divied into five molecular sub-types using PAM50. 
#The subtypes are Luminal A, Luminal B, HER2, Basal like and Normal. This project aims to classify breast cancer subtypes using a proteomic dataset and machine learning algorithms.(KMeans and Linear Regression). These algorithms are unsupervised learning and supervised learning respectively. 
#The original classification was based on the PAM5O geness based on the final mRNA prduct. This project used proteomic data which has protein as the final product.
#The aim of the project is to determine the accuracyand efficacy of machine learning algorithms in classifying breast cancer subtypes by comparing the results of the classification provided by each algorithm with the general PAM50 subtype classification.

In [None]:
# Datasets Summary
# The first dataset , 77_cancer_proteomes_CPTAC_itraq.csv contains published iTRAQ proteome profiling of 77 breast cancer samples generated by the Clinical Proteomic Tumor Analysis Consortium (NCI/NIH). It contains expression values for ~12.000 proteins for each sample, with missing values present when a given protein could not be quantified in a given sample.
# The second dataset , clinical_data_breast_cancer.csv contains clinical data of 105 patients , analyzed, sampled, contained features like gender, age and included the type of cancer classification.
# The third dataset, PAM50_proteins.csv contains the list of genes and proteins used by the PAM50 classification system. The column RefSeqProteinID contains the protein IDs that can be matched with the IDs in the main protein expression data set.

In [429]:
# Import Packages

import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import re
from sklearn import preprocessing
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.metrics import homogeneity_score, adjusted_mutual_info_score
from sklearn.decomposition import PCA
import sklearn.linear_model
from sklearn.impute import SimpleImputer

In [430]:
# Load the datasets
proteomedata = pd.read_csv("C:/Users/kenny/Downloads/Documents/REU-DataScienceProgram/REU-Project/REU-FinalProject/CancerProteomes.csv", index_col = 0)
pam50data = pd.read_csv("C:/Users/kenny/Downloads/Documents/REU-DataScienceProgram/REU-Project/REU-FinalProject/PAM50 Proteins.csv", header = 0)
clinicaldata = pd.read_csv("C:/Users/kenny/Downloads/Documents/REU-DataScienceProgram/REU-Project/REU-FinalProject/ClinicalDataBreastCancer.csv", index_col = 0)

In [431]:
#Load first dataset (proteome dataset)
proteomedata = df_data.drop(['gene_symbol','gene_name'], axis=1)
proteomedata

Unnamed: 0,RefSeq_accession_number,AO-A12D.01TCGA,C8-A131.01TCGA,AO-A12B.01TCGA,BH-A18Q.02TCGA,C8-A130.02TCGA,C8-A138.03TCGA,E2-A154.03TCGA,C8-A12L.04TCGA,A2-A0EX.04TCGA,...,AO-A12B.34TCGA,A2-A0SW.35TCGA,AO-A0JL.35TCGA,BH-A0BV.35TCGA,A2-A0YM.36TCGA,BH-A0C7.36TCGA,A2-A0SX.36TCGA,263d3f-I.CPTAC,blcdb9-I.CPTAC,c4155b-C.CPTAC
0,NP_958782,1.096131,2.609943,-0.659828,0.195341,-0.494060,2.765081,0.862659,1.407570,1.185108,...,-0.963904,-0.487772,-0.10668,-0.065838,0.655850,-0.552212,-0.398560,0.598585,-0.191285,0.566975
1,NP_958785,1.111370,2.650422,-0.648742,0.215413,-0.503899,2.779709,0.870186,1.407570,1.192612,...,-0.938210,-0.487772,-0.10668,-0.055893,0.658143,-0.547749,-0.392601,0.606697,-0.183918,0.578702
2,NP_958786,1.111370,2.650422,-0.654285,0.215413,-0.500619,2.779709,0.870186,1.410312,1.188860,...,-0.943919,-0.487772,-0.10668,-0.065838,0.655850,-0.552212,-0.392601,0.603993,-0.186022,0.576747
3,NP_000436,1.107561,2.646374,-0.632113,0.205377,-0.510459,2.797995,0.866423,1.407570,1.185108,...,-0.935355,-0.487772,-0.10668,-0.055893,0.655850,-0.552212,-0.392601,0.603993,-0.186022,0.576747
4,NP_958781,1.115180,2.646374,-0.640428,0.215413,-0.503899,2.787023,0.870186,1.413053,1.200116,...,-0.935355,-0.503853,-0.10668,-0.062523,0.651264,-0.556675,-0.395581,0.603993,-0.167079,0.576747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12548,NP_001191293,,,,-16.029761,1.729692,4.107251,-9.584499,-5.196859,-6.101005,...,-6.662350,,,,,,,-8.020071,-3.093822,-4.602418
12549,NP_775791,,,,-2.046065,-0.425182,-3.203370,-4.786183,,,...,,,,,,,,0.049608,-0.646977,0.240590
12550,NP_004065,,,,-1.778435,-0.149673,1.971481,-3.103949,-0.933726,-1.726336,...,,,,,,,,,,
12551,NP_068752,-0.633517,4.840325,-1.965192,,,,,,,...,,,,,,,,0.019861,-1.718327,-0.369183


In [432]:
#Load second dataset
pam50data

Unnamed: 0,GeneSymbol,RefSeqProteinID,Species,Gene Name
0,MIA,NP_006524,Homo sapiens,melanoma inhibitory activity
1,FGFR4,NP_002002,Homo sapiens,fibroblast growth factor receptor 4
2,FGFR4,NP_998812,Homo sapiens,fibroblast growth factor receptor 4
3,FGFR4,NP_075252,Homo sapiens,fibroblast growth factor receptor 4
4,GPR160,NP_055188,Homo sapiens,G protein-coupled receptor 160
...,...,...,...,...
95,FOXC1,NP_001444,Homo sapiens,forkhead box C1
96,GRB7,NP_001025173,Homo sapiens,growth factor receptor-bound protein 7
97,GRB7,NP_005301,Homo sapiens,growth factor receptor-bound protein 7
98,MELK,NP_055606,Homo sapiens,maternal embryonic leucine zipper kinase


In [433]:
#load third dataset (clinical dataset)
clinicaldata

Unnamed: 0_level_0,Gender,Age at Initial Pathologic Diagnosis,ER Status,PR Status,HER2 Final Status,Tumor,Tumor--T1 Coded,Node,Node-Coded,Metastasis,...,PAM50 mRNA,SigClust Unsupervised mRNA,SigClust Intrinsic mRNA,miRNA Clusters,methylation Clusters,RPPA Clusters,CN Clusters,Integrated Clusters (with PAM50),Integrated Clusters (no exp),Integrated Clusters (unsup exp)
Complete TCGA ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A2-A0T2,FEMALE,66,Negative,Negative,Negative,T3,T_Other,N3,Positive,M1,...,Basal-like,0,-13,3,5,Basal,3,2,2,2
TCGA-A2-A0CM,FEMALE,40,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,-12,-13,4,4,Basal,4,2,1,1
TCGA-BH-A18V,FEMALE,48,Negative,Negative,Negative,T2,T_Other,N1,Positive,M0,...,Basal-like,-12,-13,5,5,Basal,1,2,2,2
TCGA-BH-A18Q,FEMALE,56,Negative,Negative,Negative,T2,T_Other,N1,Positive,M0,...,Basal-like,-12,-13,5,5,Basal,1,2,2,2
TCGA-BH-A0E0,FEMALE,38,Negative,Negative,Negative,T3,T_Other,N3,Positive,M0,...,Basal-like,0,-13,5,5,Basal,1,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-BH-A0C7,FEMALE,48,Positive,Negative,Positive,T2,T_Other,N1,Positive,M0,...,Luminal B,-3,0,4,4,LumA/B,5,4,1,3
TCGA-BH-A0DD,MALE,58,Positive,Positive,Positive,T2,T_Other,N1,Positive,M0,...,Luminal B,-3,-6,4,4,LumA/B,3,4,1,3
TCGA-C8-A12U,FEMALE,46,Positive,Positive,Negative,T2,T_Other,N1,Positive,M0,...,Luminal B,-5,-2,5,4,Basal,5,4,1,1
TCGA-C8-A12W,FEMALE,49,Positive,Positive,Negative,T4,T_Other,N1,Positive,M0,...,Luminal B,-5,-2,4,4,ReacII,3,4,1,1


In [434]:
#Drop unused columns in the first dataset
proteomedata = df_data.drop(['gene_symbol','gene_name'], axis=1)
proteomedata

Unnamed: 0,RefSeq_accession_number,AO-A12D.01TCGA,C8-A131.01TCGA,AO-A12B.01TCGA,BH-A18Q.02TCGA,C8-A130.02TCGA,C8-A138.03TCGA,E2-A154.03TCGA,C8-A12L.04TCGA,A2-A0EX.04TCGA,...,AO-A12B.34TCGA,A2-A0SW.35TCGA,AO-A0JL.35TCGA,BH-A0BV.35TCGA,A2-A0YM.36TCGA,BH-A0C7.36TCGA,A2-A0SX.36TCGA,263d3f-I.CPTAC,blcdb9-I.CPTAC,c4155b-C.CPTAC
0,NP_958782,1.096131,2.609943,-0.659828,0.195341,-0.494060,2.765081,0.862659,1.407570,1.185108,...,-0.963904,-0.487772,-0.10668,-0.065838,0.655850,-0.552212,-0.398560,0.598585,-0.191285,0.566975
1,NP_958785,1.111370,2.650422,-0.648742,0.215413,-0.503899,2.779709,0.870186,1.407570,1.192612,...,-0.938210,-0.487772,-0.10668,-0.055893,0.658143,-0.547749,-0.392601,0.606697,-0.183918,0.578702
2,NP_958786,1.111370,2.650422,-0.654285,0.215413,-0.500619,2.779709,0.870186,1.410312,1.188860,...,-0.943919,-0.487772,-0.10668,-0.065838,0.655850,-0.552212,-0.392601,0.603993,-0.186022,0.576747
3,NP_000436,1.107561,2.646374,-0.632113,0.205377,-0.510459,2.797995,0.866423,1.407570,1.185108,...,-0.935355,-0.487772,-0.10668,-0.055893,0.655850,-0.552212,-0.392601,0.603993,-0.186022,0.576747
4,NP_958781,1.115180,2.646374,-0.640428,0.215413,-0.503899,2.787023,0.870186,1.413053,1.200116,...,-0.935355,-0.503853,-0.10668,-0.062523,0.651264,-0.556675,-0.395581,0.603993,-0.167079,0.576747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12548,NP_001191293,,,,-16.029761,1.729692,4.107251,-9.584499,-5.196859,-6.101005,...,-6.662350,,,,,,,-8.020071,-3.093822,-4.602418
12549,NP_775791,,,,-2.046065,-0.425182,-3.203370,-4.786183,,,...,,,,,,,,0.049608,-0.646977,0.240590
12550,NP_004065,,,,-1.778435,-0.149673,1.971481,-3.103949,-0.933726,-1.726336,...,,,,,,,,,,
12551,NP_068752,-0.633517,4.840325,-1.965192,,,,,,,...,,,,,,,,0.019861,-1.718327,-0.369183


In [435]:
#Rename the "TGCA-ID" in the proteome dataset and transpose into row to align with the clinical dataset

proteomedata.rename(columns=lambda x: "TCGA-%s" % (re.split('[_|-|.]',x)[0]) if bool(re.search("TCGA",x)) is True else x,inplace=True)
proteomedata = proteomedata.transpose()
proteomedata

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12543,12544,12545,12546,12547,12548,12549,12550,12551,12552
RefSeq_accession_number,NP_958782,NP_958785,NP_958786,NP_000436,NP_958781,NP_958780,NP_958783,NP_958784,NP_112598,NP_001611,...,NP_001193600,NP_061134,NP_932347,NP_003593,NP_997203,NP_001191293,NP_775791,NP_004065,NP_068752,NP_219494
TCGA-AO-A12D,1.096131,1.11137,1.11137,1.107561,1.11518,1.107561,1.11137,1.11137,-1.51739,0.482754,...,,,,-0.340163,,,,,-0.633517,12.666488
TCGA-C8-A131,2.609943,2.650422,2.650422,2.646374,2.646374,2.646374,2.650422,2.650422,3.909313,-1.045294,...,,,,3.451902,,,,,4.840325,0.140736
TCGA-AO-A12B,-0.659828,-0.648742,-0.654285,-0.632113,-0.640428,-0.654285,-0.648742,-0.648742,-0.618256,1.222003,...,,,,-1.718531,,,,,-1.965192,-2.854835
TCGA-BH-A18Q,0.195341,0.215413,0.215413,0.205377,0.215413,0.215413,0.215413,0.215413,-1.03576,-0.517226,...,0.048144,,-0.881872,2.527072,-8.111243,-16.029761,-2.046065,-1.778435,,-3.069752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-BH-A0C7,-0.552212,-0.547749,-0.552212,-0.552212,-0.556675,-0.547749,-0.552212,-0.552212,0.679466,0.487574,...,,,,,,,,,,
TCGA-A2-A0SX,-0.39856,-0.392601,-0.392601,-0.392601,-0.395581,-0.392601,-0.392601,-0.392601,-2.504862,0.69481,...,,,,,,,,,,
263d3f-I.CPTAC,0.598585,0.606697,0.603993,0.603993,0.603993,0.606697,0.603993,0.603993,-0.602132,2.778263,...,-2.162522,1.520756,,,,-8.020071,0.049608,,0.019861,
blcdb9-I.CPTAC,-0.191285,-0.183918,-0.186022,-0.186022,-0.167079,-0.183918,-0.186022,-0.186022,-0.340726,1.36733,...,-4.357763,-2.386605,,,,-3.093822,-0.646977,,-1.718327,


In [436]:
#Drop entries in the clinical dataset thate are not in the proteome dataset
clinicaldata = clinicaldata.loc[[x for x in clinicaldata.index.tolist() 
if x in proteomedata.index],:]

In [437]:
clinicaldata

Unnamed: 0_level_0,Gender,Age at Initial Pathologic Diagnosis,ER Status,PR Status,HER2 Final Status,Tumor,Tumor--T1 Coded,Node,Node-Coded,Metastasis,...,PAM50 mRNA,SigClust Unsupervised mRNA,SigClust Intrinsic mRNA,miRNA Clusters,methylation Clusters,RPPA Clusters,CN Clusters,Integrated Clusters (with PAM50),Integrated Clusters (no exp),Integrated Clusters (unsup exp)
Complete TCGA ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A2-A0CM,FEMALE,40,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,-12,-13,4,4,Basal,4,2,1,1
TCGA-BH-A18Q,FEMALE,56,Negative,Negative,Negative,T2,T_Other,N1,Positive,M0,...,Basal-like,-12,-13,5,5,Basal,1,2,2,2
TCGA-A7-A0CE,FEMALE,57,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,0,-13,5,5,Basal,1,2,2,2
TCGA-D8-A142,FEMALE,74,Negative,Negative,Negative,T3,T_Other,N0,Negative,M0,...,Basal-like,0,-13,3,5,X,1,2,2,2
TCGA-AO-A0J6,FEMALE,61,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,-12,-13,2,5,Basal,1,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-AR-A1AV,MALE,68,Positive,Positive,Negative,T2,T_Other,N1,Positive,M0,...,Luminal B,-5,-2,7,3,LumA/B,3,4,4,5
TCGA-BH-A0C7,FEMALE,48,Positive,Negative,Positive,T2,T_Other,N1,Positive,M0,...,Luminal B,-3,0,4,4,LumA/B,5,4,1,3
TCGA-BH-A0DD,MALE,58,Positive,Positive,Positive,T2,T_Other,N1,Positive,M0,...,Luminal B,-3,-6,4,4,LumA/B,3,4,1,3
TCGA-C8-A12U,FEMALE,46,Positive,Positive,Negative,T2,T_Other,N1,Positive,M0,...,Luminal B,-5,-2,5,4,Basal,5,4,1,1


In [438]:
#Merge the first and third datasets
merged = proteomedata.merge(clinicaldata,left_index=True,right_index=True)
merged

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,PAM50 mRNA,SigClust Unsupervised mRNA,SigClust Intrinsic mRNA,miRNA Clusters,methylation Clusters,RPPA Clusters,CN Clusters,Integrated Clusters (with PAM50),Integrated Clusters (no exp),Integrated Clusters (unsup exp)
TCGA-A2-A0CM,0.683404,0.694424,0.698098,0.687077,0.687077,0.698098,0.698098,0.698098,-2.65215,-0.984373,...,Basal-like,-12,-13,4,4,Basal,4,2,1,1
TCGA-A2-A0D2,0.107491,0.104164,0.107491,0.097512,0.104164,0.104164,0.104164,0.104164,-0.880454,-1.512473,...,Basal-like,-12,-13,4,5,Basal,3,2,2,2
TCGA-A2-A0EQ,-0.91267,-0.927979,-0.927979,-0.931806,-0.927979,-0.927979,-0.927979,-0.927979,-3.071151,-2.278943,...,HER2-enriched,-5,-2,5,4,Basal,4,4,1,1
TCGA-A2-A0EV,0.452986,0.47259,0.47259,0.458587,0.47259,0.47259,0.47259,0.47259,-0.742871,1.811277,...,Luminal A,-4,0,4,2,ReacI,3,3,3,4
TCGA-A2-A0EX,1.185108,1.192612,1.18886,1.185108,1.200116,1.18886,1.18886,1.192612,1.046289,2.138081,...,Luminal A,-7,-5,4,4,ReacI,4,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-C8-A138,2.765081,2.779709,2.779709,2.797995,2.787023,2.779709,2.783366,2.783366,2.205538,0.749997,...,HER2-enriched,-5,-2,1,2,Basal,2,1,3,2
TCGA-D8-A142,0.538596,0.542211,0.542211,0.534981,0.542211,0.542211,0.542211,0.542211,-0.148205,0.26749,...,Basal-like,0,-13,3,5,X,1,2,2,2
TCGA-E2-A154,0.862659,0.870186,0.870186,0.866423,0.870186,0.870186,0.870186,0.870186,1.920171,2.349197,...,Luminal A,-3,-12,6,4,LumA,2,3,3,3
TCGA-E2-A158,-1.086529,-1.095492,-1.095492,-1.095492,-1.095492,-1.093252,-1.093252,-1.093252,0.096627,-1.149272,...,Basal-like,0,-13,5,5,Basal,1,2,2,2


In [439]:
#processsed

numerical_data= merged.loc[:,[x for x in merged.columns if bool(re.search("NP_|XP_",x)) == True]]

TypeError: expected string or bytes-like object

In [None]:

pam50_data = numerical_data.iloc[:,numerical_data.columns.isin(pam50['RefSeqProteinID'])]
pam50_data

In [None]:
#Impute missing values
imputer = SimpleImputer(missing_values='NaN', strategy='median')
imputer = imputer.fit(processed_numerical_p50)
processed_numerical_p50 = imputer.transform(processed_numerical_p50)

In [None]:
#PERFORM KMEANS

scaler = StandardScaler().fit(pam50_data_)
pam50_data_ = scaler.transform(pam50_data_)
np.set_printoptions(precision=3)
print(pam50_data_[0:5,:])

In [None]:
number of clusters = 4
n_clusters = [2,3,4,5]

    for k in k_list:
        clusterer = KMeans(n_clusters=k, n_jobs=4)
        clusterer.fit(data)
                ## The higher (up to 1) the better
        print("Silhouette Coefficient for k == %s: %s" % (
        k, round(metrics.silhouette_score(data, clusterer.labels_), 4)))
        ## The higher (up to 1) the better
        print("Homogeneity score for k == %s: %s" % (
        k, round(metrics.homogeneity_score(processed['PAM50 mRNA'], clusterer.labels_),4)))
        print("------------------------")