In [None]:
#About The Project
# About Project
#Breast cancer is primarily divided into five molecular subtypes using PAM50.
#The subtypes are Luminal A, Luminal B, HER2, Basal-like and Normal. This project aims to classify breast cancer subtypes using a proteomic dataset and two machine learning algorithms (KMeans and Linear Regression), which are unsupervised and supervised learning methods, respectively.
#The aim of the project is to determine the accuracy and efficacy of machine learning algorithms in classifying breast cancer subtypes by comparing the results of the classification provided by each algorithm with the general PAM50 subtype classification.

In [None]:
# Datasets Summary
# The first dataset, 77_cancer_proteomes_CPTAC_itraq.csv, contains published iTRAQ proteome profiling of 77 breast cancer samples generated by the Clinical Proteomic Tumor Analysis Consortium (NCI/NIH). It contains expression values for ~12,000 proteins for each sample, with missing values present when a given protein could not be quantified in a given sample.
# The second dataset, clinical_data_breast_cancer.csv, contains clinical data for 105 patients that were analyzed and sampled; it includes features such as gender and age, as well as the type of cancer classification.
# The third dataset, PAM50_proteins.csv contains the list of genes and proteins used by the PAM50 classification system. The column RefSeqProteinID contains the protein IDs that can be matched with the IDs in the main protein expression data set.

In [None]:
# Import Packages

import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load data
# Directory that holds the three CSV inputs. It defaults to the original
# author's location; set the REU_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "REU_DATA_DIR",
    "C:/Users/kenny/Downloads/Documents/REU-DataScienceProgram/REU-Project",
))

# df  : proteome table (joined below on 'RefSeq_accession_number')
# df1 : PAM50 protein list (joined below on 'RefSeqProteinID')
# df2 : clinical table (provides 'Complete TCGA ID' and 'PAM50 mRNA')
df = pd.read_csv(DATA_DIR / "CancerProteomes.csv")
df1 = pd.read_csv(DATA_DIR / "PAM50 Proteins.csv")
df2 = pd.read_csv(DATA_DIR / "ClinicalDataBreastCancer.csv")


In [None]:
# Restrict the proteome table to the PAM50 proteins: merge df with df1 where
# df's 'RefSeq_accession_number' equals df1's 'RefSeqProteinID', then keep the
# accession column plus the per-sample expression columns listed below.
SAMPLE_COLUMNS = [
    'AO-A12D.01TCGA', 'C8-A131.01TCGA', 'AO-A12B.01TCGA', 'BH-A18Q.02TCGA', 'C8-A130.02TCGA',
    'C8-A138.03TCGA', 'E2-A154.03TCGA', 'C8-A12L.04TCGA', 'A2-A0EX.04TCGA', 'AO-A12D.05TCGA',
    'AN-A04A.05TCGA', 'BH-A0AV.05TCGA', 'C8-A12T.06TCGA', 'A8-A06Z.07TCGA', 'A2-A0CM.07TCGA',
    'BH-A18U.08TCGA', 'A2-A0EQ.08TCGA', 'AR-A0U4.09TCGA', 'AO-A0J9.10TCGA', 'AR-A1AP.11TCGA',
    'AN-A0FK.11TCGA', 'AO-A0J6.11TCGA', 'A7-A13F.12TCGA', 'BH-A0E1.12TCGA', 'A7-A0CE.13TCGA',
    'A2-A0YC.13TCGA', 'AO-A0JC.14TCGA', 'A8-A08Z.14TCGA', 'AR-A0TX.14TCGA', 'A8-A076.15TCGA',
    'AO-A126.15TCGA', 'BH-A0C1.16TCGA', 'A2-A0EY.16TCGA', 'AR-A1AW.17TCGA', 'AR-A1AV.17TCGA',
    'C8-A135.17TCGA', 'A2-A0EV.18TCGA', 'AN-A0AM.18TCGA', 'D8-A142.18TCGA', 'AN-A0FL.19TCGA',
    'BH-A0DG.19TCGA', 'AR-A0TV.20TCGA', 'C8-A12Z.20TCGA', 'AO-A0JJ.20TCGA', 'AO-A0JE.21TCGA',
    'AN-A0AJ.21TCGA', 'A7-A0CJ.22TCGA', 'AO-A12F.22TCGA', 'A8-A079.23TCGA', 'A2-A0T3.24TCGA',
    'A2-A0YD.24TCGA', 'AR-A0TR.25TCGA', 'AO-A03O.25TCGA', 'AO-A12E.26TCGA', 'A8-A06N.26TCGA',
    'A2-A0YG.27TCGA', 'BH-A18N.27TCGA', 'AN-A0AL.28TCGA', 'A2-A0T6.29TCGA', 'E2-A158.29TCGA',
    'E2-A15A.29TCGA', 'AO-A0JM.30TCGA', 'C8-A12V.30TCGA', 'A2-A0D2.31TCGA', 'C8-A12U.31TCGA',
    'AR-A1AS.31TCGA', 'A8-A09G.32TCGA', 'C8-A131.32TCGA', 'C8-A134.32TCGA', 'A2-A0YF.33TCGA',
    'BH-A0DD.33TCGA', 'BH-A0E9.33TCGA', 'AR-A0TT.34TCGA', 'AO-A12B.34TCGA', 'A2-A0SW.35TCGA',
    'AO-A0JL.35TCGA', 'BH-A0BV.35TCGA', 'A2-A0YM.36TCGA', 'BH-A0C7.36TCGA', 'A2-A0SX.36TCGA',
    '263d3f-I.CPTAC', 'blcdb9-I.CPTAC', 'c4155b-C.CPTAC',
]

df3 = df.merge(
    df1, left_on='RefSeq_accession_number', right_on='RefSeqProteinID'
)[['RefSeq_accession_number'] + SAMPLE_COLUMNS]

# Display the merged frame (previously done by repeating the whole expression).
df3


In [None]:
## Impute missing values (maybe another method would work better?)
# `Imputer` no longer exists in scikit-learn; `SimpleImputer` is its
# replacement. SimpleImputer only imputes column-wise, so the former
# `axis=1` (row-wise median) behaviour is reproduced by fitting on a matrix
# whose columns are the samples and transposing the result afterwards.
#
# NOTE(review): `processed_numerical_p50` was not defined anywhere in the
# visible notebook. It is derived here from the PAM50-filtered frame `df3`
# (accession column dropped) and returned as samples x proteins - confirm this
# is the intended input and orientation.
protein_by_sample = df3.drop(columns='RefSeq_accession_number')

imputer = SimpleImputer(strategy='median')
# Each sample column's missing values are replaced by that sample's median;
# `.T` makes every row of the result one sample.
processed_numerical_p50 = imputer.fit_transform(protein_by_sample).T

In [None]:
# Perform a KMeans clustering


In [None]:
# Build a synthetic dataset with make_blobs(), scikit-learn's convenience
# function for generating synthetic clusters; it returns the points together
# with the cluster each point was drawn from.
features, true_labels = make_blobs(
    n_samples=200,
    centers=5,
    cluster_std=2.75,
    random_state=1,
)

In [None]:
# Silhouette analysis of K-Means on `features` for several cluster counts.
# For each k: print the mean silhouette score, draw the per-sample silhouette
# plot (left panel) and the clustered points with their centres (right panel).
range_n_clusters = [3, 4, 5]
cmap = plt.get_cmap("nipy_spectral")

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(features) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility. n_init is pinned so the result does not
    # depend on the installed scikit-learn version's default.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=10)
    cluster_labels = clusterer.fit_predict(features)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(features, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Per-sample silhouette values, drawn as one horizontal band per cluster.
    sample_silhouette_values = silhouette_samples(features, cluster_labels)

    y_lower = 10
    for cluster_id in range(n_clusters):
        cluster_values = sample_silhouette_values[cluster_labels == cluster_id]
        cluster_values.sort()

        y_upper = y_lower + cluster_values.shape[0]
        color = cmap(float(cluster_id) / n_clusters)
        ax1.fill_betweenx(list(range(y_lower, y_upper)), 0, cluster_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the band with its cluster number at the band's middle.
        ax1.text(-0.05, y_lower + 0.5 * cluster_values.shape[0], str(cluster_id))

        # Leave the 10-row gap before the next cluster's band.
        y_lower = y_upper + 10

    ax1.set_title("Silhouette plot for the various clusters")
    ax1.set_xlabel("Silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Vertical line marking the average silhouette score over all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])

    # The 2nd subplot shows the points coloured by assigned cluster, with the
    # cluster centres overlaid.
    point_colors = cmap(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(features[:, 0], features[:, 1], marker=".", s=30, alpha=0.7,
                c=point_colors)
    centers = clusterer.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1], marker="o", c="white", s=200,
                edgecolor="k")
    ax2.set_title("Clustered data")
    ax2.set_xlabel("Feature 1")
    ax2.set_ylabel("Feature 2")

    fig.suptitle("Silhouette analysis for KMeans with n_clusters = %d" % n_clusters)

    # Render this figure now so it appears next to its printed score and is
    # not kept open for the rest of the loop.
    plt.show()





In [None]:

kmeans = KMeans(n_clusters=5).fit(df3)
centroids = kmeans.cluster_centers_
print(centroids)

In [None]:
plt.scatter(df['x'], df['y'], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()

In [None]:
# Read dataset two (the clinical CSV) into the DataFrame `df2`.
# This is the same file path the "Load data" cell reads into `df2`.
df2 = pd.read_csv(
    "C:/Users/kenny/Downloads/Documents/REU-DataScienceProgram/REU-Project/ClinicalDataBreastCancer.csv"
)

In [None]:
# Keep only the 'Complete TCGA ID' and 'PAM50 mRNA' columns of the clinical
# table; all rows are retained.
df4 = df2.loc[:, ['Complete TCGA ID', 'PAM50 mRNA']]

In [None]:
# Fit a 5-cluster K-Means model on df4 and print its centroids, then draw a
# scatter plot coloured by cluster label with the centroids overlaid in red.
# Running this cell rebinds `kmeans` and `centroids` from the earlier cell.
# NOTE(review): df4 consists of the 'Complete TCGA ID' and 'PAM50 mRNA'
# columns. If those hold text (IDs / subtype names), KMeans.fit will presumably
# reject them - confirm, and encode or replace this step. No random_state is
# set, so results may differ between runs.
kmeans = KMeans(n_clusters=5).fit(df4)
centroids = kmeans.cluster_centers_
print(centroids)

# NOTE(review): no visible cell creates 'x' or 'y' columns on `df` - verify
# they exist. Also confirm that `df` has one row per entry of kmeans.labels_
# (the model above is fitted on df4, not df).
plt.scatter(df['x'], df['y'], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
# Overlay the first two coordinates of every centroid.
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()