In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
from sklearn.cluster import KMeans

In [2]:
# Seed the random number generators up front so reruns are repeatable.
# The stdlib `random` module alone is not enough: NumPy-based libraries
# draw from NumPy's own global generator when they are not handed an
# explicit random state, so that generator is seeded as well.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Read the data table, discard its first column, and transpose the rest
# so that the original columns become the rows of `features`.
raw_table = pd.read_csv("nci.data.csv")
features = raw_table.iloc[:, 1:].T

# Read the labels file (no header row) and trim whitespace around each label.
label_table = pd.read_csv("nci.label.txt", header = None)
labels = label_table[0].str.strip()

# Collapse the "-repro" variants onto their base label.
repro_to_base = {
    'K562A-repro': 'K562',
    'K562B-repro': 'K562',
    'MCF7A-repro': 'MCF7',
    'MCF7D-repro': 'MCF7',
}
for repro_name, base_name in repro_to_base.items():
    labels.loc[labels == repro_name] = base_name

In [4]:
# Show the distinct labels that remain after the relabelling, in order of
# first appearance.
pd.unique(labels)

array(['CNS', 'RENAL', 'BREAST', 'NSCLC', 'UNKNOWN', 'OVARIAN',
       'MELANOMA', 'PROSTATE', 'LEUKEMIA', 'K562', 'COLON', 'MCF7'],
      dtype=object)

In [5]:
# Fit K-means with three clusters on the rows of `features`.
# n_init is pinned explicitly: scikit-learn's default number of restarts
# changed between releases (10 in older versions, "auto" in newer ones), so
# leaving it implicit makes the fitted clusters -- and the comparison with the
# book's table -- depend on the installed version.
# random_state fixes the centroid initialisation so the fit is repeatable.
kmeans = KMeans(n_clusters = 3, n_init = 10, random_state = 0).fit(features)

In [6]:
# Pair every label with the cluster id assigned to the matching row of
# `features`. Both Series carry a fresh 0..n-1 index so they line up by position.
cluster_ids = pd.Series(kmeans.predict(features), name = "prediction")
known_labels = labels.reset_index(drop=True).rename("label")
predictions = pd.concat([known_labels, cluster_ids], axis = 1)

In [7]:
# Contingency table with cluster ids as rows and labels as columns.
# NSCLC is the only one that's slightly different
# from the book's results (but in a way that's actually better)
pd.crosstab(predictions["prediction"], predictions["label"])

label,BREAST,CNS,COLON,K562,LEUKEMIA,MCF7,MELANOMA,NSCLC,OVARIAN,PROSTATE,RENAL,UNKNOWN
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2,0,7,2,6,2,0,0,0,0,0,0
1,2,0,0,0,0,0,7,0,0,0,0,0
2,3,5,0,0,0,0,1,9,6,2,9,1


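*The Elements of Statistical Learning* (ESL), p. 514: "We see that the procedure is successful at grouping together samples of the same cancer. In fact, the two breast cancers in the second cluster were later found to be misdiagnosed and were melanomas that had metastasized." (In the table above, that second cluster is the row labelled 1, which holds 2 BREAST and 7 MELANOMA samples.)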