In [1]:
# Apply the clustering algorithms to the same dataset to which you just applied the dimensionality reduction algorithms 
# In other words, treat the clustering algorithms as if they were dimensionality reduction algorithms. 
# Again, rerun your neural network learner on the newly projected data.

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import scale
import sklearn.metrics as sm
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import SparseRandomProjection
from sklearn.feature_selection import SelectPercentile
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
%matplotlib inline

In [2]:
def model_result(x_data, y_data):

    X, X_test_holdout, y, y_test_holdout = train_test_split(x_data, y_data, test_size=0.2)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    clf = MLPClassifier(hidden_layer_sizes=(100,100), alpha=1.0, max_iter=500)
    clf.fit(X, y)
    y_pred = clf.predict(X_test_holdout)

    train_score = clf.score(X, y)
    test_score = clf.score(X_test_holdout, y_test_holdout)
    cm = sm.confusion_matrix(y_test_holdout, y_pred)
    return train_score, test_score, cm

def get_fresh_data():
    cancer = load_breast_cancer()
    X = scale(cancer.data)
    y = pd.DataFrame(cancer.target)
    y = y.values.flatten()
    return X,y

In [3]:
#################### Kmeans PCA Breastcancer NN ###############################
print "KMeans"
X,y = get_fresh_data()

pca = PCA(n_components=10).fit(X)
transformed_data = pca.transform(X)

clustering = KMeans(n_clusters=2, random_state = 5)
clustering.fit(transformed_data)
y = clustering.labels_


train_scores = []
test_scores = []
for x in range(0,9):
    train_score, test_score, cm = model_result(transformed_data,y)
    train_scores.append(train_score) 
    test_scores.append(test_score)
    if x == 8:
        print cm
    
print "train: ", np.mean(train_scores)
print "test: ", np.mean(test_scores)
print


#################### EM PCA Breastcancer NN ###############################
print "EM"
X,y = get_fresh_data()

pca = PCA(n_components=10).fit(X)
transformed_data = pca.transform(X)

gmm=GaussianMixture(n_components=2, covariance_type="full").fit(transformed_data)
labels = gmm.predict(transformed_data)
y = labels


train_scores = []
test_scores = []
for x in range(0,9):
    train_score, test_score, cm = model_result(transformed_data,y)
    train_scores.append(train_score) 
    test_scores.append(test_score)
    if x == 8:
        print cm
    
print "train: ", np.mean(train_scores)
print "test: ", np.mean(test_scores)

KMeans
[[41  0]
 [ 0 73]]
train:  0.998778998779
test:  0.990253411306

EM
[[21  5]
 [ 8 80]]
train:  0.981684981685
test:  0.928849902534


In [4]:
#################### Kmeans ICA Breastcancer NN ###############################
print "KMeans"
X,y = get_fresh_data()

ica = FastICA(n_components=10).fit(X)
transformed_data = ica.transform(X)

clustering = KMeans(n_clusters=2, random_state = 5)
clustering.fit(transformed_data)
y = clustering.labels_


train_scores = []
test_scores = []
for x in range(0,9):
    train_score, test_score, cm = model_result(transformed_data,y)
    train_scores.append(train_score) 
    test_scores.append(test_score)
    if x == 8:
        print cm
    
print "train: ", np.mean(train_scores)
print "test: ", np.mean(test_scores)
print


#################### EM ICA Breastcancer NN ###############################
print "EM"
X,y = get_fresh_data()

ica = FastICA(n_components=10).fit(X)
transformed_data = ica.transform(X)

gmm=GaussianMixture(n_components=2, covariance_type="full").fit(transformed_data)
labels = gmm.predict(transformed_data)
y = labels


train_scores = []
test_scores = []
for x in range(0,9):
    train_score, test_score, cm = model_result(transformed_data,y)
    train_scores.append(train_score) 
    test_scores.append(test_score)
    if x == 8:
        print cm
    
print "train: ", np.mean(train_scores)
print "test: ", np.mean(test_scores)

KMeans
[[36  3]
 [ 0 75]]
train:  0.980952380952
test:  0.976608187135

EM
[[114]]
train:  0.998290598291
test:  0.998050682261


In [5]:
#################### Kmeans RandomProj Breastcancer NN ###############################
print "KMeans"
X,y = get_fresh_data()

sp = SparseRandomProjection(n_components = 10)
transformed_data = sp.fit_transform(X)

clustering = KMeans(n_clusters=2, random_state = 5)
clustering.fit(transformed_data)
y = clustering.labels_


train_scores = []
test_scores = []
for x in range(0,9):
    train_score, test_score, cm = model_result(transformed_data,y)
    train_scores.append(train_score) 
    test_scores.append(test_score)
    if x == 8:
        print cm
    
print "train: ", np.mean(train_scores)
print "test: ", np.mean(test_scores)
print


#################### EM RandomProj Breastcancer NN ###############################
print "EM"
X,y = get_fresh_data()

sp = SparseRandomProjection(n_components = 10)
transformed_data = sp.fit_transform(X)

gmm=GaussianMixture(n_components=2, covariance_type="full").fit(transformed_data)
labels = gmm.predict(transformed_data)
y = labels


train_scores = []
test_scores = []
for x in range(0,9):
    train_score, test_score, cm = model_result(transformed_data,y)
    train_scores.append(train_score) 
    test_scores.append(test_score)
    if x == 8:
        print cm
    
print "train: ", np.mean(train_scores)
print "test: ", np.mean(test_scores)

KMeans
[[75  2]
 [ 0 37]]
train:  1.0
test:  0.990253411306

EM
[[63  3]
 [ 6 42]]
train:  0.985103785104
test:  0.931773879142


In [6]:
#################### Kmeans Univariate Breastcancer NN ###############################
print "KMeans"
X,y = get_fresh_data()

select = SelectPercentile(percentile=33)
select.fit(X, y)
transformed_data = select.transform(X)

clustering = KMeans(n_clusters=2, random_state = 5)
clustering.fit(transformed_data)
y = clustering.labels_


train_scores = []
test_scores = []
for x in range(0,9):
    train_score, test_score, cm = model_result(transformed_data,y)
    train_scores.append(train_score) 
    test_scores.append(test_score)
    if x == 8:
        print cm
    
print "train: ", np.mean(train_scores)
print "test: ", np.mean(test_scores)
print


#################### EM Univariate Breastcancer NN ###############################
print "EM"
X,y = get_fresh_data()

sp = SparseRandomProjection(n_components = 10)
transformed_data = sp.fit_transform(X)

gmm=GaussianMixture(n_components=2, covariance_type="full").fit(transformed_data)
labels = gmm.predict(transformed_data)
y = labels


train_scores = []
test_scores = []
for x in range(0,9):
    train_score, test_score, cm = model_result(transformed_data,y)
    train_scores.append(train_score) 
    test_scores.append(test_score)
    if x == 8:
        print cm
    
print "train: ", np.mean(train_scores)
print "test: ", np.mean(test_scores)

KMeans
[[83  0]
 [ 0 31]]
train:  0.999267399267
test:  0.998050682261

EM
[[62  6]
 [ 1 45]]
train:  0.984126984127
test:  0.941520467836
