In [1]:
%matplotlib inline
from ExKMC.Tree import Tree
from sklearn.datasets import make_blobs
import gdown
import pandas as pd
import copy
from sklearn.cluster import KMeans
from utils import calc_cost, plot_kmeans, plot_tree_boundary,plot_confusion_matrix
from sklearn.preprocessing import StandardScaler, normalize
from utils import plot_confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os


In [2]:
# print(os.path.join(os.path.dirname(__file__),"./data/negtive.csv"))
# print(os.path.join(os.path.abspath('algorithms/Kmeans+.ipynb'),"./data/negtive.csv"))
# print(os.path.abspath('algorithms/Kmeans+.ipynb'))

## data preprocessing

In [3]:
# input data
def getDataDrive(url, output, isImport=False):
    """
    Read a CSV file into a pandas dataframe, optionally downloading it first.

    url      -- address handed to gdown (only used when isImport is true)
    output   -- local path of the CSV file; also the download destination
    isImport -- when true, download the file to `output` before reading it

    return pandas dataframe
    """
    if not isImport:
        # File is expected to be on disk already: just read it.
        return pd.read_csv(output)

    # Fetch the file to `output`, then read it from there.
    gdown.download(url=url, output=output, quiet=False)
    return pd.read_csv(output)


# Directory that holds the two CSV files. Set the THESIS_DATA_DIR environment
# variable to run the notebook on another machine; when it is not set, the
# location used originally is kept.
DATA_DIR = os.environ.get(
    "THESIS_DATA_DIR", "/home/sfy/Documents/VScodeProject/Thesis/data"
)

# Negative-class samples (file name kept exactly as in the original path).
neg = getDataDrive(
    url="https://drive.google.com/uc?id=1ocidTn7jUvCrLG_XJ6H9MiNUDexCkjFG",
    output=os.path.join(DATA_DIR, "negtive.csv"),
)
# Positive-class samples.
pos = getDataDrive(
    url="https://drive.google.com/uc?id=1IyMPjACBkz96giGJ-Z4IMk-qzM-1CJ9G",
    output=os.path.join(DATA_DIR, "positive.csv"),
)

# Independent deep copies: later cells add a column to X_neg / X_pos, and the
# frames as loaded (neg / pos) must not be affected by that.
X_neg = neg.copy(deep=True)
X_pos = pos.copy(deep=True)


In [4]:
# Class labels: 1 for every positive row, 0 for every negative row.
pos_target = [1] * X_pos.shape[0]
neg_target = [0] * X_neg.shape[0]

X_pos['y'] = pos_target
X_neg['y'] = neg_target

# Stack the positives on top of the negatives (row indices of both frames are kept).
X__ = pd.concat([X_pos, X_neg])

# exclude name -> X_
X_ = X__.loc[:, ~X__.columns.isin(['name'])]

# exclude label -> X (the label itself goes to y)
y = X_['y']
X = X_.loc[:, ~X_.columns.isin(['y'])]

# data preprocess
# TODO train_test_split(X, y, test_size=0.4, random_state=0)

# Standardize the feature columns
scaler = StandardScaler()
scaled_df = scaler.fit(X).transform(X)

# Normalize the standardized data with sklearn's `normalize` (default settings)
normalized_df = normalize(scaled_df)

# Wrap the numpy array in a pandas DataFrame again (default integer column labels)
X = pd.DataFrame(normalized_df)


In [5]:
# Dimensions of the preprocessed feature frame: (number of rows, number of columns).
X.shape

(41257, 696)

## ExKMC algorithm

In [6]:
# Pre-train the reference k-means model on the preprocessed data.
k = 2  # number of clusters
n = X.shape[0]  # number of rows in X

kmeans = KMeans(n_clusters=k, random_state=42).fit(X)

# confusion_matrix
# plot_confusion_matrix(y, kmeans.predict(X), np.array(list(X_.columns)), normalize=True)

def visualize_2d(data, n_clusters=None, random_state=42):
    """Project ``data`` onto two principal components and plot k-means next to a tree.

    The data is reduced to 2-D with PCA, a fresh KMeans model is fitted on the
    projection, and an ExKMC ``Tree`` is fitted against that model. Both are
    handed to the plotting helpers ``plot_kmeans`` and ``plot_tree_boundary``.

    Parameters
    ----------
    data : DataFrame or array-like of shape (n_samples, n_features)
        Data to project; converted with ``np.asarray``.
    n_clusters : int, optional
        Number of clusters for the 2-D k-means model and the tree. Defaults to
        the notebook-level ``k``.
    random_state : int or None, default=42
        Seed for both PCA and KMeans. PCA only uses it with its randomized or
        arpack solvers, but without a seed the projection (and therefore the
        plot) may differ between runs.

    Returns
    -------
    None
    """
    # Imported inside the function, as in the original cell, so PCA is only
    # needed when the visualization is actually requested.
    from sklearn.decomposition import PCA

    if n_clusters is None:
        n_clusters = k

    pca = PCA(2, random_state=random_state)
    df = pca.fit_transform(np.asarray(data))

    kmeans_2 = KMeans(n_clusters, random_state=random_state)
    kmeans_2.fit(df)

    plot_kmeans(kmeans_2, x_data=df)

    # Tree with n_clusters leaves, fitted to reproduce the 2-D k-means model.
    tree_n = Tree(n_clusters)
    tree_n.fit(df, kmeans_2)

    plot_tree_boundary(tree_n, n_clusters, df, kmeans_2, plot_mistakes=True)

# visualize_2d(X)

In [7]:
# Cluster label of every row of X. Note that fit_predict fits the model on X
# again before returning the labels.
cluster_labels = kmeans.fit_predict(X)

In [8]:
# Centroid coordinates of the fitted k-means model, one row per cluster.
kmeans.cluster_centers_

array([[-0.00569809, -0.0144497 , -0.01241951, ...,  0.        ,
         0.        ,  0.        ],
       [-0.03171698, -0.01341128, -0.00573732, ...,  0.        ,
         0.        ,  0.        ]])

In [9]:
# Number of iterations the k-means fit ran.
kmeans.n_iter_

13

In [10]:
# test with inter 
def inter_visual():
    """Show yellowbrick's InterclusterDistance plot for the notebook's k-means model.

    Takes no arguments: it reads the notebook-level ``kmeans`` and ``X``.
    Returns nothing.
    """
    # Imported inside the function, so yellowbrick is only required when this
    # helper is actually called.
    from yellowbrick.cluster import InterclusterDistance

    distance_map = InterclusterDistance(kmeans)
    distance_map.fit(X)
    distance_map.show()


In [11]:
# Initialize the explainable tree for k clusters (k = 2 in this notebook).
# No max_leaves argument is passed, so the library default is used.
tree = Tree(k=k)

# Construct the tree against the pre-trained k-means model.
# Alternative that also returns the cluster labels:
# prediction = tree.fit_predict(X,kmeans)
tree.fit(X, kmeans)

# Tree plot saved to filename
# NOTE(review): X_.columns still contains 'y', while X no longer has that
# column, so this list is one entry longer than the features - adjust before use.
# tree.plot("test",feature_names=list(X_.columns))

<ExKMC.Tree.Tree at 0x7f84e33e36d0>

In [12]:
# Cluster label assigned by the tree to every row of X.
# NOTE(review): fit_predict presumably builds the tree again although it was
# fitted in the previous cell - confirm whether tree.predict(X) would suffice.
tree_labels = tree.fit_predict(X,kmeans)

In [13]:
# Show the labels returned by the tree.
tree_labels

array([0., 0., 0., ..., 0., 0., 1.])

In [14]:
# Cost comparison between the tree and the reference k-means model.
# - k-means cost (as defined in the paper): the sum of squared distances of each
#   point to the mean of the points associated with its cluster.
# - k-means score in sklearn (KMeans.score): the opposite of the value of X on
#   the k-means objective, i.e. a negative number.
# - surrogate cost: the sum of squared distances of each point to the closest
#   center of the k-means model given (or trained) in the fit method. The
#   surrogate cost is not smaller than the k-means cost, because the k-means
#   cost is computed with respect to the optimal centers.
# NOTE(review): the third label printed below reads "in surrogate", but the value
# is kmeans.score(X), sklearn's negated objective of the k-means model itself.

kmeas_cost = tree.score(X)
surrogate_score = tree.surrogate_score(X)
print(f"kmeans_cost is {kmeas_cost} \nsurrogate_score is {surrogate_score}\nkmenas cost in surrogate is {kmeans.score(X)}")


kmeans_cost is 37179.87612739552 
surrogate_score is 37290.76841843778
kmenas cost in surrogate is -36683.55002984239


## Evaluation

### inter and intra distance

In [15]:
from scipy.spatial import distance

In [16]:
# Intra-cluster distance: not computed yet.

# Inter-cluster distance: Euclidean distance between the two k-means centroids
# (rows 0 and 1 of kmeans.cluster_centers_).
first_center, second_center = kmeans.cluster_centers_[0], kmeans.cluster_centers_[1]
dst = distance.euclidean(first_center, second_center)

In [17]:
# Show the distance between the two centroids.
dst

0.5800166565176593

In [None]:
# This cell held an unfinished attribute access (`tree.get_ce`, apparently cut off
# during tab-completion) that was never executed. Evaluating it would presumably
# raise AttributeError and stop a Restart & Run All, so it is disabled here.
# The centers are queried with `tree.get_centers()` after the tree has been
# re-created from the ExKMC_M package further down.

In [21]:
import sys

# Make the parent directory importable (presumably for the ExKMC_M import in
# the next cell). The membership check keeps the cell idempotent: re-running
# it no longer appends a duplicate entry each time.
# NOTE(review): '../' is resolved against the kernel's working directory.
if '../' not in sys.path:
    sys.path.append('../')

In [27]:
from ExKMC_M.ExKMC.Tree import Tree

In [28]:
# Re-create and re-fit the tree. `Tree` now refers to the ExKMC_M variant
# imported in the previous cell, which replaces the ExKMC class imported at the top.
tree = Tree(k=k)
tree.fit(X, kmeans)

<ExKMC_M.ExKMC.Tree.Tree at 0x7f84b62bbfd0>

In [29]:
# Centers reported by the fitted ExKMC_M tree (presumably one row per cluster - confirm).
tree.get_centers()

array([[-0.17344127, -0.07444386, -0.02251826, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.18619418,  0.07991762,  0.02417399, ...,  0.        ,
         0.        ,  0.        ]])

### Other useful built-in metrics (needs one attribute to be implemented)

In [18]:
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def bench_k_means(kmeans, name, data, labels, sample_size=300, random_state=42):
    """Fit a k-means estimator on standardized data and print one line of metrics.

    A copy of ``kmeans`` is fitted inside a ``StandardScaler -> KMeans``
    pipeline. The copy matters: a pipeline fits its final step in place, so
    passing the caller's object directly would silently refit a model that
    other cells still rely on.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance with its configuration
        already set. Only its parameters are used; the object is not modified.
    name : str
        Name given to the strategy. It is printed in the first column.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        The labels used to compute the clustering metrics which require some
        supervision.
    sample_size : int, default=300
        Number of samples drawn to compute the silhouette score.
    random_state : int, RandomState instance or None, default=42
        Seed for drawing the silhouette subsample, so that the silhouette
        value does not depend on an unseeded random draw.

    Returns
    -------
    None
        The result is printed as one tab-separated line: name, fit time,
        inertia, homogeneity, completeness, V-measure, adjusted Rand index,
        adjusted mutual information, silhouette.
    """
    # Local import: `clone` is only needed here.
    from sklearn.base import clone

    # safe=False falls back to a deep copy for objects that are not
    # scikit-learn estimators.
    kmeans_copy = clone(kmeans, safe=False)

    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans_copy).fit(data)
    fit_time = time() - t0
    fitted_kmeans = estimator[-1]
    results = [name, fit_time, fitted_kmeans.inertia_]

    # Define the metrics which require only the true labels and estimator
    # labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, fitted_kmeans.labels_) for m in clustering_metrics]

    # The silhouette score requires the full dataset. It is computed on
    # ``data`` as passed in (not on the standardized copy the pipeline used).
    results += [
        metrics.silhouette_score(
            data,
            fitted_kmeans.labels_,
            metric="euclidean",
            sample_size=sample_size,
            random_state=random_state,
        )
    ]

    # Show the results
    formatter_result = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(formatter_result.format(*results))


In [19]:
# Benchmark the k-means configuration on X against the true labels y.
# NOTE(review): X was already standardized and normalized during preprocessing,
# and bench_k_means applies StandardScaler once more inside its pipeline.
bench_k_means(kmeans=kmeans,name='kmeans',data=X,labels=y)

kmeans   	7.233s	16612858	0.017	0.017	0.017	0.024	0.017	0.103


In [None]:
# bench_k_means(kmeans=tree,name='tree',data=X,labels=y)
