In [6]:
%matplotlib inline
from ExKMC.Tree import Tree
from sklearn.datasets import make_blobs
import gdown
import pandas as pd
import copy
from sklearn.cluster import KMeans
from utils import calc_cost, plot_kmeans, plot_tree_boundary,plot_confusion_matrix
from sklearn.preprocessing import StandardScaler, normalize
from utils import plot_confusion_matrix
import numpy as np
import matplotlib.pyplot as plt


## data preprocessing

In [7]:
# input data
def getDataDrive(url, output, isImport=False):
    """
    Load a CSV file into a pandas DataFrame, fetching it with gdown when needed.

    Parameters
    ----------
    url : str
        Download URL, passed unchanged to ``gdown.download``.
    output : str
        Local path of the CSV file. It is both the download target and the
        file that is read.
    isImport : bool, default False
        True forces a (re-)download. With False the local file is used and a
        download only happens when ``output`` does not exist yet, so the
        notebook also runs on a machine that has no cached copy.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv(output)``.
    """
    import os  # local import: the notebook's import cell does not bring in os

    if isImport or not os.path.exists(output):
        # Make sure the target directory exists so the download can be written.
        parent = os.path.dirname(output)
        if parent:
            os.makedirs(parent, exist_ok=True)
        gdown.download(url=url, output=output, quiet=False)
    return pd.read_csv(output)


# Load the two classes. isImport is not passed, so getDataDrive's default applies.
neg = getDataDrive(
    output="./data/negtive.csv",
    url="https://drive.google.com/uc?id=1ocidTn7jUvCrLG_XJ6H9MiNUDexCkjFG",
)
pos = getDataDrive(
    output="./data/positive.csv",
    url="https://drive.google.com/uc?id=1IyMPjACBkz96giGJ-Z4IMk-qzM-1CJ9G",
)

# Work on deep copies so the frames as loaded (neg / pos) stay untouched.
X_neg = neg.copy(deep=True)
X_pos = pos.copy(deep=True)


In [8]:
# Label the two classes: 1 = positive, 0 = negative.
# A scalar is broadcast to every row, giving the same values as a per-row list.
X_pos['y'] = 1
X_neg['y'] = 0

# Stack both classes. ignore_index=True gives the rows a fresh 0..n-1 index;
# without it each frame keeps its own index, the labels repeat, and `y` no
# longer lines up with the freshly indexed `X` built at the end of this cell.
X__ = pd.concat([X_pos, X_neg], ignore_index=True)

# exclude name -> X_
X_ = X__.loc[:, X__.columns != 'name']

# exclude label -> X
y = X_['y']
X = X_.loc[:, X_.columns != 'y']

# data preprocess
# Standardize data (per column)
scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)

# Normalizing the data (per row)
normalized_df = normalize(scaled_df)

# Converting the numpy array into a pandas DataFrame
# (columns become 0..n_features-1; the original names remain in X_.columns)
X = pd.DataFrame(normalized_df)


In [9]:
# Sanity check: (rows, columns) of the preprocessed feature matrix
X.shape

(41257, 696)

## ExKMC algorithm

In [10]:
# kmeans pre train: reference clustering that the tree is later fitted against
k = 2
n = X.shape[0]

# n_init is set explicitly because its default depends on the scikit-learn
# version (10 in older releases, 'auto' from 1.4); pinning it keeps the
# clustering, and every cost derived from it, reproducible across versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X)

# confusion_matrix
# plot_confusion_matrix(y, kmeans.predict(X), np.array(list(X_.columns)), normalize=True)

def visualize_2d(data):
    """
    Project ``data`` onto two principal components and plot the clusterings.

    A k-means model with ``k`` clusters (``k`` is the notebook-level variable)
    is fitted on the 2-D projection and drawn with ``plot_kmeans``; an ExKMC
    ``Tree`` is then fitted against that model and drawn with
    ``plot_tree_boundary`` (with ``plot_mistakes=True``).

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Feature matrix to project. Anything ``np.asarray`` can convert to a
        2-D numeric array is accepted.

    Returns
    -------
    None
        The function only produces plots.
    """
    from sklearn.decomposition import PCA

    # Seeded because PCA may select a randomized solver depending on the
    # input shape and scikit-learn version; the seed keeps the plot stable.
    pca = PCA(2, random_state=42)
    df = pca.fit_transform(np.asarray(data))

    # n_init pinned for the same reason: its default differs between
    # scikit-learn versions.
    kmeans_2 = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans_2.fit(df)

    plot_kmeans(kmeans_2, x_data =df)

    tree_n = Tree(k)
    tree_n.fit(df, kmeans_2)

    plot_tree_boundary(tree_n, k, df, kmeans_2, plot_mistakes=True)

# visualize_2d(X)

In [11]:
# Initialize the tree for k clusters (k is set in the k-means cell).
# max_leaves is not passed, so ExKMC's default applies
# (presumably k leaves, one per cluster - TODO confirm against the ExKMC docs).
tree = Tree(k=k)

# Construct the tree against the pre-trained k-means model.
# To also get the cluster labels back, use fit_predict instead:
# prediction = tree.fit_predict(X,kmeans)
tree.fit(X, kmeans)

# Tree plot saved to filename.
# Feature names exclude the label column 'y', which is not part of X, so the
# list has exactly one name per feature the tree was fitted on.
tree.plot("test", feature_names=[col for col in X_.columns if col != 'y'])

In [19]:
# Cost comparison
# - k-means cost (paper): sum of squared distances of each point to the mean
#   of the points assigned to its cluster.
# - sklearn KMeans.score: the opposite (negative) of the k-means objective on X.
# - surrogate cost: sum of squared distances of each point to the closest
#   center of the k-means model given (or trained) in fit. It is >= the k-means
#   cost, because the k-means cost is computed with respect to the optimal
#   centers (the cluster means).

kmeas_cost = tree.score(X)
surrogate_score = tree.surrogate_score(X)
print(
    f"kmeans_cost is {kmeas_cost} \n"
    f"surrogate_score is {surrogate_score}\n"
    f"kmenas cost in surrogate is {kmeans.score(X)}"
)


kmeans_cost is 37179.87612739552 
 surrogate_score is 37290.76841843778
 kmenas cost in surrogate is -36683.55002984238


## ICOT algorithm