In [1]:
import numpy as np
import pandas as pd


data = pd.read_csv("data/wdbc.data", sep=",", header=None)

In [2]:
# denote features and improve readability
features = ["radius", "texture", "perimeter", "area", "smoothness", "compactness", "concavity", "concave pts", 
            "symmetry", "frac. dim"]
features3 = []
descr = ["mean", "stderr", "worst"]
for i in range(30):
    if i < 10: 
        features3.append(descr[0] + " "+ features[i%10])
    elif i < 20: 
        features3.append(descr[1] + " " + features[i%10])
    else: 
        features3.append(descr[2] + " " + features[i%10])
data.columns = ["ID", "Malignant/Benign"] + features3

data_list = data.values.tolist()
data_y = []
data_x = []
for elem1 in data_list:
    if elem1[1] == 'M':
        data_y.append(1)
    else:
        data_y.append(0)
for elem2 in data_list:
    data_x.append(elem2[2:])

train_data = np.array(data_x[0:469])
train_data_labels = np.array(data_y[0:469])
test_data = np.array(data_x[469:])
test_data_labels = np.array(data_y[469:])

In [3]:
def get_distance(x1, x2):
    sum = 0
    for i in range(len(x1)):
        sum += (x1[i] - x2[i]) ** 2
    return np.sqrt(sum)

def kmeans(X, k, max_iters):
    
    centroids = X[np.random.choice(range(len(X)), k, replace=False)]
    
    converged = False
    
    current_iter = 0
    while (not converged) and (current_iter < max_iters):
        cluster_list = [[] for i in range(len(centroids))]
        
        for x in X:
            distances_list = []
            for c in centroids:
                distances_list.append(get_distance(c, x))
            cluster_list[int(np.argmin(distances_list))].append(x)
            
        cluster_list = list((filter(None, cluster_list)))
        
        prev_centroids = centroids.copy()
        
        centroids = []
        
        for j in range(len(cluster_list)):
            centroids.append(np.mean(cluster_list[j], axis=0))
            
        pattern = np.abs(np.sum(prev_centroids) - np.sum(centroids))
        
        print('K-MEANS: ', int(pattern))
        
        converged = (pattern == 0)
        
        current_iter += 1
        
    return np.array(centroids), [np.std(x) for x in cluster_list]

class RBF:
    
    def __init__(self, X, y, tX, ty, num_of_classes, k, std_from_clusters=True):
        self.X = X
        self.y = y
        # print(np.transpose(self.y[np.newaxis]))
                        
        self.tX = tX
        self.ty = ty
        
        self.number_of_classes = num_of_classes
        self.k = k
        self.std_from_clusters = std_from_clusters
        
    def convert_to_one_hot(self, x, num_of_classes):
        arr = np.zeros((len(x), num_of_classes))
        for i in range(len(x)):
            c = int(x[i])
            arr[i][c] = 1
        return arr
    
    def rbf(self, x, c, s):
        distance = get_distance(x, c)
        return 1 / np.exp(-distance / s ** 2)
    
    def rbf_list(self, X, centroids, std_list):
        RBF_list = []
        for x in X:
            RBF_list.append([self.rbf(x, c, s) for (c, s) in zip(centroids, std_list)])
        return np.array(RBF_list)
    
    def fit(self):
    
        self.centroids, self.std_list = kmeans(self.X, self.k, max_iters = 1000)

        if not self.std_from_clusters:
            dMax = np.max([get_distance(c1, c2) for c1 in self.centroids for c2 in self.centroids])
            self.std_list = np.repeat(dMax / np.sqrt(2 * self.k), self.k)

        RBF_X = self.rbf_list(self.X, self.centroids, self.std_list)

        step = 0.0005
        w_new = np.zeros((8, 2))
        MSE_lst = np.array([])
        
        y_code = self.convert_to_one_hot(self.y, self.number_of_classes)
        print(np.shape(y_code))
                            
        for iteration in range(100):
            error = np.array([])
            
            w_old = w_new
            pred_y = np.matmul(RBF_X, w_old)
                        
            # pred_y = np.array([np.argmax(x) for x in pred_y])
                        
            error = np.subtract(pred_y, y_code)
                        
            MSE = (error**2).mean()
            MSE_lst = np.append(MSE_lst, MSE)
                 
            w_new = w_old - step * np.sum(error)
            
        print(w_new)
                 
        RBF_list_tst = self.rbf_list(self.tX, self.centroids, self.std_list)

        self.pred_ty = np.matmul(RBF_list_tst, w_new)

        self.pred_ty = np.array([np.argmax(x) for x in self.pred_ty])
        
        diff = self.pred_ty - self.ty

        print('Accuracy: ', len(np.where(diff == 0)[0]) / len(diff))

In [4]:
RBF_CLASSIFIER = RBF(train_data, train_data_labels, test_data, test_data_labels, num_of_classes = 2, k = 8,
                     std_from_clusters=False)

RBF_CLASSIFIER.fit()

K-MEANS:  1105
K-MEANS:  435
K-MEANS:  309
K-MEANS:  292
K-MEANS:  186
K-MEANS:  17
K-MEANS:  14
K-MEANS:  10
K-MEANS:  10
K-MEANS:  7
K-MEANS:  11
K-MEANS:  7
K-MEANS:  7
K-MEANS:  7
K-MEANS:  8
K-MEANS:  0
(469, 2)
[[-7.09814741e+42 -7.09814741e+42]
 [-7.09814741e+42 -7.09814741e+42]
 [-7.09814741e+42 -7.09814741e+42]
 [-7.09814741e+42 -7.09814741e+42]
 [-7.09814741e+42 -7.09814741e+42]
 [-7.09814741e+42 -7.09814741e+42]
 [-7.09814741e+42 -7.09814741e+42]
 [-7.09814741e+42 -7.09814741e+42]]
Accuracy:  0.77
