In [3]:
# General imports
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import normalized_mutual_info_score 
from scipy.spatial import distance, distance_matrix
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

#### Import Data

In [4]:

# KDD dataset, one row per sample:
#   column 0 - BLOCK ID (class label): integers from 1 to 303, 153 of them distinct (this is k)
#   column 1 - ELEMENT ID (sample number): unique, in no particular order
#   column 2 - class of the example: 1 = homologous protein, 0 = non-homologous protein
# Every column after these three is handed to the clustering code as a feature.
data = pd.read_csv(
    'bio_train.csv',
    skiprows=0,
).to_numpy(dtype='object')

#####################
# Toy Dataset
#dataset = np.genfromtxt('dataset1_noCluster7.csv', delimiter = ',')[1:]
#dataset_ft = dataset[:,:2]
#dataset_lb = dataset[:,-1]

#scaler = StandardScaler()
#dataset_ft = scaler.fit_transform(dataset_ft)

#plt.scatter(dataset_ft[:,0], dataset_ft[:,1], c = dataset_lb)
#plt.show()

#### Data Preprocessing

In [5]:
# Shuffle, split into labels/features and normalize data
def process_data(data, seed=None):
    """Shuffle the raw table, split off the id columns and standardise the features.

    Parameters
    ----------
    data : np.ndarray, shape (n_samples, 3 + n_features)
        Raw table. Column 0 is returned as the block ids, column 1 as the
        element ids, column 2 (the homology flag) is dropped and every
        remaining column is treated as a feature. The array is not modified;
        the shuffle works on a reordered copy.
    seed : int or None, optional
        Seed for the row shuffle. With ``None`` (the default) the permutation
        is drawn from NumPy's global random state, exactly as before, so an
        earlier ``np.random.seed(...)`` call is still honoured. Passing an
        integer makes this call reproducible on its own.

    Returns
    -------
    block_ids : np.ndarray, shape (n_samples,)
    element_ids : np.ndarray, shape (n_samples,)
    features : np.ndarray, shape (n_samples, n_features)
        Output of ``StandardScaler().fit_transform`` on the feature columns.
        All three arrays share the same (shuffled) row order.
    """
    # Draw the row order; only build a private generator when a seed is given
    # so that the default path keeps using the global RNG stream.
    if seed is None:
        order = np.random.permutation(len(data))
    else:
        order = np.random.default_rng(seed).permutation(len(data))

    # Fancy indexing returns a new array, the caller's table stays untouched.
    shuffled = data[order]

    block_ids = shuffled[:, 0]
    element_ids = shuffled[:, 1]

    # Normalize the features
    features = StandardScaler().fit_transform(shuffled[:, 3:])

    return block_ids, element_ids, features

In [6]:
# Shuffle the raw table, split off the id columns and standardise the features.
block_ids, element_ids, features = process_data(data)

# The loader produces an object-dtype array, so force a numeric dtype here.
# This only matters when the normalization step is skipped.
features = features.astype('float64')

'\nfor j in range(len(features)):\n    for i in range(len(features[0])):\n        if not isinstance(features[j][i], float):\n            print(features[j][i], type(features[j][i]))\n'

### Task 1 - Lloyd's Algorithm

In [7]:
def lloyds(data, k=153, max_iter=1000):
    """Cluster ``data`` into ``k`` groups with Lloyd's k-means algorithm.

    The first ``k`` rows of ``data`` are used as the initial cluster centers
    (shuffle the data beforehand if a random initialisation is wanted). Each
    iteration assigns every sample to its nearest center (squared Euclidean
    distance) and then moves every center to the mean of its members. A
    cluster that ends up without members keeps its previous center. The loop
    stops as soon as an iteration leaves all centers unchanged, or after
    ``max_iter`` iterations.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Samples to cluster. The input is never modified.
    k : int, optional
        Number of clusters, ``1 <= k <= n_samples``.
    max_iter : int, optional
        Upper bound on the number of assign/update iterations.

    Returns
    -------
    cluster_labels : np.ndarray of int, shape (n_samples,)
        Index of the nearest center for every sample, taken in the last
        assignment step.
    cluster_means : np.ndarray of float, shape (k, n_features)
        Final cluster centers.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional, ``k`` is outside
        ``[1, n_samples]`` or ``max_iter`` is smaller than 1.
    """
    # Work in floating point so that means of integer data are not truncated.
    points = np.asarray(data, dtype=np.float64)
    if points.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")

    n_samples, n_features = points.shape
    if not 1 <= k <= n_samples:
        raise ValueError("k must satisfy 1 <= k <= n_samples")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Pick the first k points as initial cluster means. The explicit copy is
    # essential: a plain slice is a view, and updating the means through it
    # would overwrite the first k samples of the caller's array.
    cluster_means = points[:k].copy()

    counter = 0
    converged = False

    while counter < max_iter and not converged:
        counter += 1

        # printing progress
        if counter % 20 == 0:
            print("iteration: ",counter)

        old_means = cluster_means.copy()

        ############# Assign step
        sq_dists = cdist(points, cluster_means, metric='sqeuclidean')
        cluster_labels = np.argmin(sq_dists, axis=1)

        ############# Update step
        for j in range(k):
            members = points[cluster_labels == j]
            cluster_size = len(members)

            # An empty cluster keeps its previous center.
            if cluster_size > 0:
                cluster_means[j] = members.sum(axis=0) / cluster_size

        # Converged once a full iteration leaves every center untouched.
        converged = np.array_equal(old_means, cluster_means)

    if converged:
        print('KMeans converged in '+str(counter)+' iterations.')
    else:
        print('KMeans stopped after '+str(counter)+' iterations without converging.')
    return cluster_labels, cluster_means

In [8]:
# One cluster per distinct block id: the KDD data has 153 of them.
labels, centers = lloyds(features, k=153)
#labels, centers = lloyds(dataset_ft, 7)

iteration:  20
iteration:  40
iteration:  60
iteration:  80
iteration:  100
iteration:  120
iteration:  140
iteration:  160
iteration:  180
iteration:  200
iteration:  220
iteration:  240
iteration:  260
iteration:  280
iteration:  300
iteration:  320
iteration:  340
iteration:  360
iteration:  380
iteration:  400
iteration:  420
KMeans converged in 430 iterations.


In [7]:
#plt.scatter(dataset_ft[:,0], dataset_ft[:,1], c = labels)
#plt.scatter(centers[:,0], centers[:,1], c='r')
#plt.show()

In [14]:
# Score the k-means partition against the ground-truth block ids.
NMI_score = normalized_mutual_info_score(
    labels_true=block_ids,
    labels_pred=labels,
)
print(NMI_score)

0.18974373867283634


### Task 2 - LSH + Kmeans

In [21]:
# This function defines a single hash function according to the notes on
# LSH + Kmeans and assigns the samples to the buckets: every sample is
# projected onto one random Gaussian direction and the range of projected
# values is cut into equal-width intervals.

def hash_simple(data, no_buckets):
    """Assign every sample to a bucket via one random projection.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Samples to hash.
    no_buckets : int
        Number of buckets; must be at least 1.

    Returns
    -------
    np.ndarray of float, shape (n_samples,)
        Integer-valued bucket ids in ``[0, no_buckets - 1]``. The sample with
        the smallest projection gets bucket 0 and the one with the largest
        projection gets bucket ``no_buckets - 1``. Each bucket is
        ``(max - min) / (no_buckets - 1)`` wide. If all projections coincide,
        or only one bucket is requested, every sample lands in bucket 0.

    Raises
    ------
    ValueError
        If ``no_buckets`` is smaller than 1 or ``data`` is not two-dimensional.
    """
    if no_buckets < 1:
        raise ValueError("no_buckets must be at least 1")

    samples = np.asarray(data, dtype=np.float64)
    no_samples = len(samples)
    if no_samples == 0:
        return np.zeros(0)
    if samples.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")

    # Random projection direction, one component per feature.
    vector_p = np.random.normal(loc=0.0, scale=1.0, size=samples.shape[1])

    # Project all samples at once instead of looping over the rows.
    hash_values = samples.dot(vector_p)

    min_val = np.min(hash_values)
    value_range = np.max(hash_values) - min_val

    # Degenerate cases would otherwise divide by zero.
    if no_buckets == 1 or value_range == 0:
        return np.zeros(no_samples)

    # Shift by the minimum so ids start at 0, and rescale to [0, 1] first:
    # the largest projection then maps to exactly 1.0 and therefore to bucket
    # no_buckets - 1, with no floating-point drift at the upper edge.
    scaled = (hash_values - min_val) / value_range
    return np.floor(scaled * (no_buckets - 1))

In [65]:
# This class defines a given number of hash functions.
# The calculate_hash_values method can be used to calculate the hash values of any array whose second dimension is no_features.
class hash_functions:
    """A family of ``no_functions`` random linear hash functions.

    Each function has its own standard-normal projection vector (one row of
    ``a``) and its own offset (one entry of ``b``, currently always zero).
    All functions share the scalar scale ``w``.
    """

    def __init__(self, no_functions, w, no_features):
        """Draw the projection matrix and store the scale and offsets."""
        projection_shape = (no_functions, no_features)
        # matrix: one Gaussian direction per hash function, stored row-wise
        self.a = np.random.normal(size=projection_shape, loc=0.0, scale=1.0)
        # vector: per-function offsets, all zero
        self.b = np.zeros(no_functions)
        # scalar: common divisor applied to every projection
        self.w = w

    def calculate_hash_values(self, data):
        """Return ``(data . a^T + b) / w`` for every sample and hash function.

        The values are returned as real numbers; no rounding is applied here.
        """
        projected = np.dot(data, self.a.T)
        return (projected + self.b) / self.w
        

In [66]:

# Build 16 hash functions over the feature space, all sharing the scale w.
no_features = features.shape[1]
w = 3
hash_funcs = hash_functions(no_functions=16, w=w, no_features=no_features)

# Evaluate the 16 hash functions on every sample.
hash_values = hash_funcs.calculate_hash_values(data=features)


