In [None]:
%matplotlib inline
import numpy as np                                                                                                 
import matplotlib.pyplot as plt                                                                                    
import pandas as pd

import bc_utils as butils

from scipy.spatial import distance
from mpl_toolkits import mplot3d

from sklearn.model_selection import train_test_split                                                               
from sklearn.preprocessing import StandardScaler                                                                   
from sklearn.neighbors import KNeighborsClassifier 

# 1. Load data

In [None]:
# Load the dataset under test into a pandas DataFrame.
# Datasets currently prepared for testing:
#   blobs.csv
#   blobs_3d.csv
#   iris.csv
#   mfeat-mor.csv
#   mfeat-zer.csv
#   mfeat-pix.csv
#   mfeat-fac.csv
#   mfeat-fou.csv
#   mfeat-kar.csv
#   noisy_circles.csv
#   noisy_circles_3d.csv
#   overlap.csv
#   pendigit.csv

# The first CSV column becomes the row index rather than a data column.
df = pd.read_csv(filepath_or_buffer="data/pendigit.csv", index_col=0)

# 2. Plot data

Visualize the dataset if it has 2 or 3 features.

In [None]:
# Print each (cluster label, rows of that cluster) pair produced by the grouping.
grouped = df.groupby('cluster')
for cluster_entry in grouped:
    print(cluster_entry)

In [None]:
# Fixed colours for the first three clusters. Any further cluster falls back to
# matplotlib's default colour cycle ("C0".."C9", repeating after ten) instead of
# raising KeyError.
colors = {0:'red', 1:'blue', 2:'green'}

# The checks below treat one column as the 'cluster' label, so the number of
# features is the column count minus one.
n_features = df.columns.size - 1

if n_features == 2:
    # 2D scatter: one series per cluster, drawn on a shared axes.
    fig, ax = plt.subplots()
    grouped = df.groupby('cluster')

    for i, (key, group) in enumerate(grouped):
        group.plot(ax=ax, kind='scatter', x='x', y='y',
                   label=key, color=colors.get(i, "C" + str(i % 10)))
    plt.show()
elif n_features == 3:
    # 3D scatter: same idea, on a 3D projection.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    grouped = df.groupby('cluster')

    for i, (key, group) in enumerate(grouped):
        ax.scatter(group['x'], group['y'], group['z'],
                   label=key, color=colors.get(i, "C" + str(i % 10)))
    # Label the clusters, matching the legend the 2D branch gets from pandas.
    ax.legend()
    plt.show()
else:
    print("There is no good visualisation for this dataset - " + str(n_features) + " features.")
    

In [None]:
# Create the per-cluster test datasets.
names = set(df['cluster'])

# One butils.TestDf per cluster label, built from that cluster's rows only.
dataframes = {name: butils.TestDf(df[df['cluster'] == name]) for name in names}

# All data for testing: the test_df of every cluster stacked into one frame.
# DataFrame.append was removed in pandas 2.0, so the parts are concatenated in
# a single pd.concat call; an empty frame is kept when there are no clusters.
test_parts = [dataframes[name].test_df for name in names]
full_test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

In [None]:
# Per-method, per-cluster selection results: test_res[method][cluster] -> DataFrame.
test_res = {"delta_medoids_full" : {},
            "delta_medoids_one_shot" : {},
            "random_select" : {},
            "greedy_select" : {}}

# Selected rows of every cluster, collected per method. They are concatenated
# once after the loop (DataFrame.append was removed in pandas 2.0, and growing
# a frame inside a loop copies it on every iteration).
selected_parts = {method: [] for method in test_res}

for name in names:
    # Training rows of this cluster without the last column
    # (presumably the 'cluster' label - verify against butils.TestDf).
    delta_df = dataframes[name].train_df.iloc[:, :-1]

    # Estimated once per cluster so that all delta-based methods below are
    # run with the same threshold.
    delta = butils.estimate_delta(delta_df, distance.euclidean)

    medoids_full_result = butils.delta_medoids_full(delta_df, delta, distance.euclidean)
    one_shot_medoids_result = butils.delta_medoids_one_shot(delta_df, delta, distance.euclidean)
    # Random selection is asked for as many rows as the full delta-medoids run returned.
    random_select_result = butils.random_select(delta_df, medoids_full_result.shape[0])
    greedy_select_result = butils.greedy_select(delta_df, delta, distance.euclidean)

    method_results = (("delta_medoids_full", medoids_full_result),
                      ("delta_medoids_one_shot", one_shot_medoids_result),
                      ("random_select", random_select_result),
                      ("greedy_select", greedy_select_result))
    for method, result in method_results:
        result['cluster'] = name  # tag the selected rows with their cluster
        test_res[method][name] = result
        selected_parts[method].append(result)

# Training DataFrames for comparing the selection methods
# (an empty frame is kept when there are no clusters).
train_frames = {method: (pd.concat(parts) if parts else pd.DataFrame())
                for method, parts in selected_parts.items()}
train_delta_medoids_full = train_frames["delta_medoids_full"]
train_delta_medoids_one_shot = train_frames["delta_medoids_one_shot"]
train_random_selection = train_frames["random_select"]
train_greedy_select = train_frames["greedy_select"]

# Classify the shared test set against each method's training selection.
matrix_full = butils.classifyPoints(train_delta_medoids_full, full_test_df)
matrix_one_shot = butils.classifyPoints(train_delta_medoids_one_shot, full_test_df)
matrix_random_selection = butils.classifyPoints(train_random_selection, full_test_df)
matrix_greedy_select = butils.classifyPoints(train_greedy_select, full_test_df)

In [None]:
# Rate reported by butils.get_hit_miss_rate for each method's result matrix.
precision_results = {"full_medoids" : butils.get_hit_miss_rate(matrix_full),
                     "one_shot_medoids" : butils.get_hit_miss_rate(matrix_one_shot),
                     "random_select" : butils.get_hit_miss_rate(matrix_random_selection),
                     "greedy_select" : butils.get_hit_miss_rate(matrix_greedy_select)}

print("Rates for each method:")
for key, rate in precision_results.items():
    print(key + ": " + str(rate))

# Name of the method with the smallest rate. The previous loop compared each
# rate against a method name (initially ""), which never tracked the best rate.
# NOTE(review): assumes a lower rate is better, as the original `<` comparison
# implies - confirm against butils.get_hit_miss_rate.
minimal_rate = min(precision_results, key=precision_results.get)

print("Method that got the best result is " + minimal_rate)

In [None]:
def benchmark_result_plots(matrices, titles, headers):
    """
    Draw one annotated heat map per matrix on a two-column grid of subplots.

    parameters:
    matrices - list of 2d np.array, indexed as [row, column]; each is expected
               to be at least len(labels) x len(labels)
    titles - list of graph titles, one per matrix
    headers - list of label collections, one per matrix; each is sorted and
              used for both the x and the y tick labels

    Non-zero cells are annotated with their value: green on the diagonal,
    red everywhere else. Nothing is returned.
    """
    # Two columns and at least two rows (the historical fixed 2x2 layout);
    # further rows are added when more than four matrices are passed.
    n_cols = 2
    n_rows = max(2, (len(matrices) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 7.5 * n_rows), squeeze=False)
    ax_list = axes.ravel()

    for index, matrix in enumerate(matrices):
        tick_labels = sorted(headers[index])
        ax = ax_list[index]
        ax.imshow(matrix, cmap='binary')

        # We want to show all ticks...
        ax.set_xticks(np.arange(len(tick_labels)))
        ax.set_yticks(np.arange(len(tick_labels)))

        # ... and label them with the respective list entries
        ax.set_xticklabels(tick_labels)
        ax.set_yticklabels(tick_labels)

        # Right-align the x tick labels (no rotation is applied).
        plt.setp(ax.get_xticklabels(), ha="right")

        # Loop over data dimensions and create text annotations.
        for i in range(len(tick_labels)):
            for j in range(len(tick_labels)):
                value = matrix[i, j]
                if value == 0:
                    continue  # leave empty cells unannotated
                ax.text(j, i, value, ha="center", va="center",
                        color="g" if i == j else "r",
                        weight="bold", fontsize=16)

        ax.set_title(titles[index], fontsize=16)
        
# Plot the matrices returned by butils.classifyPoints for the four selection
# methods; every plot uses the same set of cluster names for its tick labels.
benchmark_result_plots(
    matrices=[matrix_full, matrix_one_shot, matrix_random_selection, matrix_greedy_select],
    titles=['Delta-Medoids Full', 'Delta-Medoids One Shot', 'Random Selection', 'Greedy Selection'],
    headers=[names] * 4,
)