In [None]:
%matplotlib inline
import numpy as np                                                                                                 
import matplotlib.pyplot as plt                                                                                    
import pandas as pd

import bc_utils as butils

from scipy.spatial import distance 
from mpl_toolkits import mplot3d

from sklearn.model_selection import train_test_split                                                               
from sklearn.preprocessing import StandardScaler                                                                   
from sklearn.neighbors import KNeighborsClassifier 

# 1. Load data

In [None]:
# Load the selected dataset into a pandas DataFrame.
# Swap the file name below for any of the datasets prepared for testing:
#   blobs.csv, blobs_3d.csv
#   iris.csv
#   mfeat-mor.csv, mfeat-zer.csv, mfeat-pix.csv,
#   mfeat-fac.csv, mfeat-fou.csv, mfeat-kar.csv
#   noisy_circles.csv, noisy_circles_3d.csv
#   overlap.csv
#   pendigit.csv
#   moons.csv

# The first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="data/overlap.csv", index_col=0)

# 2. Plot data

Visualize the data if it has 2 or 3 features (2-D or 3-D scatter plot).

In [None]:
# Split the rows by their 'cluster' label and dump every group.
# Iterating a groupby yields (label, sub-frame) tuples, which are printed as-is.
grouped = df.groupby('cluster')
for label_and_rows in grouped:
    print(label_and_rows)

In [None]:
# Fixed colours for the first three clusters. Any further cluster falls back to
# matplotlib's default colour cycle ("C3", "C4", ...) instead of raising KeyError.
colors = {0:'red', 1:'blue', 2:'green'}

# 'cluster' is one of the columns, so the number of features is columns - 1.
feature_count = df.columns.size - 1

if feature_count == 2:
    # 2-D scatter plot: one colour (and legend entry) per cluster.
    fig, ax = plt.subplots()
    grouped = df.groupby('cluster')

    for i, (key, group) in enumerate(grouped):
        group.plot(ax=ax, kind='scatter', x='x', y='y',
                   label=key, color=colors.get(i, 'C' + str(i % 10)))
    plt.show()
elif feature_count == 3:
    # 3-D scatter plot: same colouring scheme, labelled like the 2-D case.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    grouped = df.groupby('cluster')

    for i, (key, group) in enumerate(grouped):
        ax.scatter(group['x'], group['y'], group['z'],
                   label=key, color=colors.get(i, 'C' + str(i % 10)))
    ax.legend()
    plt.show()
else:
    # Report the real feature count (the label column is not a feature).
    print("There is no good visualisation for this dataset - " + str(feature_count) + " features.")
    

In [None]:
# Create the test datasets: one butils.TestDf per cluster label, built from
# the rows of df that carry that label.
names = set(df['cluster'])

dataframes = {name: butils.TestDf(df[df['cluster'] == name]) for name in names}

# All data for testing: the test_df of every cluster stacked into one frame.
# DataFrame.append was removed in pandas 2.0, so the pieces are collected first
# and concatenated once (this also avoids re-copying the frame on every step).
test_parts = [dataframes[name].test_df for name in names]
full_test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

In [None]:
def benchmark_result_plots(matrices, titles, headers, rotation_num,
                           output_prefix='experiments/exp1/overlap_updated_'):
    """
    Draw one annotated heat map per result matrix and save the figure as PNG.

    Non-zero cells are annotated with their value: green on the diagonal,
    red off the diagonal. Zero cells are left blank.

    parameters:
    matrices - list of 2d np.array, one per plotted method
    titles - list of graph titles, parallel to matrices
    headers - list of tick-label lists, parallel to matrices; each list
              labels both axes of the corresponding matrix
    rotation_num - number of rotation - appended to the png file name
    output_prefix - path prefix of the png file; the file is written to
                    output_prefix + str(rotation_num) + '.png'

    The figure is not closed here, so it can still be displayed by the
    notebook after the call.
    """
    import os

    # Two panels per row, never fewer rows than the original 3x2 layout.
    n_rows = max(3, (len(matrices) + 1) // 2)
    fig, axes = plt.subplots(n_rows, 2, figsize=(20, 10 * n_rows))
    ax_list = list(axes.ravel())

    # Hide every panel that has no matrix to show.
    for unused_ax in ax_list[len(matrices):]:
        unused_ax.set_visible(False)

    for index, matrix in enumerate(matrices):
        tick_labels = list(headers[index])
        ax = ax_list[index]
        ax.imshow(matrix, cmap='binary')

        # We want to show all ticks...
        ax.set_xticks(np.arange(len(tick_labels)))
        ax.set_yticks(np.arange(len(tick_labels)))

        # ... and label them with the respective list entries
        ax.set_xticklabels(tick_labels, fontsize=16)
        ax.set_yticklabels(tick_labels, fontsize=16)

        # Right-align the x tick labels (no rotation is applied).
        plt.setp(ax.get_xticklabels(), ha="right")

        # Loop over data dimensions and create text annotations.
        for i in range(len(tick_labels)):
            for j in range(len(tick_labels)):
                value = matrix[i, j]
                if value == 0:
                    continue
                # Diagonal cells are drawn in green, all others in red.
                ax.text(j, i, value, ha="center", va="center",
                        color="g" if i == j else "r",
                        weight="bold", fontsize=16)

        ax.set_title(titles[index], fontsize=32)

    # Make sure the target directory exists before writing the image.
    out_dir = os.path.dirname(output_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(output_prefix + str(rotation_num) + '.png')
        

In [None]:
# Plot configuration is the same for every rotation, so it is built once.
# NOTE(review): the tick labels assume exactly three clusters - confirm before
# running this on a dataset other than overlap.csv.
graph_names = ['A', 'B', 'C']
plot_titles = ['Delta-Medoids Modified', 'Delta-Medoids Full', 'Delta-Medoids One Shot',
               'Random Selection', 'Greedy Selection']

# Running total of the best rate of each rotation. It has to live outside the
# loop, otherwise it is reset to 0 on every iteration.
evaluation_sum = 0

for i in range(0,5):
    matrices = butils.get_method_results(full_test_df, dataframes, distance.euclidean, None)

    matrix_full = matrices[0]
    matrix_one_shot = matrices[1]
    matrix_random_selection = matrices[2]
    matrix_greedy_select = matrices[3]
    matrix_modified = matrices[4]

    precision_results = {"full_medoids" : butils.get_hit_miss_rate(matrix_full),
                         "one_shot_medoids" : butils.get_hit_miss_rate(matrix_one_shot),
                         "random_select" : butils.get_hit_miss_rate(matrix_random_selection),
                         "greedy_select" : butils.get_hit_miss_rate(matrix_greedy_select),
                         "medoids_modified" : butils.get_hit_miss_rate(matrix_modified)}

    print("Rates for each method:")
    for key in precision_results:
        print(key + ": " + str(precision_results[key]))

    # Name of the method with the smallest rate (first one wins on ties).
    # The previous code compared each rate against the string "", which raises
    # TypeError on Python 3 for a numeric or tuple rate.
    # NOTE(review): assumes a lower get_hit_miss_rate value is better - confirm.
    minimal_rate = min(precision_results, key=precision_results.get)

    print("Method that got the best result is " + minimal_rate)
    evaluation_sum = evaluation_sum + precision_results[minimal_rate]

    print("Creating plots.")
    benchmark_result_plots([matrix_modified, matrix_full, matrix_one_shot, matrix_random_selection, matrix_greedy_select],
                           plot_titles,
                           [graph_names] * len(plot_titles), i)

    # Rotate every per-cluster dataset before the next round.
    # NOTE(review): full_test_df is not rebuilt after rotating - confirm that
    # this is intended. Rotation also mutates `dataframes` in place, so
    # re-running this cell does not start from the initial split.
    for key in dataframes:
        dataframes[key].rotate()
    print("\n\nDataset Rotated!\n")