# Final Experiments

### Plotting

In [1]:
from enum import Enum

from matplotlib import gridspec
from lib.clustering_helpers import *
from lib.conceptors import *
from lib.experiment_helpers import *

"""
Problem-specific plotting
"""
class Plot:
    """Figure made of vertically stacked subplots that is built up incrementally.

    ``inc`` appends an empty subplot at the bottom and makes it the current
    one, ``add`` draws a line on the current subplot, and the ``add_new_*``
    helpers combine both steps. ``finalize`` sets the title and shows the figure.
    """

    def __init__(self, x=10, y=10):
        """Create the empty figure.

        Note the argument order: the values are passed as ``figsize=(y, x)``,
        so ``y`` is the figure width and ``x`` the figure height.
        """
        # Global matplotlib setting: it also affects figures created elsewhere.
        plt.rcParams["figure.autolayout"] = True
        self.fig = plt.figure(figsize=(y,x))
        self.cnt = 0          # number of subplots created so far
        self.new_ax = None    # current (most recently added) subplot

    def add(self, y, label=None):
        """Draw ``y`` as a line on the current subplot.

        When ``label`` is given (anything but ``None``, so ``0`` is a valid
        label) the legend is refreshed and placed to the right of the axes.
        ``inc`` must have been called at least once before.
        """
        if label is None:
            self.new_ax.plot(y)
            return
        self.new_ax.plot(y, label=label)
        self.new_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),
                             ncol = 2, fancybox=True, shadow=True,
                             handleheight=2.4, labelspacing=0.05)

    def inc(self, ylabel="", xlabel=""):
        """Append an empty subplot at the bottom and make it the current one."""
        self.cnt += 1
        gs = gridspec.GridSpec(self.cnt, 1)

        # Re-place the existing subplots on the new, one-row-taller grid
        for i, ax in enumerate(self.fig.axes):
            ax.set_position(gs[i].get_position(self.fig))
            ax.set_subplotspec(gs[i])

        # The new subplot takes the last row
        self.new_ax = self.fig.add_subplot(gs[self.cnt-1])
        self.new_ax.set_xlabel(xlabel, fontsize=16)
        self.new_ax.set_ylabel(ylabel, fontsize=16)


    def add_new(self, y, label="No Label"):
        """Create a new subplot and draw ``y`` on it."""
        self.inc()
        self.add(y, label)

    def add_new_assignment_plot(self, assignments, labels=[], fuzzy=False, length=0, smoothness=1, xlabel="", ylabel=""):
        """Add a subplot with one membership curve per cluster.

        Parameters
        ----------
        assignments : sequence of sequences
            With ``fuzzy=False``: for every cluster, the indices of the samples
            assigned to it; each cluster is turned into a 0/1 indicator series.
            With ``fuzzy=True``: for every cluster, the series to plot as is.
        labels : sequence, ``None`` or empty
            ``None`` draws the curves without legend entries, an empty
            sequence (the default) uses the cluster index as label, otherwise
            ``str(labels[idx])`` is used. The default list is never mutated.
        length : int
            Length of the indicator series; ``0`` means "largest assigned
            index + 1", so that the last sample is included.
        smoothness
            Forwarded to ``smoothed`` together with each series.

        Clusters whose series is empty or has no positive value are skipped.
        """
        self.inc(ylabel, xlabel)

        if not fuzzy and length == 0:
            # Indices are 0-based, so the series needs max index + 1 entries.
            # ``default=-1`` keeps an all-empty partition from raising.
            length = max((max(ts) for ts in assignments if len(ts) > 0), default=-1) + 1

        for idx, ts in enumerate(assignments):
            if fuzzy:
                y = ts
            else:
                members = set(ts)  # constant-time membership test per time step
                y = [1 if t in members else 0 for t in range(length)]

            if len(y) == 0 or not max(y) > 0:
                continue

            series = smoothed(y, smoothness)
            if labels is None:
                self.add(series)
            elif len(labels) == 0:
                self.add(series, idx)
            else:
                self.add(series, str(labels[idx]))


    def add_new_conceptors_fit_plot(self, X, Cs, Ns=None, label="", labels="", smoothness=3):
        """
        Plots, for each time step t, how well each conceptor in Cs matches the state x(t)

        The per-conceptor series come from ``evidences_for_Cs(X, Cs, Ns)`` when
        ``Ns`` is given and non-empty, otherwise from ``test(X, Cs, "PROP")``.
        Each series is passed through ``smoothed`` before plotting. Curves are
        labelled with the entries of ``labels`` when provided, otherwise with
        ``label`` followed by the series index.
        """
        self.inc()
        # Explicit checks instead of truthiness so that numpy arrays are accepted.
        if Ns is not None and len(Ns) > 0:
            collection = evidences_for_Cs(X,Cs,Ns)
        else:
            collection, _ = test(X, Cs, "PROP")

        if labels is not None and len(labels) > 0:
            for vals, series_label in zip(collection, labels):
                self.add(smoothed(vals, smoothness), label=series_label)
        else:
            for i, vals in enumerate(collection):
                self.add(smoothed(vals, smoothness), label=label+str(i))


    def finalize(self, title=""):
        """Set the figure title and display the figure."""
        self.fig.suptitle(title, fontsize=16)
        plt.show()

### Collect training features

In [2]:
import numpy as np
import random
import os, sys
import warnings
import matplotlib.pyplot as plt
import pickle as pkl
from dataset.loading import DataLoader
from lib.esn import ESN
from lib.helpers import *
import pandas as pd

# Seed both RNGs up front so that the speaker sub-sampling below is repeatable.
# NOTE(review): the pandas sample() call has no random_state, so it presumably
# draws from numpy's global RNG seeded here - confirm.
np.random.seed(0)
random.seed(0)
warnings.filterwarnings("ignore")

# Make the project code importable. The location can be overridden with the
# BT_CODE_DIR environment variable; the fallback is the original machine-specific
# path. The membership check keeps sys.path free of duplicates when the cell is re-run.
directory = os.path.abspath(os.environ.get(
    "BT_CODE_DIR", '/Users/joris/Documents/Work/bsc ai/bt/Bachelor-Thesis/code'))
if directory not in sys.path:
    sys.path.append(directory)

save = True

data_dir = '../../dataset/'
cache_dir = '../../cache/'
dl = DataLoader(data_dir, cache_dir)

# Speaker selection: 50 speakers per gender, where the gender is taken to be
# the first character of the speaker id.
dr = []
descriptions = dl.read_descriptions("Train")
speakers = descriptions.speaker_id.unique()
df = pd.DataFrame({"speaker_id":speakers})
df['gender'] = df.speaker_id.str[0]
speakers = df.groupby("gender").sample(50, replace=False).speaker_id.values
sentence = "SX"

# Feature extraction settings
XorZ = "X"
long_version = False
n_mels = 14
delta = False
delta_delta = False
subsamples = 10
const_params = {
    "n_mels":n_mels,
    "XorZ":XorZ
}

# Identifier that encodes the settings above; it is handed to the loader as
# ``path_option`` and later reused as cache file identifier.
path_option = "Final"+str(long_version)+str(n_mels)+str(delta)+str(delta_delta)+str(subsamples)

if dr:
    path_option = str(dr)+"_"+path_option
if len(speakers):
    path_option = str(speakers[0])+"_"+path_option
if sentence:
    path_option = sentence+"_"+path_option

features,labels,oversamplings = dl.collectFeaturesInSegments(
    n_mels=n_mels,delta=delta,delta_delta=delta_delta,
    long_version=long_version,speakers=speakers,dr=dr,
    sentence=sentence,subsamples=subsamples,path_option=path_option)

-from output
---- success


### Regroup data and subset phonemes

In [3]:
from dataset.data_helpers import *

# Phoneme subset used for the experiments. Alternatives that were tried:
#   selected_labels = None
#   selected_labels = ["aa", "b", "s", "iy"]
#   selected_labels = ["aa", "ae", "ah", "eh", "ih", "iy", "uh", "er", "ey", "ix", "aw", "axr", "l", "oy", "r", "y"]
selected_labels = ["aa", "ae", "ah", "eh", "ih", "iy", "uh"]

# NOTE: `features` and `labels` are overwritten with their filtered versions,
# so re-running this cell filters the already filtered data again.
phonemes, features, labels = filter_data(
    features,
    labels,
    selected_labels=selected_labels,
    limit=None,
)
group = group_by_labels(features, labels)
n_samples = len(features)

print(f"{len(phonemes)} phonemes: {phonemes}")

Filtered to 4223 samples of shape (10, 14)
7 phonemes: ['ih', 'ae', 'ah', 'iy', 'aa', 'eh', 'uh']


### Initialize Reservoir

In [4]:
XorZ = "X"

method = Method.OG_SIGNALS

# Reservoir configuration. Input and output dimension are n_mels for the base
# features plus another n_mels for each enabled delta stream; the reservoir
# has 50 units when XorZ is "X" and 20 otherwise.
esn_params = dict(
    in_dim=n_mels*(1+delta+delta_delta),
    out_dim=n_mels*(1+delta+delta_delta),
    N=50 if XorZ == "X" else 20,
    W_in_scale=1.1,
    b_scale=.44,
    spectral_radius=2.57,
    weights=.1,
)

esn = ESN(esn_params)

## Kmeans (Below-phoneme)
### Compute conceptors

In [10]:
# Settings forwarded to compute_Cs; caching is switched off for this run.
save = False
aperture = "auto"
normalize = True

# Conceptors for the grouped phoneme data.
Cs = compute_Cs(
    group,
    esn,
    aperture=aperture,
    normalize=normalize,
    XorZ=XorZ,
    cache=save,
    file_identifier=path_option,
)

# Mean of sum_of_singular_vals over all conceptors.
target_sum = np.mean(list(map(sum_of_singular_vals, Cs)))

- computing conceptors
optimizing
Computing gammas...
Optimal gamma:  74.7710210370682
normalizing
Target:  31.83537995162493
std 0.42127845253181995


### Kmeans helpers

## Kmeans

In [99]:
def _as_partition(assignments):
    """Return the non-empty clusters of ``assignments`` as a set of frozensets."""
    return {frozenset(cluster) for cluster in assignments if len(cluster) > 0}


def k_means(method, features, nb_clusters=4, max_epochs=15, new_assignments=None):
    """Cluster ``features`` with a k-means style loop.

    Every epoch computes one centroid per current cluster via
    ``compute_centroids`` and then re-assigns each point to the centroid with
    the smallest value returned by ``find_distances_to_centroids``. For
    ``Method.PRED_CENTROIDS`` the PRED and CENTROIDS distances are averaged.

    Parameters
    ----------
    method : Method
        Forwarded to ``compute_centroids`` / ``find_distances_to_centroids``.
    features : sequence
        The points to cluster; assignments hold indices into this sequence.
    nb_clusters : int
        Number of clusters for the initial assignment (only used when
        ``new_assignments`` is ``None``).
    max_epochs : int
        Upper bound on the number of epochs.
    new_assignments : list of lists of int, optional
        Initial partition; created with ``assign_to_clusters`` when omitted.

    Returns
    -------
    (centroids, assignments)
        The centroids of the last epoch and the partition obtained from them.

    The loop stops early once the partition no longer changes, i.e. when the
    non-empty clusters of two consecutive epochs are identical as sets.
    The module-level ``Cs`` is passed through to the distance helper.
    """
    if new_assignments is None:
        # Size the initial partition from the data actually passed in rather
        # than from a notebook-level global.
        new_assignments = assign_to_clusters(len(features), nb_clusters)
        #new_assignments = assign_to_clusters_smart(features, nb_clusters)

    needs_Ns = method == Method.PRED or method == Method.PRED_CENTROIDS

    for epoch in range(max_epochs):
        features_of_assignments = [
            [ features[a] for a in assignments ] for assignments in new_assignments
        ]

        centroids = compute_centroids(method, features_of_assignments)
        if method == Method.PRED_CENTROIDS:
            centroids2 = compute_centroids(Method.CENTROIDS, features_of_assignments)
        Ns = Ns_from_Cs(centroids) if needs_Ns else None

        print("Epoch ",epoch,"# centroids:",len(centroids))

        old_assignments = new_assignments
        # One bucket per centroid (compute_centroids may return fewer
        # centroids than there were clusters).
        new_assignments = [ [] for _ in range(len(centroids)) ]

        for p, feature in enumerate(features):

            ### Find closest centroids

            if method == Method.PRED_CENTROIDS:
                ds_pred = find_distances_to_centroids(Method.PRED, p, feature, Cs, centroids, Ns)
                ds_centroid = find_distances_to_centroids(Method.CENTROIDS, p, feature, Cs, centroids2, Ns)
                ds = np.add(ds_pred, ds_centroid) / 2
            else:
                ds = find_distances_to_centroids(method, p, feature, Cs, centroids, Ns, normalize=False)
            centroid_index = np.argmin(ds)

            new_assignments[ centroid_index ].append(p)

        # Converged only when the whole partition is unchanged; a single
        # unchanged cluster is not enough.
        if _as_partition(new_assignments) == _as_partition(old_assignments):
            print("Converged")
            return centroids, new_assignments
    return centroids, new_assignments


#Cs_kmeans, assignments_kmeans = k_means(method, kmeans_features,len(phonemes),100,smart_assignments)


## Single run

In [100]:
# Ground-truth partition: walk through the groups in order and give every
# sample a running index, so each group owns a consecutive block of indices.
correct_assignments = []
idx = 0
for vals in group.values():
    block = [idx + offset for offset, _ in enumerate(vals)]
    correct_assignments.append(block)
    idx += len(block)

def single_run():
    """Run k-means once with Method.SIMS and plot the result against the true classes.

    Steps: drive the reservoir with every feature sequence, build an initial
    partition with ``assign_to_clusters_smart`` (optionally cached as a
    pickle), run ``k_means`` from it, print NMI scores and cluster sizes, and
    draw the ground-truth and k-means memberships.

    Uses the notebook-level ``esn``, ``features``, ``XorZ``, ``Cs``,
    ``phonemes``, ``correct_assignments`` and ``path_option``.
    """
    method = Method.SIMS
    # if method is PRED or SIMS or CENTROIDS, Kmeans points are esn states
    # otherwise it's the MFCCs
    esn_states = [ esn.run(feature.T, XorZ=XorZ) for feature in features ]

    save = False
    # The original used an undefined `file_name`; `path_option` is the
    # identifier the conceptor cache uses.
    # NOTE(review): confirm this is the intended cache key.
    ass_name = "Smart_assignments"+path_option
    cache_path = './cache/working/'+ass_name+'.pkl'

    if save and os.path.exists(cache_path):
        print("- from file")
        # pickle can execute arbitrary code on load: only read caches written
        # by this notebook.
        with open(cache_path,'rb') as fp:
            smart_assignments = pkl.load(fp)
        print("--- Done")
    else:
        print("- computing smart assignments")
        smart_assignments = assign_to_clusters_smart(method, esn_states, Cs, len(phonemes))
        if save:
            with open(cache_path,'wb') as fp:
                pkl.dump(smart_assignments,fp)
        print("--- Done")

    # Cluster the reservoir states, starting from the smart assignment.
    _, assignments_kmeans = k_means(method, esn_states, len(phonemes), 100, smart_assignments)

    plot = Plot(10,10)

    print("Kmeans NMI: ", NMI(assignments_kmeans, correct_assignments))
    # The baseline partitions the same number of samples as the ground truth.
    print("Baseline NMI: ", NMI(assign_to_clusters(len(esn_states),len(phonemes)), correct_assignments))
    print("Lengths correct: ", [len(x) for x in correct_assignments])
    print("Lengths kmeans: ", [len(x) for x in assignments_kmeans])

    plot.add_new_assignment_plot(correct_assignments,phonemes,xlabel="Sample index",ylabel="Original class membership")
    # NOTE(review): the label says 10 adjacent samples while smoothness is 200 - confirm which one is meant.
    plot.add_new_assignment_plot(assignments_kmeans,smoothness=200,xlabel="Sample index",ylabel="Hard cluster membership \n(averaged over 10 adjacent samples)")
    plot.finalize()

## Experiment

In [101]:
print(len(features))

def experiment(iterations=5):
    """Run k-means ``iterations`` times per (method, smart) setting and score it.

    For every combination in ``params`` the reservoir states are computed
    once, k-means is run ``iterations`` times and the NMI against
    ``correct_assignments`` is averaged.

    Returns a list of dicts with the keys ``"method"``, ``"smart"`` and
    ``"NMI"`` (the mean NMI of that setting).
    """
    params = {
        "method" : [Method.PRED],#[Method.OG_SIGNALS, Method.CENTROIDS, Method.SIMS, Method.PRED],
        "smart" : [False]#[True, False]
    }
    # Methods that cluster reservoir states; the remaining ones cluster the
    # raw feature sequences.
    state_based = (Method.PRED, Method.SIMS, Method.PRED_CENTROIDS, Method.CENTROIDS)

    results = []
    for method in params["method"]:
        for smart in params["smart"]:
            NMIs = []
            print("method", method, "smart", smart)
            esn_states = [ esn.run(feature.T, XorZ=XorZ) for feature in features ]
            kmeans_features = esn_states if method in state_based else features
            for i in range(iterations):
                if smart:
                    # NOTE(review): this passes five arguments while other
                    # cells of the notebook pass four - confirm the current
                    # signature of assign_to_clusters_smart.
                    smart_assignments = assign_to_clusters_smart(method, esn_states, features, Cs, len(phonemes))
                else:
                    smart_assignments = None
                # k_means takes a single collection of points to cluster.
                Cs_kmeans, assignments_kmeans = k_means(method, kmeans_features, len(phonemes), 100, smart_assignments)
                NMIs.append(NMI(assignments_kmeans, correct_assignments))
            mean_nmi = np.mean(NMIs)
            print("mean",mean_nmi)
            results.append( {
                "method": method,
                "smart": smart,
                "NMI": mean_nmi
            } )
    return results

results = experiment(3)
print(results)

4223
method pred smart False
Target:  6.586318544556513
std 0.05698311988217593
Epoch  0 # centroids: 7
Target:  6.586318544556513
std 1.1865067052460925
Epoch  1 # centroids: 7
Target:  6.586318544556513
std 0.7750711195383068
Epoch  2 # centroids: 5
Target:  6.586318544556513
std 2.4179116987849865
Epoch  3 # centroids: 5
Target:  6.586318544556513
std 0.4155157633578382
Epoch  4 # centroids: 5
Target:  6.586318544556513
std 0.3920847960684362
Epoch  5 # centroids: 5
Target:  6.586318544556513
std 0.3699849970213977
Epoch  6 # centroids: 5
Target:  6.586318544556513
std 0.3737372749241185
Epoch  7 # centroids: 5
Target:  6.586318544556513
std 0.299330161297754
Epoch  8 # centroids: 5
Target:  6.586318544556513
std 0.3277640811934177
Epoch  9 # centroids: 5
Target:  6.586318544556513
std 0.32655308354393414
Epoch  10 # centroids: 5
Target:  6.586318544556513
std 0.31304953720144024
Epoch  11 # centroids: 5
Target:  6.586318544556513
std 0.3112885345638068
Epoch  12 # centroids: 5
Targ

Epoch  8 # centroids: 5
Target:  6.586318544556513
std 0.3277640811934177
Epoch  9 # centroids: 5
Target:  6.586318544556513
std 0.32655308354393414
Epoch  10 # centroids: 5
Target:  6.586318544556513
std 0.31304953720144024
Epoch  11 # centroids: 5
Target:  6.586318544556513
std 0.3112885345638068
Epoch  12 # centroids: 5
Target:  6.586318544556513
std 0.3285656756161993
Epoch  13 # centroids: 5
Target:  6.586318544556513
std 0.2906554778175866
Epoch  14 # centroids: 5
Target:  6.586318544556513
std 0.3363742839090951
Epoch  15 # centroids: 5
Target:  6.586318544556513
std 0.37684177383955336
Epoch  16 # centroids: 5
Target:  6.586318544556513
std 0.3794522649850406
Epoch  17 # centroids: 5
Target:  6.586318544556513
std 0.3224482397017585
Epoch  18 # centroids: 5
Target:  6.586318544556513
std 0.46128986909876313
Epoch  19 # centroids: 5
Target:  6.586318544556513
std 0.37295570864483385
Epoch  20 # centroids: 5
Target:  6.586318544556513
std 0.45924036074868163
Epoch  21 # centroids

Epoch  17 # centroids: 5
Target:  6.586318544556513
std 0.3224482397017585
Epoch  18 # centroids: 5
Target:  6.586318544556513
std 0.46128986909876313
Epoch  19 # centroids: 5
Target:  6.586318544556513
std 0.37295570864483385
Epoch  20 # centroids: 5
Target:  6.586318544556513
std 0.45924036074868163
Epoch  21 # centroids: 5
Target:  6.586318544556513
std 0.347631799933102
Epoch  22 # centroids: 5
Target:  6.586318544556513
std 0.44187227579201743
Epoch  23 # centroids: 5
Target:  6.586318544556513
std 0.4670106082106115
Epoch  24 # centroids: 5
Target:  6.586318544556513
std 0.42707812408746726
Epoch  25 # centroids: 5
Target:  6.586318544556513
std 0.4680687371150091
Epoch  26 # centroids: 5
Target:  6.586318544556513
std 0.42688908247422
Epoch  27 # centroids: 5
Target:  6.586318544556513
std 0.4652245206684984
Epoch  28 # centroids: 5
Target:  6.586318544556513
std 0.4254082827967339
Epoch  29 # centroids: 5
Target:  6.586318544556513
std 0.454813479090649
Epoch  30 # centroids: 5

In [57]:
print(len(features))

def experiment(iterations=5):
    """Evaluate k-means for every (method, smart) setting listed in ``params``.

    NOTE: this definition reuses the name ``experiment`` and therefore
    replaces any earlier definition when the notebook is run top to bottom.

    Per setting, k-means is run ``iterations`` times and scored with NMI
    against ``correct_assignments``. Returns one dict per setting with the
    keys ``"method"``, ``"smart"`` and ``"NMI"`` (mean over the runs).
    """
    # Full grids: methods [Method.OG_SIGNALS, Method.CENTROIDS, Method.SIMS, Method.PRED],
    # smart [True, False]
    params = {
        "method" : [Method.SIMS],
        "smart" : [True],
    }
    nb_clusters = len(phonemes)
    results = []
    for method in params["method"]:
        for smart in params["smart"]:
            print("method", method, "smart", smart)
            # PRED and SIMS cluster reservoir states, the other methods the raw features
            if method == Method.PRED or method == Method.SIMS:
                kmeans_features = [ esn.run(feature.T, XorZ=XorZ) for feature in features ]
            else:
                kmeans_features = features

            NMIs = []
            for _ in range(iterations):
                smart_assignments = (
                    assign_to_clusters_smart(method, kmeans_features, Cs, nb_clusters)
                    if smart else None
                )
                Cs_kmeans, assignments_kmeans = k_means(
                    method, kmeans_features, nb_clusters, 100, smart_assignments)
                NMIs.append(NMI(assignments_kmeans, correct_assignments))

            mean_nmi = np.mean(NMIs)
            print("mean",mean_nmi)
            results.append({"method": method, "smart": smart, "NMI": mean_nmi})
    return results

results = experiment(1)
print(results)

427
method sims smart True
Finding centroid number  2


KeyboardInterrupt: 