[Colab Notebook](https://colab.research.google.com/drive/1rlgOJ6TXEM3WIlfeCmgb-h3wvkZ_xxEZ#scrollTo=KL-xWvxSwAqq)

In [None]:
!pip install susi

## Add .pkl files to your own GDrive

In [None]:
from google.colab import drive
drive.mount("/content/drive")

## Getting the right PATH
When your GDrive is loaded, find the folder containing the .pkl files that were provided. Then right-click that folder and select `Copy path`.
Paste it in the cell below, replacing the path that follows the `%cd` command.

In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/MAWD/mad-pickles
!ls

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
import susi
import json
import pickle
import requests
import pandas as pd
from susi.SOMPlots import plot_nbh_dist_weight_matrix, plot_umatrix, plot_estimation_map, plot_som_histogram
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
output_notebook()

# Import Datasets

In [None]:
radio_data_path = 'radio.pkl'
fma_data_path = 'fma.pkl'
xenocanto_data_path = 'xenocanto.pkl'


def load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes.

    NOTE: unpickling can execute arbitrary code -- only load files you trust.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


radio_data = load_pickle(radio_data_path)
data_length = len(radio_data['features'])
# Tag every radio sample by whether its audio path sits under ./known/.
# The whole column is assigned in one go: element-wise chained indexing
# (radio_data['known'][i] = ...) is unreliable on a pandas DataFrame.
radio_data['known'] = [
    'known' if './known/' in path else 'unknown'
    for path in radio_data['Sample_audio']
]

fma_data = load_pickle(fma_data_path)

xenocanto_data = load_pickle(xenocanto_data_path)

In [None]:
current_df = fma_data
data_length = len(current_df)
# The first three quarters of the rows are used for training, the rest for testing.
train_amount = math.floor(data_length / 4) * 3

# Label-based slices (inclusive on both ends); this assumes the frame carries
# integer labels 0..N-1 -- TODO confirm for each dataset.
# .copy() makes both halves independent frames, so the columns that later
# cells add to current_df_test do not trigger SettingWithCopyWarning.
current_df_train = current_df.loc[0:train_amount - 1].copy()
current_df_test = current_df.loc[train_amount:data_length - 1].copy()

# Flatten every MFCC entry into a 1-D feature vector for the SOM.
train_data = [np.ravel(item) for item in current_df_train['mfcc']]
test_data = [np.ravel(item) for item in current_df_test['mfcc']]
print(len(train_data))
print(len(test_data))



# Train the SOM

If you want to use a pre-trained SOM, skip to the `Load SOM` cell.

In [None]:
#@markdown ### 5 * sqrt(number of training samples)
#@markdown this returns "ideal" grid size

#@markdown but can be changed to try out

train_data_length = len(train_data)

# Rule of thumb: use about 5 * sqrt(N) nodes in total; the side length of the
# square grid is the (floored) square root of that node count.
node_budget = math.ceil(5 * math.sqrt(train_data_length))
grid = math.floor(math.sqrt(node_budget))
print(grid)

# Square SOM, trained without labels.
som_settings = dict(
    n_rows=grid,
    n_columns=grid,
    n_iter_unsupervised=10000,
)
som = susi.SOMClustering(**som_settings)
som.fit(train_data)

In [None]:
#@title Saving SOM { display-mode: "form" }
#@markdown use the following cell to save the som to your GDrive. Do not forget the `.pkl` extension, and make sure your runtime did not disconnect.
filename = 'what_a_som.pkl' #@param {type:"string"}
# Keep the fitted model under a dataset-specific name before writing it out.
som_fma = som
with open(filename, mode='wb') as som_handle:
    pickle.dump(som_fma, som_handle)

## Load SOM 

In [None]:
#@markdown Be sure to be in the correct folder!
som_path = 'what_a_som.pkl' #@param {type: "string"}
# NOTE: unpickling can execute arbitrary code -- only load SOM files you trust.
with open(som_path, 'rb') as som_file:
    som = pickle.load(som_file)

# Later cells size their plots and loops with `grid`, which is otherwise only
# set by the training cell. Restore it from the loaded model so the notebook
# still runs when training is skipped. The training cell builds a square SOM
# (n_rows == n_columns), so the row count is the grid side.
grid = som.n_rows

## Show SOM

In [None]:
u_matrix = som.get_u_matrix()
# Take the grid shape from the SOM itself rather than from the notebook-level
# `grid`, which is undefined when a pre-trained SOM was loaded instead of trained.
plot_umatrix(u_matrix, som.n_rows, som.n_columns)
plt.show()

In [None]:
# SOM node assigned to each of the first 100 test samples.
clusters = som.get_clusters(np.asarray(test_data[0:100]))

fig, ax = plt.subplots()
# Second coordinate on the horizontal axis, first on the (inverted) vertical
# axis; low alpha makes nodes hit by several samples appear darker.
ax.scatter(x=[c[1] for c in clusters], y=[c[0] for c in clusters], alpha=0.2)
ax.invert_yaxis()
ax.set_title("SOM nodes of the first 100 test samples")
plt.show()

In [None]:
# @title Preparing data for customized node visualization  { display-mode: "form" }
# @markdown
# One record per SOM node: its grid position ("x", "y"), the datapoints the
# SOM reports for that node ("data") and how many there are ("val").
data_from_node = []
for x in range(grid):
    for y in range(grid):
        data = som.get_datapoints_from_node((x, y))
        data_from_node.append({"x": x, "y": y, "data": data, "val": len(data)})

# Print a short summary instead of dumping every record with its full data list.
print(len(data_from_node), "nodes; first (x, y, val):",
      [(node["x"], node["y"], node["val"]) for node in data_from_node[:5]])


def get_min_max(data):
    """Summarise the ``"val"`` entries of a list of node records.

    Args:
        data: iterable of dicts, each with a numeric ``"val"`` key.

    Returns:
        A three-element list ``[minimum, maximum, average]``.
    """
    counts = []
    for record in data:
        counts.append(record["val"])
    return [min(counts), max(counts), np.average(counts)]


# Print [minimum, maximum, average] of the per-node "val" counts.
print(get_min_max(data_from_node))


# from https://stackoverflow.com/questions/1969240/mapping-a-range-of-values-to-another
def map(value, original_min, original_max, mapped_min, mapped_max):
    """Linearly rescale ``value`` from one range onto another, rounded to an int.

    Note: this name shadows Python's built-in ``map`` for the rest of the
    notebook; it is kept because other cells call it by this name.

    Args:
        value: number to convert.
        original_min, original_max: bounds of the source range.
        mapped_min, mapped_max: bounds of the target range.

    Returns:
        The rounded position of ``value`` in the target range. Values outside
        the source range are extrapolated, not clipped. If the source range
        is empty (``original_min == original_max``) the lower target bound is
        returned instead of dividing by zero.
    """
    source_span = original_max - original_min
    target_span = mapped_max - mapped_min

    # A zero-width source range happens e.g. when every SOM node holds the
    # same number of datapoints; there is nothing to scale then.
    if source_span == 0:
        return round(mapped_min)

    # Position of value inside the source range as a 0-1 fraction.
    scaled = float(value - original_min) / float(source_span)

    # Same fraction, expressed in the target range.
    return round(mapped_min + (scaled * target_span))


def clamp(n, smallest, largest):
    """Restrict ``n`` to the closed interval ``[smallest, largest]``."""
    capped = min(n, largest)
    return max(smallest, capped)



def make_color_palette(data):
    """Attach a colour to every node record in ``data``, in place.

    Each record's ``"val"`` is scaled between the minimum and the *average*
    of all values (not the maximum), then clamped per channel. Two keys are
    added to every record:

    * ``"color-hex"``: ``"#rrggbb"`` string
    * ``"color-rgb"``: ``[r, g, b]`` with each channel divided by 255
    """
    lowest, _, mean = get_min_max(data)
    for item in data:
        count = item["val"]
        red = clamp(map(count, lowest, mean, 0, 255), 0, 255)
        green = clamp(map(count, lowest, mean, 30, 85), 0, 255)
        # Blue is scaled like red but capped at 155.
        blue = clamp(map(count, lowest, mean, 0, 255), 0, 155)
        item["color-hex"] = "#%02x%02x%02x" % (red, green, blue)
        item["color-rgb"] = [red / 255, green / 255, blue / 255]


# Adds "color-hex" and "color-rgb" to every node record, in place.
make_color_palette(data_from_node)
# Prints every node record, including its full "data" list.
print(data_from_node)


In [None]:
# @title customized SOM visualization
# One marker per SOM node, coloured by the palette computed for the node records.
X = [node['x'] for node in data_from_node]
Y = [node['y'] for node in data_from_node]
C = [node['color-rgb'] for node in data_from_node]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(X, Y, c=C)
ax.invert_yaxis()
plt.show()

# Visualizing the SOM using Bokeh

In [None]:
# @title update dataframe
# Rebuild the feature vectors from current_df_test instead of reading the
# notebook-level `test_data`: later cells rebind that name to a DataFrame,
# which would break this cell on a re-run.
test_features = [np.ravel(item) for item in current_df_test['mfcc']]
projection = som.transform(test_features)
X = [x[0] for x in projection]
Y = [x[1] for x in projection]
proj = [[x[0], x[1]] for x in projection]

# .assign returns a new frame, so no columns are written onto a slice of
# current_df (avoids SettingWithCopyWarning) and the cell can be re-run safely.
current_df_test = current_df_test.assign(proj_x=X, proj_y=Y, proj=proj)

In [None]:
# @title Build Color palette utility { display-mode: "form" }
# @markdown These few lines of code are used to build the color palette
import random
def build_color_palette(unique):
    """Assign a random ``#rrggbb`` colour to every entry of ``unique``.

    Args:
        unique: iterable of hashable category values.

    Returns:
        Dict mapping each value to a colour string. The dict is also printed.
    """
    result = {}
    for name in unique:
        # Random 24-bit integer, i.e. one value per RGB colour.
        color = random.randrange(0, 2**24)
        # Zero-pad to six hex digits: hex(color)[2:] drops leading zeros and
        # would yield invalid colours such as "#ff" for small numbers.
        result[name] = "#{:06x}".format(color)
    print(result)
    return result

def assign_colors(df, palette, header):
    """Write a ``'colors'`` column into ``df``, in place.

    Every value of ``df[header]`` is looked up in ``palette``; a value that
    is missing from ``palette`` raises ``KeyError``.
    """
    df['colors'] = [palette[item] for item in df[header]]


In [None]:
# @title skip this DEPRECATED!
# extract all the nodes given the genres
# .drop_duplicates(subset=['brand'])
field = "Genre"
unique_values = current_df_test[field].unique()

# For every category keep one row per distinct SOM position. De-duplicate on
# the scalar proj_x / proj_y columns: 'proj' stores the same pair as a Python
# list, and lists are unhashable, so drop_duplicates(subset=['proj']) raises
# TypeError.
per_value_frames = [
    current_df_test.loc[current_df_test[field] == value]
    .drop_duplicates(subset=['proj_x', 'proj_y'])
    for value in unique_values
]

# Concatenate once rather than growing a frame inside the loop.
if per_value_frames:
    projection_df = pd.concat(per_value_frames, ignore_index=True)
else:
    projection_df = pd.DataFrame(columns=current_df_test.columns.to_list())


In [None]:
# @title select the reference that will be displayed in the visualization
field = "Genre" #@param {type: "string"}
unique_values = current_df_test[field].unique()
print(len(unique_values))
palette = build_color_palette(unique_values)
assign_colors(current_df_test, palette, field)

In [None]:
# @title Make test data based on category { display-mode: "form" }
category = 'Rock' #@param {type:"string"}
test_data = current_df_test.loc[current_df_test['Genre'] == category]

In [None]:
# @title Make test data based on rows from the test dataset { display-mode: "form" }
start = 0 #@param {type:"number"}
end = 199 #@param {type:"number"}
if end > 999:
  end = 999
print(end)
test_data = current_df_test.loc[3000 + start:3000 + end]

In [None]:
# @title Build Visualization with Bokeh { display-mode: "form" }

# Plot the subset chosen by one of the two "Make test data" cells above. If
# neither was run, `test_data` is still the plain list of feature vectors, so
# fall back to the whole test frame.
plot_df = test_data if isinstance(test_data, pd.DataFrame) else current_df_test

TITLE = "SOM Visualization"
TOOLS = "hover,pan,wheel_zoom,box_zoom,reset,save"

p = figure(tools=TOOLS, toolbar_location="above", title=TITLE)
p.toolbar.logo = "grey"
p.background_fill_color = "#efefef"
p.xaxis.axis_label = "X-axis"
p.yaxis.axis_label = "Y-axis"
p.grid.grid_line_color = "white"
# Braces keep the tooltip working for column names that contain spaces.
p.hover.tooltips = [
    (field, "@{" + field + "}"),
]

source = ColumnDataSource(plot_df)

p.scatter("proj_x", "proj_y", size=12, source=source, color="colors",
          line_color="black", alpha=0.9)

# Labels must name columns that exist in `source`: position them on the
# projected coordinates and show the selected field as text. The previous
# "X-axis" / "Y-axis" / "symbol" names are not columns of the data.
labels = LabelSet(x="proj_x", y="proj_y", text=field, y_offset=8,
                  text_font_size="11px", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

# show(p) # won't work in colab 😭
# @markdown Type here the name of plot
plotname = 'plot.html' #@param {type: "string"}
# @markdown This will be saved in your GDrive in the same folder where the .pkl files are stored

# @markdown Note that if you use the same file name the previous plot will be overwritten
output_file(plotname)
save(p)


## TO DO 
* assign unique name for all datasets for what needs to be shown aka what is the equivalent of Genre for radio and xenocanto
* make projecting the other datasets possible
* add save functionality for colab integration from [here](https://docs.bokeh.org/en/latest/docs/reference/io.html#bokeh.io.save)

# Projecting Other Datasets

In [None]:
# Project the radio dataset onto the SOM.
projection_features = [np.ravel(item) for item in radio_data['mfcc']]

# Positions of samples whose MFCC vector contains NaN anywhere, not only in
# its first element.
nan_indexes = [
    position
    for position, features in enumerate(projection_features)
    if np.isnan(features).any()
]
print(nan_indexes)

# Filter by position, in plain Python:
#  * DataFrame.drop() works on index labels, while nan_indexes are positions;
#  * np.delete() first packs the list into one array, which fails when the
#    vectors differ in length.
nan_positions = set(nan_indexes)
keep_positions = [
    position
    for position in range(len(projection_features))
    if position not in nan_positions
]
# .copy() so the new columns below are written to an independent frame and
# radio_data itself stays untouched.
projection_df = radio_data.iloc[keep_positions].copy()
projection_features = np.asarray(
    [projection_features[position] for position in keep_positions]
)
print(len(projection_features))
print(len(projection_df))

projection = som.transform(projection_features)
X = [x[0] for x in projection]
Y = [x[1] for x in projection]
proj = [[x[0], x[1]] for x in projection]

projection_df['proj_x'] = X
projection_df['proj_y'] = Y
projection_df['proj'] = proj

# Colour the points by the known/unknown tag set when the radio data was loaded.
field = "known"
unique_values = projection_df[field].unique()
print(len(unique_values))
palette = build_color_palette(unique_values)
assign_colors(projection_df, palette, field)

In [None]:
# @title Build Visualization with Bokeh
# `field` (set in the projection cell) selects the column used for both the
# hover tooltip and the point labels.

test_data = projection_df

TITLE = "SOM Visualization"
TOOLS = "hover,pan,wheel_zoom,box_zoom,reset,save"

p = figure(tools=TOOLS, toolbar_location="above", width=1200, title=TITLE)
p.toolbar.logo = "grey"
p.background_fill_color = "#efefef"
p.xaxis.axis_label = "X-axis"
p.yaxis.axis_label = "Y-axis"
p.grid.grid_line_color = "white"
# Braces keep the tooltip working for column names that contain spaces.
p.hover.tooltips = [
    (field, "@{" + field + "}"),
]

source = ColumnDataSource(test_data)

p.scatter("proj_x", "proj_y", size=12, source=source, color="colors",
          line_color="black", alpha=0.9)

# Labels must name columns that exist in `source`: position them on the
# projected coordinates and show the selected field as text. The previous
# "X-axis" / "Y-axis" / "symbol" names are not columns of the data.
labels = LabelSet(x="proj_x", y="proj_y", text=field, y_offset=8,
                  text_font_size="11px", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)



# Old Stuff will be removed soon

In [None]:
# @title Import Data
url_radio_data_noise = 'https://radioexplorations.ch/study-2/data/df_fma_entropy_radio.json'
url_radio_data_fingerprint = 'https://radioexplorations.ch/study-2/data/df_radio_data.json'
url_fma_data_noise = 'https://radioexplorations.ch/study-2/data/df_fma_entropy_data.json'
url_fma_data_fingerprint = 'https://radioexplorations.ch/study-2/data/df_small_data.json'


def fetch_json(url, timeout=30):
    """Download ``url`` and return its decoded JSON body.

    Raises ``requests.HTTPError`` on a non-2xx response, so a failed download
    stops here instead of surfacing later as a confusing decode or KeyError.
    The timeout keeps a stalled connection from hanging the notebook.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


# get radio noise dataset
radio_data_noise = fetch_json(url_radio_data_noise)
radio_data_noise.pop('bmus-proj')  # dropped before the frame is built
df_radio_noise = pd.DataFrame.from_dict(radio_data_noise)

# get radio fingerprint dataset
radio_data_fingerprint = fetch_json(url_radio_data_fingerprint)
radio_data_fingerprint.pop('bmus-proj')  # dropped before the frame is built
df_radio_fingerprint = pd.DataFrame.from_dict(radio_data_fingerprint)

# get fma noise dataset
fma_data_noise = fetch_json(url_fma_data_noise)
df_fma_noise = pd.DataFrame.from_dict(fma_data_noise)

# get fma fingerprint dataset
fma_data_fingerprint = fetch_json(url_fma_data_fingerprint)
df_fma_fingerprint = pd.DataFrame.from_dict(fma_data_fingerprint)


In [None]:
# @markdown Display the training data
# NOTE(review): in the train/test split cell of this notebook `train_data` is
# built as a Python list, which has no `.reshape`; this cell presumably dates
# from an earlier version where train_data was an array with
# shape_x * shape_y * 3 values -- confirm before reuse.
fig, ax = plt.subplots(
    nrows=1, ncols=1, figsize=(12, 3.5), 
    subplot_kw=dict(xticks=[], yticks=[])
    )
shape_x = 50 # @param{type:"integer"}
shape_y = 60 # @param{type:"integer"}
# Renders train_data as a shape_x-by-shape_y image with 3 channels.
ax.imshow(train_data.reshape(shape_x, shape_y, 3))
ax.title.set_text('Training Data')

In [None]:
# @title Project Data
# test_data = rand.randint(20, 200, (3000, 3))
# result = som.transform(test_data)


In [None]:
# Run the training vectors through the SOM; the next cell reads `result`.
result = som.transform(train_data)

In [None]:
# Scatter the first two components of every entry in `result`.
X = [x[0] for x in result]
Y = [x[1] for x in result]
# NOTE(review): `result` comes from train_data but the colours come from
# test_data, which by this point may be a list or a DataFrame depending on
# which cells ran; the division by 255 below presumably expects 0-255 RGB
# rows -- confirm before reuse.
C = test_data
fig = plt.figure()
ax = fig.add_subplot(111)

ax.scatter(X, Y, c = C/255.0)
plt.show()