<a href="https://colab.research.google.com/github/jazoza/mad/blob/main/04_MAD_SOM_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Making Arguments with Data

Experiments with clustering and projecting

## Installing and importing necessary packages

In [None]:
!pip install susi

In [None]:
import json
import math
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import susi
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file, save
from susi.SOMPlots import plot_nbh_dist_weight_matrix, plot_umatrix, plot_estimation_map, plot_som_histogram
output_notebook()

## Add .pkl files to your own GDrive

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive so the .pkl files can be read through regular file paths.
drive.mount("/content/drive")

## Getting the path to files
Once your GDrive is mounted, find the folder containing the .pkl files that were provided. Then right-click that folder and select `Copy path`.
Paste it in the cell below, replacing the path that follows the `%cd` command.

In [None]:
# Print the current working directory, change into the Drive root, then list its contents.
! pwd #check working directory, find where your files are
%cd /content/drive/MyDrive/
!ls

In [None]:
# Folder holding the .pkl files. Keep the trailing slash: later cells build file paths
# by appending the file name directly to this string.
folder_path = '/content/drive/MyDrive/datasets-SOM/' # set the path to your files

In [None]:
radio_data_path = folder_path+'radio.pkl'
fma_data_path = folder_path+'fma.pkl'
xenocanto_data_path = folder_path+'xenocanto.pkl'


def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed again
    as soon as the object has been read.

    NOTE: unpickling can execute arbitrary code. Only load .pkl files
    that come from a source you trust.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


radio_data = load_pickle(radio_data_path)
# Tag every radio sample as 'known' or 'unknown', depending on whether its
# audio path contains a './known/' folder.
radio_data['known'] = [
    'known' if './known/' in path else 'unknown'
    for path in radio_data['Sample_audio']
]

fma_data = load_pickle(fma_data_path)

xenocanto_data = load_pickle(xenocanto_data_path)

# Lookup table used by the dataset selectors further down.
datasets = {
    "fma": fma_data,
    "radio": radio_data,
    "xenocanto": xenocanto_data,
}

In [None]:
# Preview the first rows only; displaying the whole dataset floods the cell output.
fma_data.head()

## Load SOM

In [None]:
# @title Choose the data you want to work with
# @markdown `You can select between FMA, Xenocanto and Radio`
selector = "fma" # @param ["fma", "radio", "xenocanto"]
current_df = datasets[selector]
data_length = len(current_df)
# The first three quarters of the rows are the training part, the rest is the test part.
train_amount = data_length // 4 * 3

# .loc slices by label and includes both ends, so this relies on the frame having
# the default 0..n-1 index. The .copy() makes both parts independent frames:
# later cells add columns to current_df_test, which would otherwise trigger
# pandas' SettingWithCopyWarning.
current_df_train = current_df.loc[0:train_amount-1].copy()
current_df_test = current_df.loc[train_amount:data_length-1].copy()

# Flatten every MFCC entry into a 1-D vector.
train_data = [np.ravel(item) for item in current_df_train['mfcc']]
test_data = [np.ravel(item) for item in current_df_test['mfcc']]

In [None]:
# update dataframe
projection = som.transform(test_data)
X = [x[0] for x in projection]
Y = [x[1] for x in projection]

current_df_test['proj_x'] = X
current_df_test['proj_y'] = Y
proj = [[x[0], x[1]] for x in projection]
current_df_test['proj'] = proj

In [None]:
#@markdown `Be sure to be in the correct folder!`
som_path = folder_path+'som_fma.pkl' #@param {type: "string"}
som = pickle.load(open(som_path, 'rb'))

## Visualizing the SOM using Bokeh

In [None]:
# @title Build Color palette utility { display-mode: "form" }
# @markdown These few lines of code build the color palette and assign a color to every row
import random
def build_color_palette(unique):
    """Build a mapping from every value in `unique` to a random hex color.

    Missing values (None or NaN) are stored under the key "Unknown".

    Args:
        unique: iterable of the distinct values that need a color.

    Returns:
        dict mapping each value to a color string of the form "#rrggbb".
        The palette is also printed.
    """
    result = {}
    for name in unique:
        # isinstance (instead of comparing the type's name) also recognises
        # str subclasses, which must not be passed to float().
        if not isinstance(name, str) and (name is None or math.isnan(float(name))):
            name = "Unknown"
        # Generating a random number in between 0 and 2^24
        color = random.randrange(0, 2**24)
        # Zero-pad to six hex digits: without the padding, small numbers give
        # strings such as "#1a2b3", which are not valid colors.
        result[name] = "#{:06x}".format(color)
    print(result)
    return result

def assign_colors(df, palette, header):
    """Add a 'colors' column to `df` with the palette color of each row.

    Args:
        df: table to color; it is modified in place.
        palette: mapping from value to color, e.g. from build_color_palette.
        header: name of the column whose values select the color.

    Missing values (None or NaN) are looked up under the key "Unknown".

    Raises:
        KeyError: if a value of `df[header]` has no entry in `palette`.
    """
    colors = []
    for item in df[header]:
        # isinstance (instead of comparing the type's name) also recognises
        # str subclasses, which must not be passed to float().
        if not isinstance(item, str) and (item is None or math.isnan(float(item))):
            item = "Unknown"
        colors.append(palette[item])
    df['colors'] = colors


In [None]:
# @title Make test data based on multiple categories { display-mode: "form" }
# @markdown `please insert the values separated by a space " "`
category = 'Rock Instrumental Folk' #@param {type:"string"}
# @markdown `column of the test dataset in which the values are looked up`
category_field = 'Genre' #@param {type:"string"}
# The column is a parameter of this cell: the `field` variable is only defined
# by a cell further down, so it does not exist yet when the notebook is run in order.
category = category.split()
print(category)
# Keep only the test rows whose value in the chosen column is one of the categories.
test_data = current_df_test.loc[current_df_test[category_field].isin(category)]

In [None]:
# @title Make test data based on rows from the test dataset { display-mode: "form" }
# @markdown `depending on the dataset you might have smaller boundaries`

# @markdown `so to say less than 199 datapoints`
start = 0 #@param {type:"number"}
end = 199 #@param {type:"number"}
# Never go past offset 999, whatever was typed into the form.
end = min(end, 999)
print(end)
# Offsets are relative to the first test row, whose label is train_amount;
# the .loc slice includes both end points.
first_label = train_amount + start
last_label = train_amount + end
test_data = current_df_test.loc[first_label:last_label]

### Build Visualization with Bokeh

In [None]:
# Select the field (a column of the test dataset) that colors the points
# and is shown in the hover tooltip of the visualization
field = "Genre" #@param {type: "string"}
unique_values = current_df_test[field].unique()
# Number of distinct values, i.e. how many palette entries are needed
print(len(unique_values))
# One random color per distinct value, then a `colors` column is added to current_df_test
palette = build_color_palette(unique_values)
assign_colors(current_df_test, palette, field)

In [None]:
TITLE = "SOM Visualization"
TOOLS = "hover,pan,wheel_zoom,box_zoom,reset,save"


p = figure(tools=TOOLS, toolbar_location="above", title=TITLE)
p.toolbar.logo = "grey"
p.background_fill_color = "#efefef"
p.xaxis.axis_label = "X-axis"
p.yaxis.axis_label = "Y-axis"
p.grid.grid_line_color = "white"
# The braces let the tooltip reference column names that contain spaces.
p.hover.tooltips = [
    (field, "@{" + field + "}"),
]

# Look the selected rows up again in current_df_test: the `colors` column is added
# to that frame by the palette cell, which runs after the selection cells, so the
# `test_data` selection itself may not carry it. If no selection was made
# (test_data is not a DataFrame), the whole test set is plotted.
if isinstance(test_data, pd.DataFrame):
    plot_df = current_df_test.loc[test_data.index]
else:
    plot_df = current_df_test
# Only hand the columns the plot uses to Bokeh, so the saved HTML stays small.
plot_columns = list(dict.fromkeys(["proj_x", "proj_y", "colors", field]))
source = ColumnDataSource(plot_df[plot_columns])

p.scatter("proj_x", "proj_y", size=12, source=source, color="colors",line_color="black", alpha=0.9)

# x, y and text must name columns of `source`. The previous values
# ("X-axis", "Y-axis", "symbol") are not columns, so no label could be placed.
labels = LabelSet(x="proj_x", y="proj_y", text=field, y_offset=8,
                  text_font_size="11px", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

# @markdown Type here the name of plot
plotname = folder_path+'visualisations/index.html' #@param {type: "string"}
# @markdown This will be saved in your GDrive in the same folder where the .pkl files are stored

# @markdown `Note that if you use the same file name the previous plot will be overwritten`
# Create the target folder first; saving fails if it does not exist.
plot_dir = os.path.dirname(plotname)
if plot_dir:
    os.makedirs(plot_dir, exist_ok=True)
output_file(plotname)
save(p)

## Projecting Other Datasets

In [None]:
# @title Prepare the Dataset to be projected
# @markdown Select the Dataset to project on the SOM
selector = "xenocanto" # @param ["fma", "radio", "xenocanto"]
projection_df = datasets[selector]
# Flatten every MFCC entry into a 1-D vector.
projection_features = [np.ravel(item) for item in projection_df['mfcc']]

# Flag every row whose feature vector contains a NaN anywhere (not only in its
# first element).
has_nan = np.array([bool(np.isnan(f).any()) for f in projection_features], dtype=bool)
nan_indexes = np.flatnonzero(has_nan).tolist()

#print(nan_indexes)

# Filter both the frame and the features with the same positional mask, so they
# stay aligned whatever the index labels of the frame are. The .copy() keeps the
# frame stored in `datasets` untouched when columns are added below.
projection_df = projection_df.loc[~has_nan].copy()
projection_features = np.asarray(projection_features)[~has_nan]
#print(len(projection_features))
#print(len(projection_df))

projection = som.transform(projection_features)
X = [x[0] for x in projection]
Y = [x[1] for x in projection]

projection_df['proj_x'] = X
projection_df['proj_y'] = Y
proj = [[x[0], x[1]] for x in projection]
projection_df['proj'] = proj

In [None]:
# @title List all the Columns of Projection Dataset { display-mode: "form" }
# Column names of the projection dataset as a plain Python list.
projection_df.columns.tolist()

In [None]:
# @title Choose the field { display-mode: "form" }
# Column of the projection dataset whose values decide the color of each point
projection_field = "en" # @param {type: "string"}

# TODO: rows with a missing value in the chosen field are not dropped here;
# build_color_palette / assign_colors put NaN values under the "Unknown" key instead.

unique_values = projection_df[projection_field].unique()
# print(unique_values)
# One random color per distinct value, then a `colors` column is added to projection_df
palette = build_color_palette(unique_values)
assign_colors(projection_df, palette, projection_field)

In [None]:
# @title Build Visualization with Bokeh

TITLE = "SOM Visualization"
TOOLS = "hover,pan,wheel_zoom,box_zoom,reset,save"

p = figure(tools=TOOLS, toolbar_location="above", width=1200, title=TITLE)
p.toolbar.logo = "grey"
p.background_fill_color = "#efefef"
p.xaxis.axis_label = "X-axis"
p.yaxis.axis_label = "Y-axis"
p.grid.grid_line_color = "white"

# @markdown `add multiple fields to be shown on hovering as space " " separated list`
hover_list = 'gen date' # @param {type:"string"}
hover_list = hover_list.split()
hover_list.append(projection_field)
hover_list.append('colors')
# Drop duplicates (keeping the order) and names that are not columns of the
# selected dataset, so the tooltip never references a missing column.
hover_list = [name for name in dict.fromkeys(hover_list) if name in projection_df.columns]
# The braces let the tooltip reference column names that contain spaces.
p.hover.tooltips = [(name, "@{" + name + "}") for name in hover_list]

# Plot projection_df directly and only hand the columns the plot uses to Bokeh,
# so the saved HTML stays small.
plot_columns = list(dict.fromkeys(["proj_x", "proj_y", "colors"] + hover_list))
source = ColumnDataSource(projection_df[plot_columns])

p.scatter("proj_x", "proj_y", size=12, source=source, color="colors",line_color="black", alpha=0.9)

# x, y and text must name columns of `source`. The previous values
# ("X-axis", "Y-axis", "symbol") are not columns, so no label could be placed.
labels = LabelSet(x="proj_x", y="proj_y", text=projection_field, y_offset=8,
                  text_font_size="11px", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

# @markdown Type here the name of plot
plotname = folder_path+'visualisations/projection_plot.html' #@param {type: "string"}
# @markdown This will be saved in your GDrive in the same folder where the .pkl files are stored

# @markdown `Note that if you use the same file name the previous plot will be overwritten`
# Create the target folder first; saving fails if it does not exist.
plot_dir = os.path.dirname(plotname)
if plot_dir:
    os.makedirs(plot_dir, exist_ok=True)
output_file(plotname)
save(p)
