In [1]:
%cd ..
#%load_ext autoreload
#%autoreload 2
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import librosa
import numpy as np
from glob import glob
import os
from tqdm import tqdm
from evoaudio.population import Population
from evoaudio.sample_library import SampleLibrary
from evoaudio.feature_extraction import extract_features_for_window, extract_features_for_windows


c:\Users\justi\coding\Uni\MA


  from .autonotebook import tqdm as notebook_tqdm


### Population to feature vector of shape (n_features)

"The following statistics are proposed as approximative features.  
For each instrument and each onset in the approximated
music track, we save the smallest distance between the best
candidate mixture which contains this instrument and the onset
to approximate.  
(pop.best_collections_per_onset)  
  
These smallest distances are kept during the
complete evolutionary loop in an archive and do not represent
the final population only. (TODO!) 
    
Then, we estimate the mean, the
minimum, and the maximum values for each of 51 instrument
and 88 theoretically possible pitches for two different analysis
frames of 10s and 3s. Additionally, we sort the recognised
instruments based on the smallest distances, and assign ranks
to corresponding approximative features, e.g., value of “rang
of acoustic guitar” = 1 means that acoustic guitar had the
smallest mean distance between approximations with this
instrument and unknown onsets in the analysis frame. This
leads to an overall number of feature dimensions equal to
(51 · 3 + 88 · 3 + 51) · 2 = 936."

In [2]:
pop = Population.from_file("30k_gen_nutcracker.pkl")
target, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
lib = SampleLibrary()

Loading samples: 100%|██████████| 6478/6478 [00:16<00:00, 403.26it/s]


In [3]:
## Choose two different analysis frames of 10s and 3s
end_offset = 10 * sr
possible_onsets = len(target) - end_offset
window_start = np.random.randint(low=0, high=possible_onsets)
window_end = window_start + end_offset

## Grab the onsets in those windows, and the associated best records from the population
relevant_collections = [collection for collection in pop.best_collections_per_onset.values() if collection.onset in range(window_start, window_end)]

## For each window separately, calculate the maximum, minimum, and mean fitnesses for each occasion of
## An instrument
instrument_occurrences_fitness = dict()
## A pitch
pitch_occurrences_fitness = dict()

for collection in relevant_collections:
    for sample in collection.samples:
        if sample.instrument in instrument_occurrences_fitness:
            instrument_occurrences_fitness[sample.instrument].append(collection.fitness)
        else:
            instrument_occurrences_fitness[sample.instrument] = [collection.fitness]
        if sample.pitch in pitch_occurrences_fitness:
            pitch_occurrences_fitness[sample.pitch].append(collection.fitness)
        else:
            pitch_occurrences_fitness[sample.pitch] = [collection.fitness]

# print(instrument_occurrences_fitness)
# print(pitch_occurrences_fitness)

instrument_min = {instrument: np.min(instrument_occurrences_fitness[instrument]) for instrument in instrument_occurrences_fitness}
instrument_max = {instrument: np.max(instrument_occurrences_fitness[instrument]) for instrument in instrument_occurrences_fitness}
instrument_mean = {instrument: np.mean(instrument_occurrences_fitness[instrument]) for instrument in instrument_occurrences_fitness}

pitch_min = {pitch: np.min(pitch_occurrences_fitness[pitch]) for pitch in pitch_occurrences_fitness}
pitch_max = {pitch: np.max(pitch_occurrences_fitness[pitch]) for pitch in pitch_occurrences_fitness}
pitch_mean = {pitch: np.mean(pitch_occurrences_fitness[pitch]) for pitch in pitch_occurrences_fitness}

## Finally, give each instrument a rank from 1 to n_instruments, based on their mean distances (smallest = rank 1, highest = rank n_instruments)
# instrument_sort = np.argsort([instrument_mean[instrument] for instrument in instrument_mean])
instrument_sort = {k: v for k, v in sorted(instrument_mean.items(), key=lambda item: item[1])}
instrument_ranks = {instrument: i + 1 for i, instrument in enumerate(instrument_sort)}

# Create feature vector
instrument_features = []
for instrument_info in lib.instruments:
    instr_name = instrument_info.name
    if instr_name in instrument_ranks:
        instrument_features.append([instrument_min[instr_name], instrument_mean[instr_name], instrument_max[instr_name], instrument_ranks[instr_name]])
    else:
        instrument_features.append([np.inf, np.inf, np.inf, np.inf])
pitch_features = []
for pitch in lib.pitches:
    if pitch in pitch_min:
        pitch_features.append([pitch_min[pitch], pitch_mean[pitch], pitch_max[pitch]])
    else:
        pitch_features.append([np.inf, np.inf, np.inf])
flat_instr_features = np.array(instrument_features).flatten()
flat_pitch_features = np.array(pitch_features).flatten()
features = np.concatenate((flat_instr_features, flat_pitch_features))
print(f"{len(features)} features generated from {len(lib.instruments)} instruments and {len(lib.pitches)} pitches.")


467 features generated from 50 instruments and 89 pitches.


Single Window

In [6]:
end_offset = 10 * sr
possible_onsets = len(target) - end_offset
window_start = np.random.randint(low=0, high=possible_onsets)
window_end = window_start + end_offset
features = extract_features_for_window(pop, lib, window_start, window_end)
print(f"{len(features)} features generated from {len(lib.instruments)} instruments and {len(lib.pitches)} pitches.")

467 features generated from 50 instruments and 89 pitches.


Multiple Windows

In [4]:
features = extract_features_for_windows(pop=pop, lib=lib, window_lengths=[3, 10], n_total_samples=len(target), sr=sr)
print(f"{len(features)} features generated from {len(lib.instruments)} instruments and {len(lib.pitches)} pitches.")

934 features generated from 50 instruments and 89 pitches.


In [2]:
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]]))

[1]


In [5]:
X = [[0], [1], [2]]
y = ["a", "b", "c"]
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[2]]))

['c']


### Training of Random Forests from Saved Populations

In [2]:
lib = SampleLibrary()

Loading samples: 100%|██████████| 6826/6826 [00:09<00:00, 688.63it/s] 


For 10k generations

In [3]:
from glob import glob
import os

from tqdm import tqdm
# Load populations, Load Songs
all_popfiles = glob("./experiments/1517_artists/300_1_10000_0.05_5_10_1_20_0.9954_15_1sec/*.pkl")
popfiles = [file.replace("Hip-Hop", "Hip_Hop") for file in all_popfiles if "500gens" not in file and ".logger" not in file]
labels = [os.path.basename(file.split("-")[0]) for file in popfiles]
songnames = [file.split("-", 1)[1].split(".")[0] for file in popfiles]
# all_soundfiles = glob("./audio/1517-Artists/**/*.mp3", recursive=True)
# soundfiles = []
# for name in songnames:
#     for file in all_soundfiles:
#         if name in file:
#             soundfiles.append(file)
#             break
# lib = SampleLibrary()
pops = [Population.from_file(popfile.replace("Hip_Hop", "Hip-Hop")) for popfile in popfiles]
# songs = [librosa.load(soundfile)[0] for soundfile in soundfiles]
sr = 22050

# Separate the archive into 10s and 3s chunks

# Create feature vectors for pops for windows of 10s and 3s across the song
def pop_to_window_vectors(pop:Population, song_length:int, window_length_s:int, lib:SampleLibrary, sr:int=22050):
    window_length = sr*window_length_s
    window_indices = list(range(0, song_length, window_length))
    all_features = [extract_features_for_window(
            pop=pop, lib=lib, 
            window_start=window_idx, 
            window_end=min(window_idx+window_length, song_length)) 
        for window_idx in window_indices]
    return np.array(all_features)

# features_10s = pop_to_window_vectors(pop=pop, song_length=len(song), window_length_s=10, lib=lib, sr=sr)
# features_3s = pop_to_window_vectors(pop=pop, song_length=len(song), window_length_s=3, lib=lib, sr=sr)

# Divide songs into 4s windows with 2s overlap
def get_features_for_time_window(features, feature_window_length_s, window_start_s, window_end_s):
    # Calc in which window we start
    start_idx = window_start_s // feature_window_length_s
    # Calc in which window we end
    end_idx = window_end_s // feature_window_length_s
    # Get all features in between
    window_features = features[start_idx:(end_idx+1)]
    # Calc mean between feature vectors
    return np.mean(window_features, axis=0)

# song_feature_matrix = np.array([np.concatenate([
#     get_features_for_time_window(features_10s, 10, i, i+4), 
#     get_features_for_time_window(features_3s, 3, i, i+4)]) 
#     for i in range(0, len(song), sr*2)])

# # Create train and test sets
# X = song_feature_matrix
# y = np.repeat(labels[0], len(X))
# # Train Random Forests with 100 trees
# song_feature_matrix.shape

KeyboardInterrupt: 

### Feature Generation Wrapper Functions

In [4]:
def pop_to_window_vectors(pop:Population, song_length:int, window_length_s:int, lib:SampleLibrary, sr:int=22050):
    window_length = sr*window_length_s
    window_indices = list(range(0, song_length, window_length))
    all_features = [extract_features_for_window(
            pop=pop, lib=lib, 
            window_start=window_idx, 
            window_end=min(window_idx+window_length, song_length)) 
        for window_idx in window_indices]
    return np.array(all_features)

def get_features_for_time_window(features, feature_window_length_s, window_start_s, window_end_s):
    # Calc in which window we start
    start_idx = window_start_s // feature_window_length_s
    # Calc in which window we end
    end_idx = window_end_s // feature_window_length_s
    # Get all features in between
    window_features = features[start_idx:(end_idx+1)]
    # Calc mean between feature vectors
    return np.mean(window_features, axis=0)

def get_xy_for_pop(pop:Population, label:str, song_length:int, sr:int=22050):
    features_10s = pop_to_window_vectors(pop=pop, song_length=song_length, window_length_s=10, lib=lib, sr=sr)
    features_3s = pop_to_window_vectors(pop=pop, song_length=song_length, window_length_s=3, lib=lib, sr=sr)
    song_feature_matrix = np.array([np.concatenate([
        get_features_for_time_window(features_10s, 10, i, i+4), 
        get_features_for_time_window(features_3s, 3, i, i+4)]) 
        for i in range(0, int(song_length/sr), 2)])
    return song_feature_matrix, np.repeat(label, len(song_feature_matrix))

In [104]:
# Get Population files and correct song labels
# all_popfiles = glob("./experiments/1517_artists/300_1_10000_0.05_5_10_1_20_0.9954_15_1sec/*.pkl")
# Initial Populations (0 Generations)
all_popfiles = glob("./experiments/1517_artists/300_1_0_0.05_5_10_1_20_0.9954_15_1sec/*.pkl")
# For 500 Generations
# popfiles = [file.replace("\\Hip_Hop-", "\\Hip-Hop-") for file in all_popfiles if "500gens" in file and ".logger" not in file]
# For 10k Generations
popfiles = [file.replace("\\Hip_Hop-", "\\Hip-Hop-") for file in all_popfiles if "500gens" not in file and ".logger" not in file]
labels = [os.path.basename(file.split("-")[0]) for file in popfiles]
songnames = [file.split("-", 1)[1].split(".")[0] for file in popfiles]
sr = 22050
# Split songs into train/test sets so that we can later get errors per genre label
# Get songs per label
songs_per_label = {label: 0 for label in set(labels)}
for i in range(len(popfiles)):
    songs_per_label[labels[i]] += 1
# Shuffle
rnd_idx = list(range(len(popfiles)))
np.random.shuffle(rnd_idx)
# 80/20 Train/test split
popfiles_train = np.take(popfiles, rnd_idx[:int(0.8*len(rnd_idx))])
popfiles_test = np.take(popfiles, rnd_idx[int(0.8*len(rnd_idx)):])
labels_train = np.take(labels, rnd_idx[:int(0.8*len(rnd_idx))])
labels_test = np.take(labels, rnd_idx[int(0.8*len(rnd_idx)):])


In [91]:
# Build X, y
x_train = []
y_train = []
# Train Set
for i, popfile in enumerate(tqdm(popfiles_train)):
    pop = Population.from_file(popfile, expand=False)
    label = labels_train[i]
    song_length = max(pop.archive.keys()) + 1
    x, y_labels = get_xy_for_pop(pop=pop, label=label, song_length=song_length, sr=sr)
    x_train.append(x)
    y_train.append(y_labels)

x_train = np.concatenate(x_train)
y_train = np.concatenate(y_train)

100%|██████████| 2449/2449 [02:35<00:00, 15.79it/s]


In [92]:
from sklearn.utils import shuffle
x_train, y_train = shuffle(x_train, y_train, random_state=0)

In [93]:
clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=0)
clf.fit(x_train, y_train)

In [94]:
# Split Test Set by Song, Label
x_test_by_song = []
y_test_by_song = []
for i, popfile in enumerate(tqdm(popfiles_test)):
    pop = Population.from_file(popfile, expand=False)
    label = labels_test[i]
    song_length = max(pop.archive.keys()) + 1
    x, y_labels = get_xy_for_pop(pop=pop, label=label, song_length=song_length, sr=sr)
    x_test_by_song.append(x)
    y_test_by_song.append(y_labels)

100%|██████████| 613/613 [00:39<00:00, 15.63it/s]


In [11]:
# Majority Voting across the windows of a piece
def majority_voting(votes):
    counts = {vote: 0 for vote in set(votes)}
    for vote in votes:
        counts[vote] += 1
    return list(counts.keys())[np.argmax(list(counts.values()))]

In [95]:
# Calculate errors for entire songs, per label
preds_per_label = {label: [[],[]] for label in labels}
y_true = []
y_pred = []
for i, song_x in enumerate(tqdm(x_test_by_song)):
    song_y = y_test_by_song[i]
    song_label = y_test_by_song[i][0]
    votes = clf.predict(song_x)
    vote_winner = majority_voting(votes)
    preds_per_label[song_label][0].append(song_label)
    preds_per_label[song_label][1].append(vote_winner)
    y_true.append(song_label)
    y_pred.append(vote_winner)



100%|██████████| 613/613 [00:05<00:00, 120.22it/s]


In [96]:
from sklearn.metrics import f1_score, precision_score, recall_score
f1_score(y_true, y_pred, average='micro', labels=labels)

0.11141945124282034

In [97]:
# f1 per label
f1_per_label = dict()
recall_per_label = dict()
precision_per_label = dict()
for label in preds_per_label:
    y_true, y_pred = preds_per_label[label]
    f1_per_label[label] = f1_score(y_true, y_pred, average='micro', labels=labels)
    recall_per_label[label] = recall_score(y_true, y_pred, average='micro', labels=labels)
    precision_per_label[label] = precision_score(y_true, y_pred, average='micro', labels=labels)
f1_per_label, recall_per_label, precision_per_label

({'Alternative_and_Punk': 0.157651376146789,
  'Blues': 0.030862592110966626,
  'Childrenss': 0.0,
  'Classical': 0.4247817327065144,
  'Comedy_and_Spoken_Word': 0.0,
  'Country': 0.0,
  'Easy_Listening_and_Vocals': 0.033690360272638754,
  'Electronic_and_Dance': 0.3972667742241625,
  'Folk': 0.3279569892473118,
  'Hip': 0.0,
  'Jazz': 0.3747688114474837,
  'Latin': 0.0,
  'New_Age': 0.12556122060725974,
  'Reggae': 0.12920721008135266,
  'Religious': 0.0,
  'Rock_and_Pop': 0.1076120959332638,
  'R_and_B_and_Soul': 0.024248584289240597,
  'Soundtracks_and_More': 0.0,
  'World': 0.0},
 {'Alternative_and_Punk': 0.15384615384615385,
  'Blues': 0.030303030303030304,
  'Childrenss': 0.0,
  'Classical': 0.4782608695652174,
  'Comedy_and_Spoken_Word': 0.0,
  'Country': 0.0,
  'Easy_Listening_and_Vocals': 0.03225806451612903,
  'Electronic_and_Dance': 0.40625,
  'Folk': 0.3157894736842105,
  'Hip': 0.0,
  'Jazz': 0.36666666666666664,
  'Latin': 0.0,
  'New_Age': 0.12195121951219512,
  'Reggae'

In [105]:
#calc fitnesses
label_fitnesses = {label: [] for label in set(labels)}
discarded = []
for i, popfile in enumerate(tqdm(popfiles)):
    pop = Population.from_file(popfile, expand=False)
    fitnesses = [record.fitness for record in pop.archive.values()]
    mean_fitness = np.mean(fitnesses)
    if mean_fitness < 10**4:
        label_fitnesses[labels[i]].append(mean_fitness)
    else:
        discarded.append(mean_fitness)

100%|██████████| 3067/3067 [00:26<00:00, 115.90it/s]


In [107]:
mean_label_fitnesses = {label: np.mean(label_fitnesses[label]) for label in label_fitnesses}
mean_label_fitnesses

{'Country': 35.994076656258535,
 'World': 3.7500810689524666,
 'Jazz': 69.67296040641808,
 'Soundtracks_and_More': 2.0275951983369707,
 'Blues': 12.86501812049024,
 'Classical': 4.618699124432691,
 'Folk': 10.717715124340243,
 'R_and_B_and_Soul': 63.33962704998383,
 'Latin': 8.294410389258113,
 'Reggae': 64.06526524509258,
 'Religious': 8.798272774565245,
 'Hip': 2.15256387016661,
 'Easy_Listening_and_Vocals': 22.836141936281187,
 'Electronic_and_Dance': 26.342626401274977,
 'Alternative_and_Punk': 6.926078724505477,
 'New_Age': 51.7937330441161,
 'Childrenss': 3.484386581495564,
 'Comedy_and_Spoken_Word': 2.855394249713554,
 'Rock_and_Pop': 25.383478351013228}

In [100]:
import pandas as pd
df = pd.DataFrame(list(f1_per_label.values()), index=list(f1_per_label.keys()), columns=["F1_Score"])
df["Precision"] = list(precision_per_label.values())
df["Recall"] = list(recall_per_label.values())
df["Fitness"] = mean_label_fitnesses
df

Unnamed: 0,F1_Score,Precision,Recall,Fitness
Alternative_and_Punk,0.157651,0.16165,0.153846,5.443492
Blues,0.030863,0.031443,0.030303,70.618356
Childrenss,0.0,0.0,0.0,2.959346
Classical,0.424782,0.38206,0.478261,4.803002
Comedy_and_Spoken_Word,0.0,0.0,0.0,2.320552
Country,0.0,0.0,0.0,54.933176
Easy_Listening_and_Vocals,0.03369,0.035256,0.032258,34.42513
Electronic_and_Dance,0.397267,0.388672,0.40625,22.79277
Folk,0.327957,0.3411,0.315789,58.211149
Hip,0.0,0.0,0.0,2.107658


In [103]:
with open("./table500.tex", "w") as fp:
    df.round(3).to_latex(buf=fp, index=False)

  df.round(3).to_latex(buf=fp, index=False)


In [112]:
# Combine all tables
df = pd.read_csv('table0.tex',
                 sep='&',
                 header=None,
                 skiprows=4,
                 skipfooter=5,
                 engine='python')
df

Unnamed: 0,0,1,2,3
0,0.125,0.129,0.121,6.926 \\
1,0.000,0.0,0.0,12.865 \\
2,0.000,0.0,0.0,3.484 \\
3,0.575,0.532,0.625,4.618 \\
4,0.000,0.0,0.0,2.855 \\
5,0.000,0.0,0.0,35.994 \\
6,0.000,0.0,0.0,22.836 \\
7,0.375,0.368,0.382,26.342 \\
8,0.000,0.0,0.0,10.717 \\
9,0.036,0.034,0.038,2.152 \\


In [13]:
mean_errors_per_label = {label: np.mean(errors) for label, errors in errors_per_label.items()}
mean_errors_per_label

{'Alternative_and_Punk': 0.8536585365853658,
 'Blues': 1.0,
 'Childrenss': 1.0,
 'Classical': 0.2916666666666667,
 'Comedy_and_Spoken_Word': 0.9473684210526315,
 'Country': 1.0,
 'Easy_Listening_and_Vocals': 0.9696969696969697,
 'Electronic_and_Dance': 0.6764705882352942,
 'Folk': 1.0,
 'Hip': 0.9615384615384616,
 'Jazz': 0.45161290322580644,
 'Latin': 1.0,
 'New_Age': 0.7419354838709677,
 'Reggae': 0.9166666666666666,
 'Religious': 1.0,
 'Rock_and_Pop': 0.8857142857142857,
 'R_and_B_and_Soul': 0.8108108108108109,
 'Soundtracks_and_More': 1.0,
 'World': 1.0}

In [14]:
mean_error_majority_votes = np.mean(list(mean_errors_per_label.values()))
mean_error_majority_votes

0.8687968312665223

Errors per window, irrespective of song

In [45]:
x_test = np.concatenate(x_test_by_song)
y_test = np.concatenate(y_test_by_song)
predictions = clf.predict(x_test)
errors = [1*(not predictions[i] == y_test[i]) for i in range(len(predictions))]
np.mean(errors)

0.8738395894597244