In [1]:
%cd ..
#%load_ext autoreload
#%autoreload 2
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import librosa
import numpy as np

from evoaudio.population import Population
from evoaudio.sample_library import SampleLibrary
from evoaudio.feature_extraction import extract_features_for_window, extract_features_for_windows


c:\Users\justi\coding\Uni\MA


  from .autonotebook import tqdm as notebook_tqdm


### Population to feature vector of shape (n_features)

"The following statistics are proposed as approximative features.  
For each instrument and each onset in the approximated
music track, we save the smallest distance between the best
candidate mixture which contains this instrument and the onset
to approximate.  
(pop.best_collections_per_onset)  
  
These smallest distances are kept during the
complete evolutionary loop in an archive and do not represent
the final population only. (TODO!) 
    
Then, we estimate the mean, the
minimum, and the maximum values for each of 51 instrument
and 88 theoretically possible pitches for two different analysis
frames of 10s and 3s. Additionally, we sort the recognised
instruments based on the smallest distances, and assign ranks
to corresponding approximative features, e.g., value of “rang
of acoustic guitar” = 1 means that acoustic guitar had the
smallest mean distance between approximations with this
instrument and unknown onsets in the analysis frame. This
leads to an overall number of feature dimensions equal to
(51 · 3 + 88 · 3 + 51) · 2 = 936."

In [2]:
pop = Population.from_file("30k_gen_nutcracker.pkl")
target, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
lib = SampleLibrary()

Loading samples: 100%|██████████| 6478/6478 [00:16<00:00, 403.26it/s]


In [3]:
## Choose two different analysis frames of 10s and 3s
end_offset = 10 * sr
possible_onsets = len(target) - end_offset
window_start = np.random.randint(low=0, high=possible_onsets)
window_end = window_start + end_offset

## Grab the onsets in those windows, and the associated best records from the population
relevant_collections = [collection for collection in pop.best_collections_per_onset.values() if collection.onset in range(window_start, window_end)]

## For each window separately, calculate the maximum, minimum, and mean fitnesses for each occasion of
## An instrument
instrument_occurrences_fitness = dict()
## A pitch
pitch_occurrences_fitness = dict()

for collection in relevant_collections:
    for sample in collection.samples:
        if sample.instrument in instrument_occurrences_fitness:
            instrument_occurrences_fitness[sample.instrument].append(collection.fitness)
        else:
            instrument_occurrences_fitness[sample.instrument] = [collection.fitness]
        if sample.pitch in pitch_occurrences_fitness:
            pitch_occurrences_fitness[sample.pitch].append(collection.fitness)
        else:
            pitch_occurrences_fitness[sample.pitch] = [collection.fitness]

# print(instrument_occurrences_fitness)
# print(pitch_occurrences_fitness)

instrument_min = {instrument: np.min(instrument_occurrences_fitness[instrument]) for instrument in instrument_occurrences_fitness}
instrument_max = {instrument: np.max(instrument_occurrences_fitness[instrument]) for instrument in instrument_occurrences_fitness}
instrument_mean = {instrument: np.mean(instrument_occurrences_fitness[instrument]) for instrument in instrument_occurrences_fitness}

pitch_min = {pitch: np.min(pitch_occurrences_fitness[pitch]) for pitch in pitch_occurrences_fitness}
pitch_max = {pitch: np.max(pitch_occurrences_fitness[pitch]) for pitch in pitch_occurrences_fitness}
pitch_mean = {pitch: np.mean(pitch_occurrences_fitness[pitch]) for pitch in pitch_occurrences_fitness}

## Finally, give each instrument a rank from 1 to n_instruments, based on their mean distances (smallest = rank 1, highest = rank n_instruments)
# instrument_sort = np.argsort([instrument_mean[instrument] for instrument in instrument_mean])
instrument_sort = {k: v for k, v in sorted(instrument_mean.items(), key=lambda item: item[1])}
instrument_ranks = {instrument: i + 1 for i, instrument in enumerate(instrument_sort)}

# Create feature vector
instrument_features = []
for instrument_info in lib.instruments:
    instr_name = instrument_info.name
    if instr_name in instrument_ranks:
        instrument_features.append([instrument_min[instr_name], instrument_mean[instr_name], instrument_max[instr_name], instrument_ranks[instr_name]])
    else:
        instrument_features.append([np.inf, np.inf, np.inf, np.inf])
pitch_features = []
for pitch in lib.pitches:
    if pitch in pitch_min:
        pitch_features.append([pitch_min[pitch], pitch_mean[pitch], pitch_max[pitch]])
    else:
        pitch_features.append([np.inf, np.inf, np.inf])
flat_instr_features = np.array(instrument_features).flatten()
flat_pitch_features = np.array(pitch_features).flatten()
features = np.concatenate((flat_instr_features, flat_pitch_features))
print(f"{len(features)} features generated from {len(lib.instruments)} instruments and {len(lib.pitches)} pitches.")


467 features generated from 50 instruments and 89 pitches.


Single Window

In [6]:
end_offset = 10 * sr
possible_onsets = len(target) - end_offset
window_start = np.random.randint(low=0, high=possible_onsets)
window_end = window_start + end_offset
features = extract_features_for_window(pop, lib, window_start, window_end)
print(f"{len(features)} features generated from {len(lib.instruments)} instruments and {len(lib.pitches)} pitches.")

467 features generated from 50 instruments and 89 pitches.


Multiple Windows

In [4]:
features = extract_features_for_windows(pop=pop, lib=lib, window_lengths=[3, 10], n_total_samples=len(target), sr=sr)
print(f"{len(features)} features generated from {len(lib.instruments)} instruments and {len(lib.pitches)} pitches.")

934 features generated from 50 instruments and 89 pitches.


In [2]:
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]]))

[1]


In [5]:
X = [[0], [1], [2]]
y = ["a", "b", "c"]
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[2]]))

['c']


### Training of Random Forests from Saved Populations

In [2]:
lib = SampleLibrary()

Loading samples: 100%|██████████| 6826/6826 [00:11<00:00, 616.54it/s]


For 10k generations

In [3]:
from glob import glob
import os

from tqdm import tqdm
# Load populations, Load Songs
all_popfiles = glob("./experiments/1517_artists/300_1_10000_0.05_5_10_1_20_0.9954_15_1sec/*.pkl")
popfiles = [file.replace("Hip-Hop", "Hip_Hop") for file in all_popfiles if "500gens" not in file and ".logger" not in file]
labels = [os.path.basename(file.split("-")[0]) for file in popfiles]
songnames = [file.split("-", 1)[1].split(".")[0] for file in popfiles]
# all_soundfiles = glob("./audio/1517-Artists/**/*.mp3", recursive=True)
# soundfiles = []
# for name in songnames:
#     for file in all_soundfiles:
#         if name in file:
#             soundfiles.append(file)
#             break
# lib = SampleLibrary()
pops = [Population.from_file(popfile.replace("Hip_Hop", "Hip-Hop")) for popfile in popfiles]
# songs = [librosa.load(soundfile)[0] for soundfile in soundfiles]
sr = 22050

# Separate the archive into 10s and 3s chunks

# Create feature vectors for pops for windows of 10s and 3s across the song
def pop_to_window_vectors(pop:Population, song_length:int, window_length_s:int, lib:SampleLibrary, sr:int=22050):
    window_length = sr*window_length_s
    window_indices = list(range(0, song_length, window_length))
    all_features = [extract_features_for_window(
            pop=pop, lib=lib, 
            window_start=window_idx, 
            window_end=min(window_idx+window_length, song_length)) 
        for window_idx in window_indices]
    return np.array(all_features)

# features_10s = pop_to_window_vectors(pop=pop, song_length=len(song), window_length_s=10, lib=lib, sr=sr)
# features_3s = pop_to_window_vectors(pop=pop, song_length=len(song), window_length_s=3, lib=lib, sr=sr)

# Divide songs into 4s windows with 2s overlap
def get_features_for_time_window(features, feature_window_length_s, window_start_s, window_end_s):
    # Calc in which window we start
    start_idx = window_start_s // feature_window_length_s
    # Calc in which window we end
    end_idx = window_end_s // feature_window_length_s
    # Get all features in between
    window_features = features[start_idx:(end_idx+1)]
    # Calc mean between feature vectors
    return np.mean(window_features, axis=0)

# song_feature_matrix = np.array([np.concatenate([
#     get_features_for_time_window(features_10s, 10, i, i+4), 
#     get_features_for_time_window(features_3s, 3, i, i+4)]) 
#     for i in range(0, len(song), sr*2)])

# # Create train and test sets
# X = song_feature_matrix
# y = np.repeat(labels[0], len(X))
# # Train Random Forests with 100 trees
# song_feature_matrix.shape

FileNotFoundError: [Errno 2] No such file or directory: './experiments/1517_artists/300_1_10000_0.05_5_10_1_20_0.9954_15_1sec\\Hip-Hop-Main_Flow-Hip-Hop_Worth_Dying_For_Featuring_Talib_Kweli.pkl'

### Feature Generation Wrapper Functions

In [4]:
def pop_to_window_vectors(pop:Population, song_length:int, window_length_s:int, lib:SampleLibrary, sr:int=22050):
    window_length = sr*window_length_s
    window_indices = list(range(0, song_length, window_length))
    all_features = [extract_features_for_window(
            pop=pop, lib=lib, 
            window_start=window_idx, 
            window_end=min(window_idx+window_length, song_length)) 
        for window_idx in window_indices]
    return np.array(all_features)

def get_features_for_time_window(features, feature_window_length_s, window_start_s, window_end_s):
    # Calc in which window we start
    start_idx = window_start_s // feature_window_length_s
    # Calc in which window we end
    end_idx = window_end_s // feature_window_length_s
    # Get all features in between
    window_features = features[start_idx:(end_idx+1)]
    # Calc mean between feature vectors
    return np.mean(window_features, axis=0)

def get_xy_for_pop(pop:Population, label:str, song_length:int, sr:int=22050):
    features_10s = pop_to_window_vectors(pop=pop, song_length=song_length, window_length_s=10, lib=lib, sr=sr)
    features_3s = pop_to_window_vectors(pop=pop, song_length=song_length, window_length_s=3, lib=lib, sr=sr)
    song_feature_matrix = np.array([np.concatenate([
        get_features_for_time_window(features_10s, 10, i, i+4), 
        get_features_for_time_window(features_3s, 3, i, i+4)]) 
        for i in range(0, int(song_length/sr), 2)])
    return song_feature_matrix, np.repeat(label, len(song_feature_matrix))

In [9]:
# Get Population files and correct song labels
# all_popfiles = glob("./experiments/1517_artists/300_1_10000_0.05_5_10_1_20_0.9954_15_1sec/*.pkl")
# Initial Populations (0 Generations)
all_popfiles = glob("./experiments/1517_artists/300_1_0_0.05_5_10_1_20_0.9954_15_1sec/*.pkl")
# For 500 Generations
# popfiles = [file.replace("\\Hip_Hop-", "\\Hip-Hop-") for file in all_popfiles if "500gens" in file and ".logger" not in file]
# For 10k Generations
popfiles = [file.replace("\\Hip_Hop-", "\\Hip-Hop-") for file in all_popfiles if "500gens" not in file and ".logger" not in file]
labels = [os.path.basename(file.split("-")[0]) for file in popfiles]
songnames = [file.split("-", 1)[1].split(".")[0] for file in popfiles]
sr = 22050
# Split songs into train/test sets so that we can later get errors per genre label
# Get songs per label
songs_per_label = {label: 0 for label in set(labels)}
for i in range(len(popfiles)):
    songs_per_label[labels[i]] += 1
# Shuffle
rnd_idx = list(range(len(popfiles)))
np.random.shuffle(rnd_idx)
# 80/20 Train/test split
popfiles_train = np.take(popfiles, rnd_idx[:int(0.8*len(rnd_idx))])
popfiles_test = np.take(popfiles, rnd_idx[int(0.8*len(rnd_idx)):])
labels_train = np.take(labels, rnd_idx[:int(0.8*len(rnd_idx))])
labels_test = np.take(labels, rnd_idx[int(0.8*len(rnd_idx)):])


In [10]:
# Build X, y
x_train = []
y_train = []
# Train Set
for i, popfile in enumerate(tqdm(popfiles_train)):
    pop = Population.from_file(popfile, expand=False)
    label = labels_train[i]
    song_length = max(pop.archive.keys()) + 1
    x, y_labels = get_xy_for_pop(pop=pop, label=label, song_length=song_length, sr=sr)
    x_train.append(x)
    y_train.append(y_labels)

x_train = np.concatenate(x_train)
y_train = np.concatenate(y_train)

100%|██████████| 2453/2453 [02:27<00:00, 16.59it/s]


In [11]:
from sklearn.utils import shuffle
x_train, y_train = shuffle(x_train, y_train, random_state=0)

In [12]:
clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=0)
clf.fit(x_train, y_train)

In [13]:
# Split Test Set by Song, Label
x_test_by_song = []
y_test_by_song = []
for i, popfile in enumerate(tqdm(popfiles_test)):
    pop = Population.from_file(popfile, expand=False)
    label = labels_test[i]
    song_length = max(pop.archive.keys()) + 1
    x, y_labels = get_xy_for_pop(pop=pop, label=label, song_length=song_length, sr=sr)
    x_test_by_song.append(x)
    y_test_by_song.append(y_labels)

100%|██████████| 614/614 [00:36<00:00, 16.79it/s]


In [14]:
# Majority Voting across the windows of a piece
def majority_voting(votes):
    counts = {vote: 0 for vote in set(votes)}
    for vote in votes:
        counts[vote] += 1
    return list(counts.keys())[np.argmax(list(counts.values()))]

In [15]:
# Calculate errors for entire songs, per label
errors_per_label = {label: [] for label in labels}
for i, song_x in enumerate(tqdm(x_test_by_song)):
    song_y = y_test_by_song[i]
    song_label = y_test_by_song[i][0]
    votes = clf.predict(song_x)
    vote_winner = majority_voting(votes)
    if vote_winner == song_label:
        errors_per_label[song_label].append(0)
    else:
        errors_per_label[song_label].append(1)

100%|██████████| 614/614 [00:05<00:00, 122.63it/s]


In [16]:
mean_errors_per_label = {label: np.mean(errors) for label, errors in errors_per_label.items()}
mean_errors_per_label

{'Alternative_and_Punk': 0.8,
 'Blues': 0.972972972972973,
 'Childrenss': 1.0,
 'Classical': 0.2777777777777778,
 'Comedy_and_Spoken_Word': 1.0,
 'Country': 1.0,
 'Easy_Listening_and_Vocals': 1.0,
 'Electronic_and_Dance': 0.42857142857142855,
 'Folk': 0.8888888888888888,
 'Hip': 0.9655172413793104,
 'Jazz': 0.696969696969697,
 'Latin': 1.0,
 'New_Age': 0.8275862068965517,
 'Reggae': 1.0,
 'Religious': 1.0,
 'Rock_and_Pop': 0.7142857142857143,
 'R_and_B_and_Soul': 0.8846153846153846,
 'Soundtracks_and_More': 1.0,
 'World': 1.0}

In [17]:
mean_error_majority_votes = np.mean(list(mean_errors_per_label.values()))
mean_error_majority_votes

0.8661676480188277

Errors per window, irrespective of song

In [18]:
x_test = np.concatenate(x_test_by_song)
y_test = np.concatenate(y_test_by_song)
predictions = clf.predict(x_test)
errors = [1*(not predictions[i] == y_test[i]) for i in range(len(predictions))]
np.mean(errors)

0.8714480201604727