# Automating with split by song

In [1]:
%matplotlib inline
import random
import sys
import time
from pathlib import Path

import numpy, scipy, matplotlib.pyplot as plt, IPython.display as ipd
import numpy, urllib
import numpy as np
import sklearn, pandas as pd
import sklearn.preprocessing
import librosa, librosa.display
import librosa.feature as lf
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Six zero-dimensional arrays holding 1..6, used to try out the shuffle/split below.
x = [np.array(value) for value in range(1, 7)]

In [None]:
random.shuffle(x)

In [None]:
x

In [None]:
# Hold back a quarter of the items (rounded down) for testing; the rest are for training.
test_test = len(x) // 4
test_train = len(x) - test_test
train, test = x[:test_train], x[test_train:]

In [None]:
train

In [None]:
test

In [2]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'. Use
# whichever spelling this installation lists; if neither is listed, fall back to
# the original name so that a missing style still fails loudly.
_muted_style = next(
    (name for name in ('seaborn-muted', 'seaborn-v0_8-muted') if name in plt.style.available),
    'seaborn-muted',
)
plt.style.use(_muted_style)
plt.rcParams['figure.figsize'] = (14, 5)
plt.rcParams['axes.grid'] = True
plt.rcParams['axes.spines.left'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.bottom'] = False
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.xmargin'] = 0
plt.rcParams['axes.ymargin'] = 0
plt.rcParams['image.cmap'] = 'gray'
# NOTE(review): newer Matplotlib releases may reject a non-string value here;
# confirm whether the string 'none' was intended.
plt.rcParams['image.interpolation'] = None

In [3]:
# Feature extractors applied to every song. Their transposed outputs are stacked
# side by side in this order, so the order fixes the column layout.
# NOTE(review): lf.rms and lf.rmse look like the same feature under a new and an
# old name; confirm, since the installed librosa may not provide both.
fcl = [
    lf.tempogram,
    lf.chroma_stft,
    lf.chroma_cqt,
    lf.chroma_cens,
    lf.melspectrogram,
    lf.mfcc,
    lf.rms,
    lf.rmse,
    lf.spectral_centroid,
    lf.spectral_bandwidth,
    lf.spectral_contrast,
    lf.spectral_flatness,
    lf.spectral_rolloff,
    lf.poly_features,
    lf.tonnetz,
    lf.zero_crossing_rate,
]

In [None]:
# Re-assigns fcl with the same extractors, in the same order, as the earlier cell.
# Their transposed outputs are stacked side by side in this order.
# NOTE(review): lf.rms and lf.rmse look like the same feature under a new and an
# old name; confirm, since the installed librosa may not provide both.
fcl = [
    lf.tempogram,
    lf.chroma_stft,
    lf.chroma_cqt,
    lf.chroma_cens,
    lf.melspectrogram,
    lf.mfcc,
    lf.rms,
    lf.rmse,
    lf.spectral_centroid,
    lf.spectral_bandwidth,
    lf.spectral_contrast,
    lf.spectral_flatness,
    lf.spectral_rolloff,
    lf.poly_features,
    lf.tonnetz,
    lf.zero_crossing_rate,
]

In [4]:
def folder_to_features_separated_song(folder_name):
    """Build one labelled frame-by-feature matrix per mp3 file in a folder.

    Every ``{folder_name}/*.mp3`` file is decoded with ``librosa.load`` and run
    through each extractor in the module-level ``fcl`` list. The transposed
    extractor outputs are stacked side by side in ``fcl`` order (one row per
    frame), and a first column filled with ``folder_name`` is prepended as the
    class label.

    Parameters
    ----------
    folder_name : str
        Folder, relative to the current working directory, that holds the mp3
        files. Its name is also used as the label.

    Returns
    -------
    list of numpy.ndarray
        One 2-D array per song, in sorted path order. Column 0 is the label and
        the remaining columns are the features. Because the label column is a
        string array, stacking it with the features produces a string-typed
        array, so the feature values are stored as strings.
    """
    by_song = []
    # Sort for a reproducible song order, and decode one file at a time so only
    # a single song's audio is held in memory.
    for mp3_path in sorted(Path().glob(f'{folder_name}/*.mp3')):
        song = librosa.load(mp3_path)[0]
        # Stack all extractor outputs in one call rather than relying on
        # lf.tempogram being the first entry of fcl to start the matrix.
        features = np.hstack([f(y=song).T for f in fcl])
        label_array = np.full((features.shape[0], 1), folder_name)
        by_song.append(np.hstack((label_array, features)))
    return by_song

In [5]:
def combine_multiple_folder_sets_separated_song(folder_name_list):
    """Collect the per-song arrays of several folders into one flat list.

    Calls ``folder_to_features_separated_song`` on each folder in turn, appends
    its songs to a single list, and prints the elapsed time after each folder.

    Parameters
    ----------
    folder_name_list : iterable of str
        Folder names to process, in order.

    Returns
    -------
    list
        Every per-song item from every folder, in processing order.
    """
    started_at = time.time()
    songs = []
    for folder in folder_name_list:
        songs.extend(folder_to_features_separated_song(folder))
        elapsed = time.time() - started_at
        print(f'{folder} folder complete at {elapsed} seconds from start')
    return songs

In [6]:
def separate_and_scale_features_and_labels(list_of_data_by_song, frac = 0.05, seed = None):
    """Split songs into train/test sets, separate labels, and standardise features.

    Whole songs are shuffled and the last quarter (rounded down) is held out as
    the test set, so frames of one song never land on both sides of the split.
    Column 0 of every song array is taken as the label and the remaining
    columns as features. The scaler is fitted on the training features only.

    Parameters
    ----------
    list_of_data_by_song : list of numpy.ndarray
        One 2-D array per song, with the label in column 0. The list is not
        modified; a copy is shuffled.
    frac : float, optional
        Unused. Kept so existing calls that pass it keep working.
    seed : int or None, optional
        When given, the shuffle uses its own ``random.Random(seed)`` so the
        split is reproducible. When ``None`` the global ``random`` state is
        used, as before.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``. The label
        arrays keep their ``(n_frames, 1)`` column shape.

    Raises
    ------
    ValueError
        If fewer than 4 songs are supplied, because the test split would be
        empty.
    """
    number_test = len(list_of_data_by_song)//4
    if number_test == 0:
        raise ValueError(
            'need at least 4 songs to hold out a test split, '
            f'got {len(list_of_data_by_song)}'
        )
    number_train = len(list_of_data_by_song) - number_test

    # Shuffle a copy so the caller's list keeps its order.
    songs = list(list_of_data_by_song)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(songs)

    train = np.vstack(songs[:number_train])
    test = np.vstack(songs[number_train:])
    y_train, X_train = np.split(train, [1], axis=1)
    y_test, X_test = np.split(test, [1], axis=1)

    # The label column makes the stacked arrays string-typed, so convert the
    # features to floats explicitly; depending on its version, scikit-learn's
    # input validation warns about or rejects string arrays.
    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    scaler = sklearn.preprocessing.StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

In [None]:
pd.DataFrame(y_test)[0].unique()

In [7]:
def try_sgdc(X_train_scaled, X_test_scaled, y_train, y_test, max_iter = 5):
    """Fit a hinge-loss ``SGDClassifier`` and score it on the test set.

    Parameters
    ----------
    X_train_scaled, X_test_scaled : array-like
        Feature matrices for training and testing.
    y_train, y_test : array-like
        Labels for the matching feature matrix. Both 1-D arrays and
        ``(n, 1)`` column vectors are accepted; they are flattened before use.
    max_iter : int, optional
        Passed to ``SGDClassifier`` as ``max_iter``.

    Returns
    -------
    tuple
        ``(model, score)`` where ``score`` is ``model.score`` on the test set.
    """
    # The split helper in this notebook returns labels as (n, 1) columns;
    # flatten them so the estimator receives 1-D targets.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    model = SGDClassifier(loss='hinge', max_iter = max_iter)
    model.fit(X_train_scaled, y_train)
    return model, model.score(X_test_scaled, y_test)

In [None]:
all_data = combine_multiple_folder_sets_separated_song(['classical_small_test','romantic_small_test','baroque_small_test'])


In [None]:
X_train_scaled, X_test_scaled, y_train, y_test, scaler = separate_and_scale_features_and_labels(all_data)



In [None]:
model, score = try_sgdc(X_train_scaled, X_test_scaled, np.ravel(y_train), np.ravel(y_test), max_iter = 10)

In [None]:
score

In [None]:
print(classification_report(y_test, model.predict(X_test_scaled)))
#All of the test labels ended up being baroque, which is why these metrics look so odd. TODO: the explanation was left unfinished here ("It's probably because ...").

In [None]:
confusion_matrix(y_test, model.predict(X_test_scaled))

In [8]:
def run_sgdc_from_scratch(folder_name_list):
    """Run the whole pipeline: extract features, split and scale, fit, and score.

    Each folder must be named after the label its songs belong to, and must sit
    directly in the directory the notebook was started from.

    Parameters
    ----------
    folder_name_list : list of str
        Folder names, e.g.
        ``['classical_small_test', 'romantic_small_test', 'baroque_small_test']``.

    Returns
    -------
    tuple
        ``(data, X_train_scaled, X_test_scaled, y_train, y_test, scaler, model,
        score)``. ``y_train`` and ``y_test`` are returned exactly as the split
        helper produced them.
    """
    start = time.time()
    data = combine_multiple_folder_sets_separated_song(folder_name_list)
    print(f'folder to data: {time.time()-start}')
    X_train_scaled, X_test_scaled, y_train, y_test, scaler = separate_and_scale_features_and_labels(data)
    print(f'data scaled: {time.time()-start}')
    # The split helper returns labels as (n, 1) columns; hand the classifier
    # flattened views, but return the original arrays unchanged.
    model, score = try_sgdc(X_train_scaled, X_test_scaled, np.ravel(y_train), np.ravel(y_test))
    print(f'total time: {time.time()-start}')
    return data, X_train_scaled, X_test_scaled, y_train, y_test, scaler, model, score

In [None]:

all_data, X_train_scaled, X_test_scaled, y_train, y_test, scaler, model, score = run_sgdc_from_scratch(['classical_small_test','romantic_small_test','baroque_small_test'])



In [None]:
all_data

In [None]:
720/60

In [None]:
score

In [None]:
confusion_matrix(y_test, model.predict(X_test_scaled))

In [None]:

all_data, X_train_scaled, X_test_scaled, y_train, y_test, scaler, model, score = run_sgdc_from_scratch(['classical_small_test','romantic_small_test'])



In [None]:
y_test.shape

In [None]:
y_train.shape

In [None]:

all_data, X_train_scaled, X_test_scaled, y_train, y_test, scaler, model, score = run_sgdc_from_scratch(['piano_classical','piano_romantic'])



In [15]:
test = np.array([3,4])
np.save('test_array_save', test)

In [16]:
test_load = np.load('test_array_save.npy')

In [17]:
test_load

array([3, 4])

In [23]:
test2_load = np.load('piano_classicalqqqbeethoven_hammerklavier_1.npy')


In [33]:
type(test2_load[5][5])

numpy.str_

In [26]:
25000*590

14750000

In [27]:
pd.DataFrame(test2_load).dtypes

0      object
1      object
2      object
3      object
4      object
5      object
6      object
7      object
8      object
9      object
10     object
11     object
12     object
13     object
14     object
15     object
16     object
17     object
18     object
19     object
20     object
21     object
22     object
23     object
24     object
25     object
26     object
27     object
28     object
29     object
        ...  
561    object
562    object
563    object
564    object
565    object
566    object
567    object
568    object
569    object
570    object
571    object
572    object
573    object
574    object
575    object
576    object
577    object
578    object
579    object
580    object
581    object
582    object
583    object
584    object
585    object
586    object
587    object
588    object
589    object
590    object
Length: 591, dtype: object

In [37]:
np.arange(100).reshape(10,10)

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [38]:
np.arange(100).reshape(10,10).mean(axis = 0)

array([45., 46., 47., 48., 49., 50., 51., 52., 53., 54.])

In [20]:
def save_folder_to_features_separated_song(folder_name):
    """Extract features for every mp3 in a folder and save one ``.npy`` per song.

    Each ``{folder_name}/*.mp3`` file is decoded with ``librosa.load`` and run
    through each extractor in the module-level ``fcl`` list. The transposed
    outputs are stacked side by side in ``fcl`` order, a first column filled
    with ``folder_name`` is prepended as the label, and the result is written
    with ``np.save`` under the name ``{folder_name}qqq{song name}`` in the
    current working directory. The elapsed time is printed after each song.

    Parameters
    ----------
    folder_name : str
        Folder, relative to the current working directory, that holds the mp3
        files. Its name is also used as the label and the file-name prefix.

    Returns
    -------
    None
    """
    start = time.time()
    # Walk the folder once so each file name is taken from the same path whose
    # audio is being processed, rather than pairing two separate globs by index.
    # Decoding inside the loop means each song is saved as soon as it is done.
    for mp3_path in sorted(Path().glob(f'{folder_name}/*.mp3')):
        song = librosa.load(mp3_path)[0]
        # Stack all extractor outputs in one call rather than relying on
        # lf.tempogram being the first entry of fcl to start the matrix.
        features = np.hstack([f(y=song).T for f in fcl])
        label_array = np.full((features.shape[0], 1), folder_name)
        total_array = np.hstack((label_array, features))
        # mp3_path.stem is the file name without its directory and extension.
        np.save(f'{folder_name}qqq{mp3_path.stem}', total_array)
        print(f'one song done {time.time()-start}')

In [19]:
def save_multiple_folder_sets_separated_song(folder_name_list):
    """Save per-song feature files for several folders, one folder at a time.

    Calls ``save_folder_to_features_separated_song`` on each folder and prints
    the elapsed time after each one.

    Parameters
    ----------
    folder_name_list : iterable of str
        Folder names to process, in order.

    Returns
    -------
    None
    """
    start = time.time()
    for folder_data in folder_name_list:
        # The per-folder function writes files and has no return statement, so
        # there is nothing to accumulate; extending a list with its result
        # raised TypeError on the first folder.
        save_folder_to_features_separated_song(folder_data)
        print(f'{folder_data} folder complete at {time.time()-start} seconds from start')

In [21]:
save_folder_to_features_separated_song('piano_classical')

one song done 1222.4361310005188
one song done 1248.2360880374908
one song done 1409.7087342739105
one song done 1540.0566132068634
one song done 1607.5129661560059
one song done 1641.8286950588226
one song done 1702.002440214157
one song done 1782.2545731067657
one song done 1862.9447610378265
one song done 1904.0791232585907
one song done 1986.2700290679932
one song done 2049.7962930202484
one song done 2083.528533935547
one song done 2140.050796031952
one song done 2198.057020187378
one song done 2299.8466260433197
one song done 2331.254795074463
one song done 2365.972659111023
one song done 2392.9672832489014
one song done 2451.488550186157
one song done 2506.9496190547943
one song done 2552.033716201782
one song done 2695.8404190540314
one song done 2750.7539920806885
one song done 2829.723039865494
one song done 2914.9297490119934
one song done 2932.4911909103394
one song done 2945.992776155472
one song done 2956.771464109421
one song done 2980.563900232315
one song done 2993.177

In [22]:
5501/60

91.68333333333334