In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import utils

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def get_matrices(f,g):
    if len(g.shape) == 1:
        return np.outer(g,f)
    else:
        A = []
        for i in range(g.shape[0]):
            A.append(np.outer(g[i],f[i]))
        return np.array(A)
    
def matrix_sum(Ai_s):
    if len(Ai_s.shape) == 2:
        return Ai_s
    
    A = np.array([])
    for i, Ai in enumerate(Ai_s):
        if i == 0:
            A = Ai
        else:
            A = A + Ai
    return A

def matrix_vector_product(A,f):
    if len(f.shape) == 1:
        return np.dot(A,f)
    g_prime = []
    for fi in f:
        g_prime.append(np.dot(A,fi))
    return np.array(g_prime)

def normalization(vector):
    unit_vector = vector/np.sqrt(np.sum(vector**2))
    return unit_vector

def cosine(a, b):
    dot_product = np.dot(a, b)

    magnitude_a = np.linalg.norm(a)
    magnitude_b = np.linalg.norm(b)

    cosine_theta = dot_product / (magnitude_a * magnitude_b)
    return cosine_theta

def mean_length(vectors):
    if len(vectors.shape) == 1:
        return np.linalg.norm(vectors)
    len_list = np.array([])
    for vector in vectors:
        len_list = np.hstack((len_list, np.linalg.norm(vector)))
    
    return np.mean(len_list)

def get_accuracy(target,predicted):
    correct = 0
    for i in range(target.shape[0]):
        if np.array_equal(target[i], predicted[i]):
            correct+=1
    accuracy = correct/target.shape[0]
    return accuracy

This is the Widrow-Hoff algorithm, and it is used to train the model.

In [3]:
def wh_algo(f, g, A):
    epsilon = 1/1000
    j = 0
    not_converge = True
    error_array = np.array([])
    suffle = np.arange(f.shape[0])
    while not_converge:
        
        if j%f.shape[0] == 0:
            np.random.shuffle(suffle)
        
        if len(error_array) == 0:
            prev_error = mean_length(g - matrix_vector_product(A,f))
            error_array = np.append(error_array, prev_error)
        
        i = j%f.shape[0] 
        k = (1/np.linalg.norm(f[i])-epsilon)/(j+1)
        g_prime_i = np.dot(A,f[i])
        error = g[i] - g_prime_i
        delta_A = np.multiply(k,np.outer(error,f[i]))
       # print(delta_A)
        A = A + delta_A
        A_prime_f = matrix_vector_product(A,f)
        
        current_error = mean_length(g - A_prime_f)
        error_array = np.append(error_array, current_error)
        
        if len(error_array)>50:
            error_percent = np.abs(1-(error_array[-1]/error_array[-50]))
            if error_percent < 0.05:
                not_converge = False
                
        j = j+1
        
    return A

The function `figure_genre` interprets the predicted genre, and `get_genre_list` returns an array of genres.  

In [4]:
def figure_genre(A, f):
    
    classical = abs(cosine(np.dot(A,f), np.array([1,0])))
    metal = abs(cosine(np.dot(A,f), np.array([0,1])))
    
    if classical > metal:
        return "classical", np.array([1,0])
    else:
        return "metal", np.array([0,1])
    
def get_genre_list(A, X):
    genre_list = []
    for x in X:
        genre, vector = figure_genre(A, x)
        genre_list.append(vector)
    genre_list = np.array(genre_list)
    
    return genre_list

# Data Processing

In [5]:
df = pd.read_csv("./data/features_3_sec.csv")
df

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.241280,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.001450,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.030830,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.004620,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.768610,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,rock.00099.5.wav,66149,0.349126,0.080515,0.050019,0.000097,1499.083005,164266.886443,1718.707215,85931.574523,...,42.485981,-9.094270,38.326839,-4.246976,31.049839,-5.625813,48.804092,1.818823,38.966969,rock
9986,rock.00099.6.wav,66149,0.372564,0.082626,0.057897,0.000088,1847.965128,281054.935973,1906.468492,99727.037054,...,32.415203,-12.375726,66.418587,-3.081278,54.414265,-11.960546,63.452255,0.428857,18.697033,rock
9987,rock.00099.7.wav,66149,0.347481,0.089019,0.052403,0.000701,1346.157659,662956.246325,1561.859087,138762.841945,...,78.228149,-2.524483,21.778994,4.809936,25.980829,1.775686,48.582378,-0.299545,41.586990,rock
9988,rock.00099.8.wav,66149,0.387527,0.084815,0.066430,0.000320,2084.515327,203891.039161,2018.366254,22860.992562,...,28.323744,-5.363541,17.209942,6.462601,21.442928,2.354765,24.843613,0.675824,12.787750,rock


In [6]:
# Data cleaning
df = df.drop(columns=["length", "filename"])

In [7]:
# Select only the metal and classical genres
filtered_df = df[df["label"].isin(["metal", "classical"])]
filtered_df.reset_index(inplace=True, drop = True)

In [8]:
filtered_df[filtered_df["label"] == "classical"].shape

(998, 58)

In [9]:
filtered_df[filtered_df["label"] == "metal"].shape

(1000, 58)

There are classical 998 samples and 1000 samples for metal.

# Feature Selection

In [10]:
# Get the mean of each feature
filtered_average_group = filtered_df.groupby('label').mean()
filtered_average_group

Unnamed: 0_level_0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
classical,0.264508,0.083363,0.042812,0.000403,1351.320102,70830.230588,1519.531577,47788.745702,2475.817622,363544.530781,...,1.520216,53.650671,-0.181774,55.738733,0.828762,57.036661,-1.210227,65.911659,-0.980228,77.626622
metal,0.480162,0.07237,0.153454,0.00106,2600.881732,183577.335324,2242.819811,58359.357255,5123.753216,654842.379673,...,5.801267,34.106649,-8.284396,35.894936,3.206265,33.639106,-5.025991,34.254769,1.561031,30.856026


We want to train the linear associator with the most distinct values between the two genres. To do this, we first get the variance-to-mean ratio (coefficient of variation) between classical and metal for each feature and extract 10 features with the largest values.

In [11]:
std = filtered_average_group.std()

# Calculate the percentage variance for each feature
percentage_std = abs(std / filtered_average_group.mean()) * 100

# Find the feature with the largest percentage variance
largest_percentage_std_feature = percentage_std.nlargest(10)
largest_percentage_std_value = percentage_std.max()

print("Feature with the largest percentage variance:")
print(largest_percentage_std_feature)

Feature with the largest percentage variance:
mfcc20_mean      618.777666
mfcc14_mean      137.577395
mfcc17_mean      135.348546
mfcc12_mean      132.077254
perceptr_var     131.637828
mfcc8_mean       126.376826
mfcc15_mean      111.463783
mfcc10_mean      104.197285
mfcc6_mean        99.039036
perceptr_mean     97.946495
dtype: float64


Next, we examine if the means are separated enough by inspecting the means and the standard deviations of these ten features to make sure the genres don't overlap much.

In [12]:
test_labels = ["classical", "metal"]
for feature in largest_percentage_std_feature.keys():
    for test_label in test_labels:
        print(f"{test_label}: {feature}")
        print("Std:", df[df['label']==test_label][feature].std())
        print("Mean:", df[df['label']==test_label][feature].mean(),"\n")
        #print("Ratio:", df[df['label']==test_label][feature].std()/df[df['label']==test_label][feature].mean(),"\n")

classical: mfcc20_mean
Std: 7.124793816951474
Mean: -0.9802276700481265 

metal: mfcc20_mean
Std: 4.268618542227908
Mean: 1.5610312489899807 

classical: mfcc14_mean
Std: 7.330789611008621
Mean: 0.09491106642891685 

metal: mfcc14_mean
Std: 4.715901592486571
Mean: 6.888745495715702 

classical: mfcc17_mean
Std: 6.161585740699798
Mean: -0.18177397555871336 

metal: mfcc17_mean
Std: 3.498086617114736
Mean: -8.284395897746085 

classical: mfcc12_mean
Std: 8.272072056731558
Mean: 0.46562456409032793 

metal: mfcc12_mean
Std: 4.4801037251050415
Mean: 13.628667895764112 

classical: perceptr_var
Std: 0.000540496036199451
Mean: 0.0002116816999566407 

metal: perceptr_var
Std: 0.004543233171184228
Mean: 0.0059080561198497404 

classical: mfcc8_mean
Std: 11.399821130209341
Mean: 1.2995191599080949 

metal: mfcc8_mean
Std: 6.04771730127954
Mean: 23.131920550107957 

classical: mfcc15_mean
Std: 6.316346617162824
Mean: -1.08080814355137 

metal: mfcc15_mean
Std: 3.9632423539752457
Mean: -9.1235799

We chose to use these features: mfcc12_mean, mfcc17_mean, mfcc8_mean, perceptr_var, mfcc15_mean, and mfcc10_mean.

In [13]:
selected_features = ["mfcc12_mean",
                     "mfcc17_mean",
                     "mfcc8_mean",
                     "mfcc15_mean",
                     "mfcc10_mean"]

filtered_features = filtered_df[selected_features]

filtered_features.reset_index(inplace=True, drop = True)

In [14]:
# Making the representations for target genres 
classical_label = [[1,0]]*998
metal_label = [[0,1]]*1000
label_representaion = np.array(classical_label + metal_label)

In [15]:
print(label_representaion.shape)
print(filtered_features.shape)

(1998, 2)
(1998, 5)


In [16]:
# Creating Train/Test Split, and we use 40 data to train the model
X_train, X_test, y_train, y_test = train_test_split(filtered_features, 
                                                    label_representaion, 
                                                    test_size=0.9795, 
                                                    random_state = 79)


In [17]:
# Normalize train data
normalized_X_train = []
for X in X_train.values:
    normalized_X = normalization(X)
    normalized_X_train.append(normalized_X)
normalized_X_train = np.array(normalized_X_train)

normalized_y_train = []
for y in y_train:
    normalized_y = normalization(y)
    normalized_y_train.append(normalized_y)
normalized_y_train = np.array(normalized_y_train)

In [18]:
X_train.shape

(40, 5)

In [19]:
X_test.shape

(1958, 5)

# Model Training

In [20]:
# Obtain the initial weight matrix
Ai_s = get_matrices(normalized_X_train, y_train)
A = matrix_sum(Ai_s)
A.shape

(2, 5)

In [21]:
trained_A = wh_algo(normalized_X_train, y_train, A)

In [22]:
# Interpret the predicted outcomes to genres
pre_train_genre = get_genre_list(A, normalized_X_train)
trained_genre = get_genre_list(trained_A, normalized_X_train)
test_genre = get_genre_list(trained_A, X_test.values)

In [23]:
# Pre-train accuracy
get_accuracy(y_train, pre_train_genre)

0.55

In [24]:
# Trained accuracy
get_accuracy(y_train, trained_genre)

0.875

In [25]:
# Test accuracy
get_accuracy(y_test, test_genre)

0.7854954034729316

# Test with Modified Audio Files

In [26]:
extractor = utils.FeatureExtractor()

In [27]:
distorted_classical_00005 = extractor.process_music("distorted_classical.00005.wav",15,3)[selected_features]
distorted_classical_00030 = extractor.process_music("distorted_classical.00030.wav",15,3)[selected_features]
distorted_classical_00048 = extractor.process_music("distorted_classical.00048.wav",15,3)[selected_features]
reversed_classical_00005 = extractor.process_music("reversed_classical.00005.wav",12,3)[selected_features]
reversed_classical_00030 = extractor.process_music("reversed_classical.00030.wav",12,3)[selected_features]
reversed_classical_00048 = extractor.process_music("reversed_classical.00048.wav")[selected_features]

distorted_metal_00000 = extractor.process_music("distorted_metal.00000.wav",15,3)[selected_features]
distorted_metal_00001 = extractor.process_music("distorted_metal.00001.wav",15,3)[selected_features]
distorted_metal_00014 = extractor.process_music("distorted_metal.00014.wav",15,3)[selected_features]
reversed_metal_00001 = extractor.process_music("reversed_metal.00001.wav",12,3)[selected_features]
reversed_metal_00001 = extractor.process_music("reversed_metal.00001.wav",12,3)[selected_features]
reversed_metal_00014 = extractor.process_music("reversed_metal.00014.wav",12,3)[selected_features]

Distorted classical audio files are often misidentified as metal because of the resemblance of waveforms between distorted effect and metal. On the other hand, reversal did not change the characteristics of the original waveform. Thereofore, the audio can be correctly categorized.

In [28]:
print(figure_genre(trained_A, distorted_classical_00005.values[0])[0])
print(figure_genre(trained_A, distorted_classical_00030.values[0])[0])
print(figure_genre(trained_A, distorted_classical_00048.values[0])[0])
print(figure_genre(trained_A, reversed_classical_00005.values[0])[0])
print(figure_genre(trained_A, reversed_classical_00030.values[0])[0])
print(figure_genre(trained_A, reversed_classical_00048.values[0])[0])

classical
metal
metal
classical
classical
classical


In [29]:
print(figure_genre(trained_A, distorted_metal_00000.values[0])[0])
print(figure_genre(trained_A, distorted_metal_00001.values[0])[0])
print(figure_genre(trained_A, distorted_metal_00014.values[0])[0])
print(figure_genre(trained_A, reversed_metal_00001.values[0])[0])
print(figure_genre(trained_A, reversed_metal_00001.values[0])[0])
print(figure_genre(trained_A, reversed_metal_00014.values[0])[0])

metal
metal
metal
metal
metal
metal
