In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import librosa
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Per-class containers, each starting out as its own empty list.
dog_bark, say_go, baby_cry = [], [], []

# Sample rate, in Hz.
sr = 22050

In [None]:
def draw_hist(folder):
    """Plot a 20-bin histogram of clip durations for one folder.

    Every entry of
    ``/kaggle/input/freesound-audio-ds-project/ProcessedAudio/<folder>`` is
    loaded with librosa and its duration is collected. Entries that cannot
    be loaded are reported on stdout and left out of the histogram.

    Parameters
    ----------
    folder : str
        Name of the sub-directory to scan.
    """
    f = os.path.join('/kaggle/input/freesound-audio-ds-project/ProcessedAudio/' ,folder)
    durations = []
    for filename in os.listdir(f):
        path = os.path.join(f, filename)
        try:
            signal, rate = librosa.load(path)
            # Keyword arguments are required here: librosa >= 0.10 rejects
            # positional y/sr, and the old bare `except` hid that failure,
            # silently producing an empty histogram.
            durations.append(librosa.get_duration(y=signal, sr=rate))
        except Exception as err:
            # Best effort: keep going, but say which file was dropped and why.
            print("skipped " + path + ": " + repr(err))
    plt.hist(durations, bins=20)
    plt.show()

In [None]:
# Class names, in a fixed order.
labels = [
    'dog bark',
    'say go',
    'baby cry',
]

In [None]:
def get_dataset(folder):
    """Load every audio file of one class folder and label it.

    Parameters
    ----------
    folder : str
        Sub-directory of ``/kaggle/input/preprocess-audio/``; must be one of
        the entries of the module-level ``labels`` list.

    Returns
    -------
    X : list
        One waveform per successfully loaded file, loaded at the module-level
        sample rate ``sr``.
    ys : list of int
        ``labels.index(folder)``, repeated once per waveform in ``X``.

    Raises
    ------
    ValueError
        If ``folder`` is not listed in ``labels``.
    """
    # Resolve the label once, before touching any file. Doing it inside the
    # try block (after the append to X) meant an unknown folder produced
    # waveforms without labels, leaving X and ys misaligned.
    label = labels.index(folder)
    f = os.path.join('/kaggle/input/preprocess-audio/' ,folder)
    X = []
    ys = []
    # Sorted so that sample order does not depend on the filesystem's
    # arbitrary listing order.
    for filename in sorted(os.listdir(f)):
        try:
            y, _ = librosa.load(os.path.join(f, filename), sr = sr)
        except Exception:
            # Report the unreadable file and move on to the next one.
            print(folder + "/" + filename)
            continue
        X.append(y)
        ys.append(label)
    return X, ys

In [None]:
# Gather the waveforms and integer labels of every class into two flat lists.
X, y = [], []
for folder in labels:
    X_folder, y_folder = get_dataset(folder)
    X.extend(X_folder)
    y.extend(y_folder)

In [None]:
class FixLength(BaseEstimator, TransformerMixin):
    """Pad or trim every waveform to one common duration.

    Parameters
    ----------
    length : int or float
        Target duration in seconds.
    sample_rate : int, optional
        Samples per second of the incoming waveforms. When ``None`` (the
        default) the module-level ``sr`` is used at transform time.
    """
    def __init__(self, length, sample_rate=None):
        # Constructor arguments are stored unmodified: scikit-learn's
        # get_params()/clone() (used by GridSearchCV) reject estimators whose
        # __init__ alters a parameter, which `length * sr` did.
        self.length = length
        self.sample_rate = sample_rate
    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self
    def transform(self, X, y=None):
        """Return a list holding each waveform of ``X`` fixed to the target size.

        ``X`` is an iterable of 1-D waveforms; ``y`` is ignored.
        """
        rate = sr if self.sample_rate is None else self.sample_rate
        # Convert seconds to a whole number of samples.
        size = int(round(self.length * rate))
        # `size` must be passed by keyword: librosa >= 0.10 made it
        # keyword-only, and older releases accept the keyword as well.
        return [librosa.util.fix_length(signal, size=size) for signal in X]

In [None]:
class STFT(BaseEstimator, TransformerMixin):
    """Turn waveforms into flattened, dB-scaled STFT magnitude vectors.

    Parameters
    ----------
    n_fft : int, default=512
        FFT window size passed to ``librosa.stft``.
    hop_length : int, default=1000
        Number of samples between successive frames, passed to
        ``librosa.stft``. NOTE(review): the default is larger than the
        default ``n_fft``, so consecutive frames do not overlap -- confirm
        this is intended.
    """
    def __init__(self, n_fft = 512, hop_length = 1000):
        self.n_fft = n_fft
        self.hop_length = hop_length
    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self
    def transform(self, X, y=None):
        """Return a list with one flattened dB spectrogram per waveform in ``X``.

        ``y`` is ignored.
        """
        res = []
        # The loop variable is not called `y`, so the `y` argument of the
        # transformer signature is no longer overwritten.
        for signal in X:
            magnitude = np.abs(
                librosa.stft(signal, n_fft=self.n_fft, hop_length=self.hop_length)
            )
            res.append(librosa.amplitude_to_db(magnitude).flatten())
        return res

In [None]:
# Transformers used to prepare the features and the labels.
fix_length = FixLength(3)
sfft = STFT()
onehot = OneHotEncoder(handle_unknown='ignore')

# Features: fix every waveform's length, then convert it with the STFT step.
X_processed = sfft.fit_transform(fix_length.fit_transform(X))

# Labels: one-hot encode the integer class ids, passed as a single column.
label_column = np.array(y).reshape(-1, 1)
y_processed = onehot.fit_transform(label_column)
del label_column

In [None]:
# Display the shape of the second processed sample (index 1).
X_processed[1].shape

In [None]:
# Hold out 20% of the samples for validation; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified -- consider stratify=y if the
# classes are imbalanced.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Multi-layer perceptron with tanh activations, optimised with L-BFGS.
# NOTE(review): every entry of hidden_layer_sizes is a hidden layer, so the
# trailing 3 is a 3-unit hidden layer rather than the output layer -- confirm
# this bottleneck is intended.
model = MLPClassifier(
    hidden_layer_sizes=(100, 50, 20, 3),
    activation='tanh',
    solver='lbfgs',
    max_iter=500,
    random_state=0,
    verbose=1,
)

In [None]:
# Fit the classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out validation split.
model.score(X_val, y_val)