In [17]:
import glob
import numpy as np
import librosa
from librosa.feature import zero_crossing_rate, mfcc, spectral_centroid, spectral_rolloff, spectral_bandwidth, rms
import os
from scipy import signal

# Constants
DEFAULT_SAMPLE_RATE = 44100
DEFAULT_FRAME_SIZE = 512

# initialize empty array for features
X = np.empty([0, 18])

# labels
y = []

def offset_audio_file(audio_file, range_size=5, sr=DEFAULT_SAMPLE_RATE):
    play_list = list()
    for offset in range(range_size):
        audio_data, sr_res = librosa.load(audio_file, sr=sr, mono=True, offset=offset, duration=5.0)
        play_list.append(audio_data)
    return play_list


def get_features(audio_data, sr=DEFAULT_SAMPLE_RATE, filter=None):

    # Teste filtrando as frequências de fala
    if filter:
        audio_data, sr = filter(audio_data, sr)
   
    zcr_feat = zero_crossing_rate(y=audio_data, hop_length=DEFAULT_FRAME_SIZE)
    rmse_feat = rms(y=audio_data, hop_length=DEFAULT_FRAME_SIZE)
    mfcc_feat = mfcc(y=audio_data, sr=sr, n_mfcc=13)
    spectral_centroid_feat = spectral_centroid(y=audio_data, sr=sr, hop_length=DEFAULT_FRAME_SIZE)
    spectral_rolloff_feat = spectral_rolloff(y=audio_data, sr=sr, hop_length=DEFAULT_FRAME_SIZE, roll_percent=0.90)
    spectral_bandwidth_feat = spectral_bandwidth(y=audio_data, sr=sr, hop_length=DEFAULT_FRAME_SIZE)

    concat_feat = np.concatenate((zcr_feat,
                                    rmse_feat,
                                    mfcc_feat,
                                    spectral_centroid_feat,
                                    spectral_rolloff_feat,
                                    spectral_bandwidth_feat
                                    ), axis=0)

    mean_feat = np.mean(concat_feat, axis=1, keepdims=True).transpose()
    return mean_feat

for audio_file in glob.glob("data/**/*.*"):
    ### TODO: Checar se tem que usar todo o offset
    mean_feat = get_features(offset_audio_file(audio_file, 1)[0])

    X = np.concatenate((X, mean_feat), axis=0)

    label = os.path.dirname(audio_file)
    y.append(label)

X

array([[ 5.28295389e-02,  1.16727541e-01, -3.25538496e+02, ...,
         2.48844691e+03,  6.36518487e+03,  3.08793732e+03],
       [ 1.07062745e-01,  5.35926912e-02, -3.41089369e+02, ...,
         3.55117010e+03,  7.64853380e+03,  2.79478271e+03],
       [ 6.19176878e-02,  6.49518877e-02, -2.53313343e+02, ...,
         2.69599891e+03,  6.27470545e+03,  2.87222626e+03],
       ...,
       [ 1.20738018e-01,  3.44135986e-02, -3.14948964e+02, ...,
         3.45249575e+03,  7.49455391e+03,  2.73159057e+03],
       [ 1.18918576e-01,  2.58630056e-02, -3.49385135e+02, ...,
         3.38067918e+03,  7.43440083e+03,  2.73095362e+03],
       [ 1.24057425e-01,  2.66647073e-02, -3.53010673e+02, ...,
         3.44359460e+03,  7.43185282e+03,  2.71639762e+03]])

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import numpy as np

def perf(y_test, y_pred):
    acc = {'accuracy': accuracy_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred, average='macro'),
            'precision': precision_score(y_test, y_pred, average='macro'),
            'f1': f1_score(y_test, y_pred, average='macro'),
            # 'summary': classification_report(y_test, y_pred)
            }
    return acc

class TrainClassifier:
    """
    Class to train a classifier of audio signals
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def train(self):
        """
        Train Random Forest

        :return: pipeline, best_param, best_estimator, perf
        """

        # Split into training and test set
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                            test_size=0.25,
                                                            random_state=0,
                                                            stratify=self.y)

        pipeline = Pipeline([
            ('scl', StandardScaler()),
            # ('lda', LinearDiscriminantAnalysis()),
            ('clf', SVC(probability=True))
        ])

        # GridSearch
        param_grid = [{'clf__kernel': ['linear', 'rbf'],
                       'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
                       'clf__gamma': np.logspace(-2, 2, 5),
                       # 'lda__n_components': range(2, 17)
                       }]

        estimator = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')

        model = estimator.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        acc = perf(y_test, y_pred)

        return acc, model.best_params_, model.best_estimator_


train_classifier = TrainClassifier(X, y)
performance, parameters, best_estimator = train_classifier.train()

print(performance)
print(parameters)

model = best_estimator

{'accuracy': 0.9629629629629629, 'recall': 0.962962962962963, 'precision': 0.9660714285714286, 'f1': 0.9622966507177034}
{'clf__C': 1, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [20]:
import sk2torch
import torch
import torch.jit
import re

r = re.compile('Crying baby')
y_to = [ (1 if r.search(x) else 0) for x in y ]

train_classifier_to = TrainClassifier(X, y_to)
performance_to, parameters_to, best_estimator_to = train_classifier_to.train()

print(performance_to)
print(parameters_to)

torch_model = sk2torch.wrap(best_estimator_to)

torch.jit.script(torch_model).save("E:\\Source\\cryingbaby\\ninando\\assets\\models\\sample1.pt")

(array([0, 1]), array([324, 108], dtype=int64))
{'accuracy': 0.9907407407407407, 'recall': 0.9814814814814814, 'precision': 0.9939024390243902, 'f1': 0.9874985530732724}
{'clf__C': 10, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [21]:
# Teste com SR = 8000
X2 = np.empty([0, 18])
SR2 = 8000
for audio_file in glob.glob("data/**/*.*"):
    ### TODO: Checar se tem que usar todo o offset
    mean_feat2 = get_features(offset_audio_file(audio_file, 1, sr=SR2)[0], SR2)

    X2 = np.concatenate((X2, mean_feat2), axis=0)

X2


array([[ 1.73611798e-01,  1.23150759e-01, -1.99177740e+02, ...,
         9.60125461e+02,  1.90521163e+03,  7.13332409e+02],
       [ 4.52679984e-01,  5.16662143e-02, -2.74948316e+02, ...,
         1.99538283e+03,  3.10690269e+03,  8.79805382e+02],
       [ 2.50828224e-01,  7.16511191e-02, -1.28393078e+02, ...,
         1.46855721e+03,  2.95886076e+03,  9.87784581e+02],
       ...,
       [ 3.29620748e-01,  3.63898407e-02, -2.03492699e+02, ...,
         1.59079472e+03,  2.91821598e+03,  9.08250434e+02],
       [ 3.37680479e-01,  2.78631518e-02, -2.44170320e+02, ...,
         1.56909863e+03,  2.87435720e+03,  8.99650026e+02],
       [ 3.50703372e-01,  2.87499311e-02, -2.48085469e+02, ...,
         1.60195184e+03,  2.88548259e+03,  8.90115800e+02]])

In [24]:
len(X)

432

In [23]:
train_classifier2 = TrainClassifier(X2, y_to)
performance2, parameters2, best_estimator2 = train_classifier2.train()

print("Teste sr=8000")
print(performance2)
print(parameters2)

model2 = best_estimator2

Teste sr=8000
{'accuracy': 0.9537037037037037, 'recall': 0.9197530864197531, 'precision': 0.9553571428571428, 'f1': 0.9358288770053476}
{'clf__C': 100, 'clf__gamma': 0.01, 'clf__kernel': 'rbf'}


In [57]:
# Não tá funcionando com mp3 pq não consegui colocar o ffmpeg no env do jupyter. 
# Como contorno converti todos os testes pra ogg via ffmpeg manualmente. Ogg funciona normalmente.

import warnings
import re
import pandas as pd

class BabyCryPredictor:
    """
    Class to classify a new audio signal and determine if it's a baby cry
    """

    def __init__(self, model):
        self.model = model

    def classify(self, new_signal):
        """
        Make prediction with trained model

        :param new_signal: 1d array, 34 features
        :return: 1 (it's baby cry); 0 (it's not a baby cry)
        """

        category = self.model.predict(new_signal)

        # category is an array of the kind array(['004 - Baby cry'], dtype=object)
        return self._is_baby_cry(category[0])

    @staticmethod
    def _is_baby_cry(string):
        """
        String analysis to detect if it is the baby cry category
        :param string: output of model prediction as string
        :return: 1 (it's baby cry); 0 (it's not a baby cry)
        """

        #print(string)
        match = re.search('Crying baby', string)

        if match:
            return 1
        else:
            return 0


class MajorityVoter:
    """
    Class to make a majority vote over multiple (5 or more? odd number anyway) classifications
    """

    def __init__(self, prediction_list):
        self.predictions = prediction_list

    def vote(self):
        """
        Overall prediction

        :return: 1 if more than half predictions are 1s
        """

        if sum(self.predictions) > len(self.predictions)/2.0:
            return 1
        else:
            return 0

def predict(model, path, sr=DEFAULT_SAMPLE_RATE, filter=None):

    print("Predicting...", sr, path)

    predictor = BabyCryPredictor(model)

    results = []

    for audio_file in glob.glob(path):
        play_list = offset_audio_file(audio_file, sr=sr)
        play_list_processed = list()

        for signal in play_list:
            tmp = get_features(signal, sr=sr, filter=filter)
            play_list_processed.append(tmp)

        # https://stackoverflow.com/questions/41146759/check-sklearn-version-before-loading-model-using-joblib
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)

        predictions = list()

        for signal in play_list_processed:
            tmp = predictor.classify(signal)
            predictions.append(tmp)

        # MAJORITY VOTE

        majority_voter = MajorityVoter(predictions)
        majority_vote = majority_voter.vote()

        results.append((os.path.basename(audio_file), majority_vote, predictions))

    print("Done")

    return pd.DataFrame(results, columns=["file", "result", "partials"])
    #return np.array(results, dtype=[('file', 'U25'), ('after_voting', np.bool_), ('before_voting', np.bool_, (1, 5))])

predict(model, "baby_cry_detection\\prediction_simulation\\*.ogg")


Predicting... 44100 baby_cry_detection\prediction_simulation\*.ogg
Done


Unnamed: 0,file,result,partials
0,brabo-livro.ogg,0,"[0, 0, 0, 0, 0]"
1,choro1.ogg,1,"[1, 1, 1, 1, 1]"
2,grama-falando.ogg,0,"[0, 0, 0, 0, 0]"
3,V_2017-04-01+08_04_36=0_13.ogg,1,"[0, 0, 1, 1, 1]"


In [58]:
predict(model2, "baby_cry_detection\\prediction_simulation\\*.ogg", SR2)


Predicting... 8000 baby_cry_detection\prediction_simulation\*.ogg
Done


Unnamed: 0,file,result,partials
0,brabo-livro.ogg,0,"[0, 0, 0, 0, 0]"
1,choro1.ogg,1,"[1, 1, 1, 1, 1]"
2,grama-falando.ogg,0,"[0, 0, 0, 0, 0]"
3,V_2017-04-01+08_04_36=0_13.ogg,1,"[1, 1, 1, 1, 1]"


In [59]:
# O corpus é em sr=8000

result_df = predict(model2, "..\\donateacry-corpus\\donateacry_corpus_cleaned_and_updated_data\\*\\*", SR2)

result_df


Predicting... 8000 ..\donateacry-corpus\donateacry_corpus_cleaned_and_updated_data\*\*
Done


Unnamed: 0,file,result,partials
0,549a46d8-9c84-430e-ade8-97eae2bef787-143013077...,0,"[0, 0, 0, 0, 0]"
1,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013849...,0,"[0, 0, 0, 0, 0]"
2,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013850...,0,"[0, 0, 0, 0, 0]"
3,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013851...,0,"[0, 0, 0, 0, 0]"
4,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013852...,1,"[1, 1, 1, 1, 1]"
...,...,...,...
452,d5abedab-9ed5-48d0-a83b-fc00c33d6d6b-143007919...,0,"[0, 0, 0, 0, 0]"
453,d6cda191-4962-4308-9a36-46d5648a95ed-143158788...,1,"[0, 1, 1, 1, 1]"
454,d6cda191-4962-4308-9a36-46d5648a95ed-143158789...,0,"[0, 0, 0, 0, 0]"
455,d6cda191-4962-4308-9a36-46d5648a95ed-143161658...,1,"[1, 1, 1, 1, 1]"


In [60]:
# Coloquei só alguns zeros pra parar o alerta de cálculo viciado (deixando um pouco mais viciado hehehe)
acc = perf(np.concatenate((result_df['result'], [0,0,0,0])), np.concatenate((np.full(len(result_df), 1), [0,0,0,0])))

print(acc)


{'accuracy': 0.42733188720173537, 'recall': 0.5074626865671642, 'precision': 0.7111597374179431, 'f1': 0.3116289592760181}


In [62]:
# Teste com filtro de banda

lowcut = 250.0
highcut = 1000.0

# https://scipy-cookbook.readthedocs.io/items/ButterworthBandpass.html
def bandpass_filter(audio_data, sr):
    order = 3.0  #3, 6 , 9
    # 6 ficou instável

    nyq = 0.5 * float(sr)
    low = lowcut / nyq
    high = highcut / nyq
    b, a = signal.butter(order, [low, high], btype='band')

    y = signal.lfilter(b, a, audio_data)
    
    return y, sr    

X3 = np.empty([0, 18])
y3 = []
for audio_file in glob.glob("data/**/*.*"):
    ### TODO: Checar se tem que usar todo o offset
    mean_feat = get_features(offset_audio_file(audio_file, 1)[0], filter=bandpass_filter)

    X3 = np.concatenate((X3, mean_feat), axis=0)

    label = os.path.dirname(audio_file)
    y3.append(label)

train_classifier3 = TrainClassifier(X3, y3)
performance3, parameters3, best_estimator3 = train_classifier2.train()

print("Teste sr=", DEFAULT_SAMPLE_RATE, 'filtered')
print(performance3)
print(parameters3)

model3 = best_estimator3



Teste sr= 44100 filtered
{'accuracy': 0.9259259259259259, 'recall': 0.9259259259259258, 'precision': 0.935064935064935, 'f1': 0.9249999999999999}
{'clf__C': 10, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [71]:
# Teste com SR = 8000 e filtro
X4 = np.empty([0, 18])
for audio_file in glob.glob("data/**/*.*"):
    ### TODO: Checar se tem que usar todo o offset
    mean_feat2 = get_features(offset_audio_file(audio_file, 1, sr=SR2)[0], SR2, filter=bandpass_filter)

    X4 = np.concatenate((X4, mean_feat2), axis=0)

train_classifier4 = TrainClassifier(X4, y3)
performance4, parameters4, best_estimator4 = train_classifier4.train()

print("Teste sr=", SR2, 'filtered')
print(performance4)
print(parameters4)

model4 = best_estimator4

Teste sr= 8000 filtered
{'accuracy': 0.9259259259259259, 'recall': 0.9259259259259259, 'precision': 0.9276656314699794, 'f1': 0.9247527910685805}
{'clf__C': 1, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [74]:
predict(model3, "baby_cry_detection\\prediction_simulation\\*.ogg", filter=bandpass_filter)


Predicting... 44100 baby_cry_detection\prediction_simulation\*.ogg
Done


Unnamed: 0,file,result,partials
0,brabo-livro.ogg,0,"[0, 0, 0, 0, 0]"
1,choro1.ogg,0,"[0, 0, 0, 0, 0]"
2,grama-falando.ogg,0,"[0, 0, 0, 0, 0]"
3,V_2017-04-01+08_04_36=0_13.ogg,0,"[0, 0, 0, 0, 0]"


In [65]:
predict(model4, "baby_cry_detection\\prediction_simulation\\*.ogg", SR2, filter=bandpass_filter)


Predicting... 8000 baby_cry_detection\prediction_simulation\*.ogg
Done


Unnamed: 0,file,result,partials
0,brabo-livro.ogg,0,"[0, 0, 0, 0, 0]"
1,choro1.ogg,1,"[1, 1, 1, 1, 1]"
2,grama-falando.ogg,0,"[0, 0, 0, 0, 0]"
3,V_2017-04-01+08_04_36=0_13.ogg,1,"[1, 1, 1, 1, 1]"


In [66]:
result_df4 = predict(model4, "..\\donateacry-corpus\\donateacry_corpus_cleaned_and_updated_data\\*\\*", SR2, filter=bandpass_filter)

result_df4

Predicting... 8000 ..\donateacry-corpus\donateacry_corpus_cleaned_and_updated_data\*\*
Done


Unnamed: 0,file,result,partials
0,549a46d8-9c84-430e-ade8-97eae2bef787-143013077...,1,"[1, 1, 1, 1, 0]"
1,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013849...,0,"[0, 0, 0, 0, 0]"
2,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013850...,0,"[0, 0, 0, 0, 1]"
3,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013851...,1,"[1, 1, 1, 1, 1]"
4,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013852...,1,"[1, 1, 1, 1, 0]"
...,...,...,...
452,d5abedab-9ed5-48d0-a83b-fc00c33d6d6b-143007919...,0,"[0, 0, 0, 0, 0]"
453,d6cda191-4962-4308-9a36-46d5648a95ed-143158788...,1,"[0, 0, 1, 1, 1]"
454,d6cda191-4962-4308-9a36-46d5648a95ed-143158789...,1,"[1, 1, 1, 1, 1]"
455,d6cda191-4962-4308-9a36-46d5648a95ed-143161658...,1,"[1, 1, 1, 1, 1]"


In [67]:
# Coloquei só alguns zeros pra parar o alerta de cálculo viciado (deixando um pouco mais viciado hehehe)
acc4 = perf(np.concatenate((result_df4['result'], [0,0,0,0])), np.concatenate((np.full(len(result_df4), 1), [0,0,0,0])))

print(acc4)


{'accuracy': 0.5097613882863341, 'recall': 0.508695652173913, 'precision': 0.7527352297592997, 'f1': 0.3528498310475055}
