In [23]:
import glob
import numpy as np
import librosa
from librosa.feature import zero_crossing_rate, mfcc, spectral_centroid, spectral_rolloff, spectral_bandwidth, rms
import os
from scipy import signal
import warnings
import re
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import numpy as np


In [1]:
# Constants
DEFAULT_SAMPLE_RATE = 44100
DEFAULT_FRAME_SIZE = 512

# initialize empty array for features
X = np.empty([0, 18])

# labels
y = []

def offset_audio_file(audio_file, range_size=5, sr=DEFAULT_SAMPLE_RATE):
    play_list = list()
    for offset in range(range_size):
        audio_data, sr_res = librosa.load(audio_file, sr=sr, mono=True, offset=offset, duration=5.0)
        play_list.append(audio_data)
    return play_list


def get_features(audio_data, sr=DEFAULT_SAMPLE_RATE, filter=None):

    # Teste filtrando as frequências de fala
    if filter:
        audio_data, sr = filter(audio_data, sr)
   
    zcr_feat = zero_crossing_rate(y=audio_data, hop_length=DEFAULT_FRAME_SIZE)
    rmse_feat = rms(y=audio_data, hop_length=DEFAULT_FRAME_SIZE)
    mfcc_feat = mfcc(y=audio_data, sr=sr, n_mfcc=13)
    spectral_centroid_feat = spectral_centroid(y=audio_data, sr=sr, hop_length=DEFAULT_FRAME_SIZE)
    spectral_rolloff_feat = spectral_rolloff(y=audio_data, sr=sr, hop_length=DEFAULT_FRAME_SIZE, roll_percent=0.90)
    spectral_bandwidth_feat = spectral_bandwidth(y=audio_data, sr=sr, hop_length=DEFAULT_FRAME_SIZE)

    concat_feat = np.concatenate((zcr_feat,
                                    rmse_feat,
                                    mfcc_feat,
                                    spectral_centroid_feat,
                                    spectral_rolloff_feat,
                                    spectral_bandwidth_feat
                                    ), axis=0)

    mean_feat = np.mean(concat_feat, axis=1, keepdims=True).transpose()
    return mean_feat

for audio_file in glob.glob("../baby_cry_detection/data/**/*.*"):
    ### TODO: Checar se tem que usar todo o offset
    mean_feat = get_features(offset_audio_file(audio_file, 1)[0])

    X = np.concatenate((X, mean_feat), axis=0)

    label = os.path.dirname(audio_file)
    y.append(label)

X

array([[ 5.28295389e-02,  1.16727541e-01, -3.25538496e+02, ...,
         2.48844691e+03,  6.36518487e+03,  3.08793732e+03],
       [ 1.07062745e-01,  5.35926912e-02, -3.41089369e+02, ...,
         3.55117010e+03,  7.64853380e+03,  2.79478271e+03],
       [ 6.19176878e-02,  6.49518877e-02, -2.53313343e+02, ...,
         2.69599891e+03,  6.27470545e+03,  2.87222626e+03],
       ...,
       [ 1.20738018e-01,  3.44135986e-02, -3.14948964e+02, ...,
         3.45249575e+03,  7.49455391e+03,  2.73159057e+03],
       [ 1.18918576e-01,  2.58630056e-02, -3.49385135e+02, ...,
         3.38067918e+03,  7.43440083e+03,  2.73095362e+03],
       [ 1.24057425e-01,  2.66647073e-02, -3.53010673e+02, ...,
         3.44359460e+03,  7.43185282e+03,  2.71639762e+03]])

In [2]:
def perf(y_test, y_pred):
    acc = {'accuracy': accuracy_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred, average='macro'),
            'precision': precision_score(y_test, y_pred, average='macro'),
            'f1': f1_score(y_test, y_pred, average='macro'),
            # 'summary': classification_report(y_test, y_pred)
            }
    return acc

class TrainClassifier:
    """
    Class to train a classifier of audio signals
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def train(self):
        """
        Train Random Forest

        :return: pipeline, best_param, best_estimator, perf
        """

        # Split into training and test set
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                            test_size=0.25,
                                                            random_state=0,
                                                            stratify=self.y)

        pipeline = Pipeline([
            ('scl', StandardScaler()),
            # ('lda', LinearDiscriminantAnalysis()),
            ('clf', SVC(probability=True))
        ])

        # GridSearch
        param_grid = [{'clf__kernel': ['linear', 'rbf'],
                       'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
                       'clf__gamma': np.logspace(-2, 2, 5),
                       # 'lda__n_components': range(2, 17)
                       }]

        estimator = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')

        model = estimator.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        acc = perf(y_test, y_pred)

        return acc, model.best_params_, model.best_estimator_


train_classifier = TrainClassifier(X, y)
performance, parameters, best_estimator = train_classifier.train()

print(performance)
print(parameters)

model = best_estimator

{'accuracy': 0.9629629629629629, 'recall': 0.962962962962963, 'precision': 0.9660714285714286, 'f1': 0.9622966507177034}
{'clf__C': 1, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [24]:
# Bloco de apoio pra conversão do Sklearn pro Torch pra rodar no dispositivo móvel.

import sk2torch
import torch
import torch.jit
import re

# Torch precisa de y inteiro
r = re.compile('Crying baby')
y_to = [ (1 if r.search(x) else 0) for x in y ]

train_classifier_to = TrainClassifier(X, y_to)
performance_to, parameters_to, best_estimator_to = train_classifier_to.train()

torch_model = sk2torch.wrap(best_estimator_to)

torch.jit.script(torch_model).save("../app_v1/assets/models/sample1.pt")


Unnamed: 0,file,part,features,result


In [33]:
# Dados pra teste do modelo Torch no Android
results = []
for audio_file in glob.glob('my_data/*.ogg'):
    # https://stackoverflow.com/questions/41146759/check-sklearn-version-before-loading-model-using-joblib
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        
    pd.options.display.max_colwidth = 1000

    audio_part = 0
    for audio_data in offset_audio_file(audio_file):
        audio_part = audio_part + 1
        features = get_features(audio_data)
        result = best_estimator_to.predict(features)
        results.append([os.path.basename(audio_file), audio_part, features, result])

pd.DataFrame(results, columns=["file", "part", "features", "result"])



Unnamed: 0,file,part,features,result
0,bebe-brabo.ogg,1,"[[0.08425513884860789, 0.007157335261733136, -544.6787352949174, 90.04269472462948, 27.005171838603275, 16.735045505432616, -7.575647929946951, 6.1577480895889884, -5.510552885844923, 7.774845899326896, -6.985728886153488, 28.159047428803763, 1.7538337366279482, 20.563135138174776, 15.638726289822717, 4236.215382559566, 10818.860797926334, 4575.613928241699]]",[0]
1,bebe-brabo.ogg,2,"[[0.08848539914443156, 0.0042314763261132775, -571.153285192503, 87.04244416274493, 29.83797372936373, 26.956140888124498, -3.3837960125260453, 16.66951503407582, -2.3640472763790608, 16.251995559704, -9.461168996384691, 31.992553281673423, -5.509478468249126, 19.50692504436953, 12.502985830788269, 4789.834259123975, 12294.409802784223, 5036.777157028667]]",[0]
2,bebe-brabo.ogg,3,"[[0.09432438188805105, 0.0026152288636274792, -583.9935496744037, 66.71974784868776, 40.471280088695735, 31.402940569510317, 5.02442548273611, 19.108926489326905, 6.190675983875298, 18.466067627003188, -2.1856191158640415, 28.78130973740686, -2.816222555278487, 18.72916259898387, 11.067213150886538, 5256.178351929832, 13556.425373404873, 5421.542746158444]]",[0]
3,bebe-brabo.ogg,4,"[[0.093248123912413, 0.003475363168183697, -556.3235873689231, 72.21590171336021, 34.8943733120739, 32.29676444486233, 2.4016527761024555, 19.877979822992724, 3.3730397508792977, 19.0509745278104, -3.7527039971055673, 27.58668970785119, -2.9930687660476725, 16.5364059041278, 9.693944237348376, 5057.922205540029, 13328.103474840487, 5312.0724311287595]]",[0]
4,bebe-brabo.ogg,5,"[[0.1041897023636891, 0.0034548881344320017, -532.4526086439943, 70.02801130431988, 20.533173541251024, 34.674429678308435, 4.3502541640795025, 23.956170634023, 4.02419294752267, 22.188594037584807, -2.9960259092829067, 24.722001386076006, -4.6762078073212825, 13.058255686578093, 7.810303989737327, 5106.237937691021, 13152.79022712442, 5147.323186051407]]",[0]
5,bebe-chorando-banheiro.ogg,1,"[[0.09838470671403712, 0.06194486488499726, -284.2549179147955, 90.7430011457191, -48.58120716772887, 28.129970379121897, -12.112669431278158, 13.186070172665014, -10.377997044966559, 6.525407430813097, -6.712600370415264, 18.049913387564107, -11.466323294730861, -0.9137046874399252, 0.5546297992465115, 3517.9014754978857, 8226.932619453306, 3339.5359036901355]]",[1]
6,bebe-chorando-banheiro.ogg,2,"[[0.11057814312645012, 0.07530113516954529, -262.6048180210618, 74.87526413016973, -67.99419888221333, 29.122101231890877, -15.192160737279396, 13.9318322540021, -12.040361121694477, 8.59440400243981, -9.050685437694364, 19.643280636337682, -14.161382625592557, -4.1532564448439055, 0.2546006609834817, 3717.6696970859366, 8315.663405416184, 3237.9746217278066]]",[1]
7,bebe-chorando-banheiro.ogg,3,"[[0.10983609157482599, 0.07023516193538983, -278.65378636052606, 76.92952067337568, -79.80631651139868, 25.19803363582651, -22.514821050822597, 12.83050287450964, -13.629122059541622, 14.111458537542626, -8.306443527859491, 26.17113958869622, -15.95860227264827, -4.14816199865391, 2.891903721401144, 3750.443991241247, 8438.217807424595, 3250.057595848887]]",[1]
8,bebe-chorando-banheiro.ogg,4,"[[0.0959965469112529, 0.07186073555477102, -272.8041924213312, 84.11951746354246, -69.16127971298734, 14.73773888306662, -23.860487569512458, 7.9225418297984485, -17.906643196560115, 10.817874332245983, -2.6080032555935277, 25.40876265551264, -15.60227555078033, -6.6987134924897465, 0.7488582645242563, 3573.226326124119, 8609.034562608758, 3325.368963150297]]",[1]
9,bebe-chorando-banheiro.ogg,5,"[[0.09078179379350348, 0.07250916105808514, -267.6925523585344, 87.57161292675752, -70.11815839259089, 10.908509067897176, -21.735198734946152, 5.414414388812611, -18.854936808330415, 13.666062008511716, -6.567422118738052, 27.154152466083623, -20.489693731380726, -6.096137610326787, 1.7763270675323541, 3494.3938854867483, 8649.353112311484, 3349.1329162984302]]",[1]


In [4]:
# Teste com SR = 8000
X2 = np.empty([0, 18])
SR2 = 8000
for audio_file in glob.glob("../baby_cry_detection/data/**/*.*"):
    ### TODO: Checar se tem que usar todo o offset
    mean_feat2 = get_features(offset_audio_file(audio_file, 1, sr=SR2)[0], SR2)

    X2 = np.concatenate((X2, mean_feat2), axis=0)

X2


array([[ 1.73611798e-01,  1.23150759e-01, -1.99177740e+02, ...,
         9.60125461e+02,  1.90521163e+03,  7.13332409e+02],
       [ 4.52679984e-01,  5.16662143e-02, -2.74948316e+02, ...,
         1.99538283e+03,  3.10690269e+03,  8.79805382e+02],
       [ 2.50828224e-01,  7.16511191e-02, -1.28393078e+02, ...,
         1.46855721e+03,  2.95886076e+03,  9.87784581e+02],
       ...,
       [ 3.29620748e-01,  3.63898407e-02, -2.03492699e+02, ...,
         1.59079472e+03,  2.91821598e+03,  9.08250434e+02],
       [ 3.37680479e-01,  2.78631518e-02, -2.44170320e+02, ...,
         1.56909863e+03,  2.87435720e+03,  8.99650026e+02],
       [ 3.50703372e-01,  2.87499311e-02, -2.48085469e+02, ...,
         1.60195184e+03,  2.88548259e+03,  8.90115800e+02]])

In [5]:
len(X)

432

In [6]:
train_classifier2 = TrainClassifier(X2, y)
performance2, parameters2, best_estimator2 = train_classifier2.train()

print("Teste sr=8000")
print(performance2)
print(parameters2)

model2 = best_estimator2

Teste sr=8000
{'accuracy': 0.9259259259259259, 'recall': 0.9259259259259258, 'precision': 0.935064935064935, 'f1': 0.9249999999999999}
{'clf__C': 10, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [7]:
# Não tá funcionando com mp3 pq não consegui colocar o ffmpeg no env do jupyter. 
# Como contorno converti todos os testes pra ogg via ffmpeg manualmente. Ogg funciona normalmente.

class BabyCryPredictor:
    """
    Class to classify a new audio signal and determine if it's a baby cry
    """

    def __init__(self, model):
        self.model = model

    def classify(self, new_signal):
        """
        Make prediction with trained model

        :param new_signal: 1d array, 34 features
        :return: 1 (it's baby cry); 0 (it's not a baby cry)
        """

        category = self.model.predict(new_signal)

        # category is an array of the kind array(['004 - Baby cry'], dtype=object)
        return self._is_baby_cry(category[0])

    @staticmethod
    def _is_baby_cry(value):
        """
        String analysis to detect if it is the baby cry category
        :param string: output of model prediction as string
        :return: 1 (it's baby cry); 0 (it's not a baby cry)
        """

        # Para o Torch a predição resposta tem que ser int. Daí já deixei a classe preparada.
        if value is int:
            return value

        #print(string)
        match = re.search('Crying baby', value)

        if match:
            return 1
        else:
            return 0


class MajorityVoter:
    """
    Class to make a majority vote over multiple (5 or more? odd number anyway) classifications
    """

    def __init__(self, prediction_list):
        self.predictions = prediction_list

    def vote(self):
        """
        Overall prediction

        :return: 1 if more than half predictions are 1s
        """

        if sum(self.predictions) > len(self.predictions)/2.0:
            return 1
        else:
            return 0

def predict(model, path, sr=DEFAULT_SAMPLE_RATE, filter=None):

    print("Predicting...", sr, path)

    predictor = BabyCryPredictor(model)

    results = []

    for audio_file in glob.glob(path):
        play_list = offset_audio_file(audio_file, sr=sr)
        play_list_processed = list()

        for signal in play_list:
            tmp = get_features(signal, sr=sr, filter=filter)
            play_list_processed.append(tmp)

        # https://stackoverflow.com/questions/41146759/check-sklearn-version-before-loading-model-using-joblib
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)

        predictions = list()

        for signal in play_list_processed:
            tmp = predictor.classify(signal)
            predictions.append(tmp)

        # MAJORITY VOTE

        majority_voter = MajorityVoter(predictions)
        majority_vote = majority_voter.vote()

        results.append((os.path.basename(audio_file), majority_vote, predictions))

    print("Done")

    return pd.DataFrame(results, columns=["file", "result", "partials"])
    #return np.array(results, dtype=[('file', 'U25'), ('after_voting', np.bool_), ('before_voting', np.bool_, (1, 5))])

predict(model, "my_data/*.ogg")


Predicting... 44100 my_data/*.ogg
Done


Unnamed: 0,file,result,partials
0,bebe-brabo.ogg,0,"[0, 0, 0, 0, 0]"
1,bebe-chorando-banheiro.ogg,1,"[1, 1, 1, 1, 1]"
2,bebe-chorando.ogg,1,"[0, 0, 1, 1, 1]"
3,bebe-falando.ogg,0,"[0, 0, 0, 0, 0]"


In [8]:
predict(model2, "my_data/*.ogg", SR2)


Predicting... 8000 my_data/*.ogg
Done


Unnamed: 0,file,result,partials
0,bebe-brabo.ogg,0,"[0, 0, 0, 0, 0]"
1,bebe-chorando-banheiro.ogg,1,"[1, 1, 1, 1, 1]"
2,bebe-chorando.ogg,1,"[1, 1, 1, 1, 1]"
3,bebe-falando.ogg,0,"[0, 0, 0, 0, 0]"


In [9]:
# O corpus é em sr=8000

result_df = predict(model2, "../donateacry-corpus/donateacry_corpus_cleaned_and_updated_data/*/*", SR2)

result_df


Predicting... 8000 ../donateacry-corpus/donateacry_corpus_cleaned_and_updated_data/*/*
Done


Unnamed: 0,file,result,partials
0,549a46d8-9c84-430e-ade8-97eae2bef787-143013077...,0,"[0, 0, 0, 0, 0]"
1,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013849...,0,"[0, 0, 0, 0, 0]"
2,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013850...,0,"[0, 0, 0, 0, 0]"
3,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013851...,0,"[0, 0, 0, 0, 0]"
4,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013852...,1,"[1, 1, 1, 1, 1]"
...,...,...,...
452,d5abedab-9ed5-48d0-a83b-fc00c33d6d6b-143007919...,0,"[0, 0, 0, 0, 0]"
453,d6cda191-4962-4308-9a36-46d5648a95ed-143158788...,1,"[0, 1, 1, 1, 1]"
454,d6cda191-4962-4308-9a36-46d5648a95ed-143158789...,0,"[0, 0, 0, 0, 0]"
455,d6cda191-4962-4308-9a36-46d5648a95ed-143161658...,1,"[1, 1, 1, 1, 1]"


In [10]:
# Coloquei só alguns zeros pra parar o alerta de cálculo viciado (deixando um pouco mais viciado hehehe)
acc = perf(np.concatenate((result_df['result'], [0,0,0,0])), np.concatenate((np.full(len(result_df), 1), [0,0,0,0])))

print(acc)


{'accuracy': 0.42733188720173537, 'recall': 0.5074626865671642, 'precision': 0.7111597374179431, 'f1': 0.3116289592760181}


In [17]:
# Teste com filtro de banda

lowcut = 250.0
highcut = 1000.0

# https://scipy-cookbook.readthedocs.io/items/ButterworthBandpass.html
def bandpass_filter(audio_data, sr):
    order = 3.0  #3, 6 , 9
    # 6 ficou instável

    nyq = 0.5 * float(sr)
    low = lowcut / nyq
    high = highcut / nyq
    b, a = signal.butter(order, [low, high], btype='band')

    y = signal.lfilter(b, a, audio_data)
    
    return y, sr    

X3 = np.empty([0, 18])
y3 = []
for audio_file in glob.glob("../baby_cry_detection/data/**/*.*"):
    ### TODO: Checar se tem que usar todo o offset
    mean_feat = get_features(offset_audio_file(audio_file, 1)[0], filter=bandpass_filter)

    X3 = np.concatenate((X3, mean_feat), axis=0)

    label = os.path.dirname(audio_file)
    y3.append(label)

train_classifier3 = TrainClassifier(X3, y3)
performance3, parameters3, best_estimator3 = train_classifier2.train()

print("Teste sr=", DEFAULT_SAMPLE_RATE, 'filtered')
print(performance3)
print(parameters3)

model3 = best_estimator3



Teste sr= 44100 filtered
{'accuracy': 0.9259259259259259, 'recall': 0.9259259259259258, 'precision': 0.935064935064935, 'f1': 0.9249999999999999}
{'clf__C': 10, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [18]:
# Teste com SR = 8000 e filtro
X4 = np.empty([0, 18])
for audio_file in glob.glob("../baby_cry_detection/data/**/*.*"):
    ### TODO: Checar se tem que usar todo o offset
    mean_feat2 = get_features(offset_audio_file(audio_file, 1, sr=SR2)[0], SR2, filter=bandpass_filter)

    X4 = np.concatenate((X4, mean_feat2), axis=0)

train_classifier4 = TrainClassifier(X4, y3)
performance4, parameters4, best_estimator4 = train_classifier4.train()

print("Teste sr=", SR2, 'filtered')
print(performance4)
print(parameters4)

model4 = best_estimator4

Teste sr= 8000 filtered
{'accuracy': 0.9259259259259259, 'recall': 0.9259259259259259, 'precision': 0.9276656314699794, 'f1': 0.9247527910685805}
{'clf__C': 1, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [19]:
predict(model3, "my_data/*.ogg", filter=bandpass_filter)


Predicting... 44100 my_data/*.ogg
Done


Unnamed: 0,file,result,partials
0,bebe-brabo.ogg,0,"[0, 0, 0, 0, 0]"
1,bebe-chorando-banheiro.ogg,0,"[0, 0, 0, 0, 0]"
2,bebe-chorando.ogg,0,"[0, 0, 0, 0, 0]"
3,bebe-falando.ogg,0,"[0, 0, 0, 0, 0]"


In [20]:
predict(model4, "my_data/*.ogg", SR2, filter=bandpass_filter)


Predicting... 8000 my_data/*.ogg
Done


Unnamed: 0,file,result,partials
0,bebe-brabo.ogg,0,"[0, 0, 0, 0, 0]"
1,bebe-chorando-banheiro.ogg,1,"[1, 1, 1, 1, 1]"
2,bebe-chorando.ogg,1,"[1, 1, 1, 1, 1]"
3,bebe-falando.ogg,0,"[0, 0, 0, 0, 0]"


In [21]:
result_df4 = predict(model4, "../donateacry-corpus/donateacry_corpus_cleaned_and_updated_data/*/*", SR2, filter=bandpass_filter)

result_df4

Predicting... 8000 ../donateacry-corpus/donateacry_corpus_cleaned_and_updated_data/*/*
Done


Unnamed: 0,file,result,partials
0,549a46d8-9c84-430e-ade8-97eae2bef787-143013077...,1,"[1, 1, 1, 1, 0]"
1,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013849...,0,"[0, 0, 0, 0, 0]"
2,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013850...,0,"[0, 0, 0, 0, 1]"
3,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013851...,1,"[1, 1, 1, 1, 1]"
4,643D64AD-B711-469A-AF69-55C0D5D3E30F-143013852...,1,"[1, 1, 1, 1, 0]"
...,...,...,...
452,d5abedab-9ed5-48d0-a83b-fc00c33d6d6b-143007919...,0,"[0, 0, 0, 0, 0]"
453,d6cda191-4962-4308-9a36-46d5648a95ed-143158788...,1,"[0, 0, 1, 1, 1]"
454,d6cda191-4962-4308-9a36-46d5648a95ed-143158789...,1,"[1, 1, 1, 1, 1]"
455,d6cda191-4962-4308-9a36-46d5648a95ed-143161658...,1,"[1, 1, 1, 1, 1]"


In [22]:
# Coloquei só alguns zeros pra parar o alerta de cálculo viciado (deixando um pouco mais viciado hehehe)
acc4 = perf(np.concatenate((result_df4['result'], [0,0,0,0])), np.concatenate((np.full(len(result_df4), 1), [0,0,0,0])))

print(acc4)

# 0,50... na moeda seria mais preciso.


{'accuracy': 0.5097613882863341, 'recall': 0.508695652173913, 'precision': 0.7527352297592997, 'f1': 0.3528498310475055}
