In [0]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd
import librosa
import librosa.display
import os
from os import path
import io

# Opening datasets

In [0]:
# Load the file/label table. The first CSV column is used as the index in a
# single read, instead of serialising the frame back to CSV in memory and
# parsing it a second time just to set the index.
complete_sound_df = pd.read_csv(
    '/content/drive/My Drive/DCASE/Datasets/file_label_dcase.csv',
    index_col=0,
)
complete_sound_df.head()

# Getting part of the dataset
Não coloquei 10% dele, pois poderiam vir mais datasets de um label específico. E nem coloquei 10% de cada label, pois a quantidade de datasets dentro de cada label é diferente. Então defini n datasets de cada label.

In [0]:
# Draw a fixed number of rows (currently one) from every label, with a fixed
# seed so the same rows are picked on every run.
df = complete_sound_df.groupby('label', as_index=False).apply(
    lambda group: group.sample(n=1, random_state=42)
)
# The grouped apply produces a two-level index; keep only the second level,
# which is the row label from the original table.
df.index = df.index.get_level_values(1).tolist()
df.head()

In [0]:
# Print the (rows, columns) shape of the sampled rows for each label, one per line.
print(
    *(df.loc[df['label'] == name].shape for name in ('outdoor', 'indoor', 'transportation')),
    sep='\n',
)

In [0]:
def get_datasets(urls, labels, indexes, base_dir='/content/drive/My Drive/DCASE/Datasets'):
    """Download and extract every dataset archive that is not on disk yet.

    Every (url, label, index) triple is examined; an archive that already
    exists is skipped and the loop moves on to the next one.

    Parameters
    ----------
    urls : iterable of str
        Download URL of each archive.
    labels : iterable of str
        Label of each archive. Its capitalised form names the sub-directory
        of ``base_dir`` the archive is stored in.
    indexes : iterable
        Identifier of each archive, used in the file name
        ``dataset<index>.zip``.
    base_dir : str, optional
        Directory that holds the per-label sub-directories.

    Raises
    ------
    subprocess.CalledProcessError
        If ``wget`` or ``unzip`` exits with a non-zero status.
    FileNotFoundError
        If the ``wget`` or ``unzip`` executable is not installed.

    Notes
    -----
    The external tools are started with ``subprocess`` and argument lists, so
    the space in the default path needs no shell escaping. Their console
    output goes to the kernel's own stdout rather than being captured.
    """
    # Function-scope imports keep this cell runnable on its own.
    import os
    import subprocess

    for url, label, index in zip(urls, labels, indexes):
        dirname = os.path.join(base_dir, label.capitalize())
        filename = os.path.join(dirname, 'dataset{}.zip'.format(index))

        if os.path.isfile(filename):
            print('This dataset already exists')
            # Skip only this archive; the remaining ones still get checked.
            continue

        print('Im goind to download it')
        os.makedirs(dirname, exist_ok=True)
        try:
            subprocess.run(['wget', '-O', filename, url], check=True)
        except subprocess.CalledProcessError:
            # `wget -O` creates the output file even when the transfer fails;
            # remove it so a later run does not take it for a finished download.
            if os.path.isfile(filename):
                os.remove(filename)
            raise
        subprocess.run(['unzip', filename, '-d', dirname], check=True)



In [0]:
# Fetch the archive behind every sampled row; the row index supplies the
# number used in each archive's file name.
get_datasets(urls=df['file'], labels=df['label'], indexes=df.index)

In [0]:
# Extract the archive dataset7656.zip into the Outdoor dataset folder.
!unzip /content/drive/My\ Drive/DCASE/Datasets/Outdoor/dataset7656.zip -d /content/drive/My\ Drive/DCASE/Datasets/Outdoor/

In [0]:
# List the contents of the Outdoor dataset folder.
!ls /content/drive/My\ Drive/DCASE/Datasets/Outdoor/

Reading the datasets and storing them in `signals`

In [0]:
def import_signal(path, sr=48000):
    """Load an audio file and return only its samples.

    Parameters
    ----------
    path : str
        Location of the audio file, passed unchanged to ``librosa.core.load``.
    sr : int, optional
        Sampling rate requested from librosa. Defaults to 48000.

    Returns
    -------
    The first element of the ``(samples, sampling_rate)`` pair returned by
    ``librosa.core.load``; the sampling rate itself is discarded.
    """
    samples, _ = librosa.core.load(path, sr=sr)
    return samples

In [0]:
# Class names; a label's position in this list is its numeric target.
labels = ['Indoor', 'Outdoor', 'Transportation']
rotulos = []  # numeric target of every loaded clip, aligned with `signals`
signals = []  # samples of every loaded clip
dic_signals = {label_name: [] for label_name in labels}  # per-label buckets, not filled here

for label_index, label_name in enumerate(labels):
    audio_root = '/content/drive/My Drive/DCASE/Datasets/{}/TAU-urban-acoustic-scenes-2020-3class-development/audio'.format(label_name)
    for dirname, dirnames, filenames in os.walk(audio_root):
        # os.walk guarantees no particular order. Sorting the sub-directories
        # (in place, which steers the walk) and the files makes the clip order
        # reproducible, and with it every later seeded split.
        dirnames.sort()
        for filename in sorted(filenames):
            rotulos.append(label_index)
            signals.append(import_signal(os.path.join(dirname, filename)))




Labels:


0.   Indoor
1.   Outdoor
2.   Transportation



# Transformada de Fourier de Curto Termo



hop_length : int > 0 [scalar] - Usamos 1024

number of audio samples between adjacent STFT columns.



In [0]:
def stft(signal, hop_length=1024):
    """Return the magnitude of the short-time Fourier transform of a signal.

    Parameters
    ----------
    signal : array-like
        Audio samples, passed unchanged to ``librosa.stft``.
    hop_length : int, optional
        Number of audio samples between adjacent STFT columns. Defaults to 1024.

    Returns
    -------
    The element-wise absolute value of the complex STFT.
    """
    # np.abs already yields the magnitude, so no separate magnitude/phase
    # decomposition of the result is needed.
    return np.abs(librosa.stft(signal, hop_length=hop_length))

# Magnitude spectrogram of every loaded signal, collected into one NumPy array.
signals_stft = np.asarray([stft(s) for s in signals])
signals_stft.shape

# Getting Features

1.   Centroid Mean
2.   Centroid STD
3.   Flatness Mean
4.   Flatness STD
5.   RMS


In [0]:
def get_features(signals_stft, rotulos):
    """Summarise every magnitude spectrogram as one row of scalar features.

    Parameters
    ----------
    signals_stft : iterable of array-like
        One magnitude spectrogram per clip.
        NOTE(review): assumed to be 2-D (frequency bins x frames) -- confirm
        against the caller.
    rotulos : iterable
        Target of each clip, in the same order as ``signals_stft``.

    Returns
    -------
    pandas.DataFrame
        One row per clip with the columns ``'Centroid Mean'``,
        ``'Centroid STD'``, ``'Flatness Mean'``, ``'Flatness STD'``, ``'RMS'``
        (mean frame-wise RMS, rounded to 3 decimals) and ``'Target'``.
    """
    info = {'Centroid Mean': [],
            'Centroid STD': [],
            'Flatness Mean': [],
            'Flatness STD': [],
            'RMS': [],
            'Target': []}

    for S, rotulo in zip(signals_stft, rotulos):
        info['Target'].append(rotulo)

        # Each librosa feature comes back with a leading axis of length one;
        # [0] drops it and leaves one value per frame.
        centroid = librosa.feature.spectral_centroid(S=S)[0]
        info['Centroid Mean'].append(np.mean(centroid))
        info['Centroid STD'].append(np.std(centroid))

        flatness = librosa.feature.spectral_flatness(S=S)[0]
        info['Flatness Mean'].append(np.mean(flatness))
        info['Flatness STD'].append(np.std(flatness))

        # The input is a spectrogram, so it has to be passed as `S=`; passed
        # positionally librosa would treat it as a waveform. frame_length is
        # derived from the number of frequency bins so that it agrees with
        # the FFT size the spectrogram was computed with.
        frame_length = 2 * (np.shape(S)[-2] - 1)
        rms = librosa.feature.rms(S=S, frame_length=frame_length)[0]
        # Summarise over all frames, like the other features, rather than
        # keeping the first frame only.
        info['RMS'].append(round(float(np.mean(rms)), 3))

    return pd.DataFrame(info)

In [0]:
# Build the per-clip feature table. This rebinds `df`, which until this cell
# held the sampled file/label rows.
df = get_features(signals_stft=signals_stft, rotulos=rotulos)
df

In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
# Split the table into the target column and the features (every column but
# the last), then standardise the features.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# made later in the notebook, so test-set statistics influence the scaling.
scaler = StandardScaler()
y = df['Target'].values
X = scaler.fit_transform(df.iloc[:, :-1])

In [0]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Building a KNN Model

In [0]:
def knn_classifier(X_train, X_test, y_train, k, y_true=None):
    """Fit a k-nearest-neighbours classifier and score its predictions.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets.
    X_test : array-like
        Features to predict.
    k : int
        Number of neighbours.
    y_true : array-like, optional
        Ground-truth targets for ``X_test``. When omitted, the notebook-level
        variable ``y_test`` is used, which keeps four-argument calls working.

    Returns
    -------
    list
        ``[y_pred, accuracy]``.
    """
    if y_true is None:
        # Legacy fallback: read the notebook-level variable.
        y_true = y_test
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    return [y_pred, metrics.accuracy_score(y_true, y_pred)]


def get_k_accuracy(X_train, X_test, y_train, y_test):
    """Score KNN for k = 1..30, plot accuracy against k and return the best k.

    The range is capped at the number of training samples, because a
    classifier cannot use more neighbours than it was fitted on. The best
    accuracy is printed; on ties the smallest k is returned.

    NOTE(review): k is chosen by accuracy on the very data passed as the test
    set, so the printed accuracy is an optimistic estimate.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets.
    X_test, y_test : array-like
        Features and ground-truth targets used for scoring.

    Returns
    -------
    int
        The k with the highest accuracy.

    Raises
    ------
    ValueError
        If ``X_train`` is empty.
    """
    k_values = list(range(1, min(30, len(X_train)) + 1))
    if not k_values:
        raise ValueError('X_train must contain at least one sample')

    # Pass the ground truth explicitly instead of relying on a global.
    scores = {
        k: knn_classifier(X_train, X_test, y_train, k, y_true=y_test)[1]
        for k in k_values
    }

    fig, ax = plt.subplots(1, figsize=(10, 8))
    ax.set_title("Accuracy Score by K values")
    ax.set_xlabel('Value of K for KNN')
    ax.set_ylabel('Testing Accuracy')
    ax.plot(k_values, [scores[k] for k in k_values])

    best_k = max(scores, key=scores.get)
    print('Accuracy: ' + str(scores[best_k]))
    return best_k



In [0]:
# Pick the best-scoring k, refit a classifier with it and predict the
# held-out rows.
k = get_k_accuracy(X_train, X_test, y_train, y_test)
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [0]:
from sklearn.decomposition import PCA
import seaborn as sns

# Project the test features onto their first two principal components and
# colour every point by its true class.
pca1 = PCA(n_components=2)
trans_pca1 = pd.DataFrame(pca1.fit_transform(X_test), columns=['PC1', 'PC2'])
# A target is the position of its class name in `labels`, so indexing
# recovers the name directly.
trans_pca1['Labels'] = [labels[target] for target in y_test]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=trans_pca1, x='PC1', y='PC2', hue='Labels', ax=ax)
ax.set_title('Test set projected onto the first two principal components')
plt.show()