# 1. File Loading

In [1]:
import os, glob
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import sklearn.preprocessing
import multiprocessing
from functools import partial

def load_file(path_dir):
    """Return the names of the entries in ``path_dir`` that end with ".wav".

    Only the bare entry names are returned (no directory prefix), in the
    order ``os.listdir`` yields them. The suffix match is case-sensitive.
    """
    wav_names = []
    for entry in os.listdir(path_dir):
        if entry.endswith(".wav"):
            wav_names.append(entry)
    return wav_names

# Directories holding the .wav clips of each sound class.
carhorn_path_dir = "./audio/car_horn/"
speech_path_dir = "./audio/Malespeech,manspeaking/"
music_path_dir = "./audio/Electronicmusic/"

# Prefix every .wav name with its directory so the path can be opened directly.
# Each directory string already ends with "/", so plain concatenation is enough.
carhorn_file_list = [
    carhorn_path_dir + wav_name for wav_name in load_file(carhorn_path_dir)
]
speech_file_list = [
    speech_path_dir + wav_name for wav_name in load_file(speech_path_dir)
]
music_file_list = [
    music_path_dir + wav_name for wav_name in load_file(music_path_dir)
]

print("file loading done")

file loading done


# 2. MFCC

In [2]:
def mfcc(file, n_mfcc=20, n_fft=1024, hop_length=256, max_frames=100):
    """Compute a fixed-width, standardized MFCC matrix for one audio file.

    The file is loaded at its native sampling rate, MFCCs are extracted,
    each row is standardized along axis 1 with
    ``sklearn.preprocessing.scale``, and the result is cut or right-padded
    with zeros to exactly ``max_frames`` columns.

    Args:
        file: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop length passed to librosa.
        max_frames: Number of columns in the returned matrix.

    Returns:
        A one-element list holding the padded/truncated matrix. The list
        wrapper is kept because the callers in this file read ``result[0]``.
    """
    # sr=None keeps the file's own sampling rate instead of resampling.
    audio, sr = librosa.load(file, sr=None)

    # Use the rate that was actually loaded: a hard-coded 44100 describes the
    # signal incorrectly whenever a file was recorded at another rate.
    # Keyword arguments are used because recent librosa releases no longer
    # accept the signal positionally.
    coeffs = librosa.feature.mfcc(
        y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length
    )
    coeffs = sklearn.preprocessing.scale(coeffs, axis=1)

    # Force a fixed number of columns so every file yields the same shape.
    n_frames = coeffs.shape[1]
    if n_frames > max_frames:
        coeffs = coeffs[:, :max_frames]
    else:
        padding = np.zeros((coeffs.shape[0], max_frames - n_frames))
        coeffs = np.hstack((coeffs, padding))
    return [coeffs]

# Extract MFCC features for every class in parallel.
# A single pool is reused for all three classes, and the ``with`` block makes
# sure its worker processes are cleaned up even if one of the map calls raises.
# Pool() without an argument picks its worker count from the machine's CPUs
# instead of always spawning 100 processes.
with multiprocessing.Pool() as pool:
    temp1 = pool.map(mfcc, carhorn_file_list)
    # mfcc() returns a one-element list per file; unwrap it.
    carhorn_list = [result[0] for result in temp1]
    print("make carhorn_list done")

    temp2 = pool.map(mfcc, speech_file_list)
    speech_list = [result[0] for result in temp2]
    print("make speech_list done")

    temp3 = pool.map(mfcc, music_file_list)
    music_list = [result[0] for result in temp3]
    print("make music_list done")

print("MFCC done")

make carhorn_list done
make speech_list done
make music_list done
MFCC done


# 3. Making Dataset

In [3]:
def reshape_data(data):
    """Flatten ``data`` with ``np.ravel`` and return the elements as a list."""
    flat = np.ravel(data)
    return [*flat]

def label_data(label, data):
    """Return a new list consisting of ``label`` followed by the items of ``data``.

    ``data`` is combined with ``+``, so it is expected to be a list; the
    input list itself is left unmodified.
    """
    head = [label]
    return head + data

# Turn every MFCC matrix into one flat row whose first element is the class
# label: 0 = speech, 1 = car horn, 2 = music.
flat_speech_data = [
    label_data(0, reshape_data(features)) for features in speech_list
]

flat_carhorn_data = [
    label_data(1, reshape_data(features)) for features in carhorn_list
]

flat_music_data = [
    label_data(2, reshape_data(features)) for features in music_list
]

print("labeling & reshaping done")
# for i in range(len(flat_speech_data)):
#     flat_speech_data[i].insert(0, 0)

# labeled_carhorn_list = []
# for data in carhorn_list:
#     labeled_carhorn_list.append(labeling(1,data))

# temp = []
# pool = multiprocessing.Pool(processes = 100)
# labeling_carhorn(labeling, label=1)
# temp.append(pool.map(labeling_carhorn, carhorn_list))
# pool.close()
# pool.join()
# print("labeling done")

labeling & reshaping done


# 4. Write the test and training sets to .csv

In [4]:
import csv

# Dump each class's labeled rows into its own CSV file, one sample per row.
# newline='' lets the csv module control line endings itself.
print("writing speech_mfcc.csv start")
with open('speech_mfcc.csv', 'w', newline='') as speech_csv:
    csv.writer(speech_csv).writerows(flat_speech_data)
print("writing speech_mfcc.csv finish")

print("writing carhorn_mfcc.csv start")
with open('carhorn_mfcc.csv', 'w', newline='') as carhorn_csv:
    csv.writer(carhorn_csv).writerows(flat_carhorn_data)
print("writing carhorn_mfcc.csv finish")

print("writing music_mfcc.csv start")
with open('music_mfcc.csv', 'w', newline='') as music_csv:
    csv.writer(music_csv).writerows(flat_music_data)
print("writing music_mfcc.csv finish")

writing speech_mfcc.csv start
writing speech_mfcc.csv finish
writing carhorn_mfcc.csv start
writing carhorn_mfcc.csv finish
writing music_mfcc.csv start
writing music_mfcc.csv finish


In [5]:
import random
# Merge the three labeled classes into one list and shuffle it in place so the
# classes are interleaved before the rows are written out.
dataset = [*flat_carhorn_data, *flat_speech_data, *flat_music_data]
random.shuffle(dataset)

# NOTE(review): the log messages say "first_dataset.csv" but the file that is
# actually written is '20*100_dataset.csv' - confirm which name is intended.
print("writing first_dataset.csv start")
with open('20*100_dataset.csv', 'w', newline='') as dataset_csv:
    csv.writer(dataset_csv).writerows(dataset)
print("writing first_dataset.csv finish")

writing first_dataset.csv start
writing first_dataset.csv finish


In [6]:
# The first 8/10 of the shuffled rows form the training set; the remaining
# rows form the test set.
split_index = int(len(dataset) * 8 / 10)

print("writing train_set4.csv start")
with open('train_set4.csv', 'w', newline='') as train_csv:
    csv.writer(train_csv).writerows(dataset[:split_index])
print("writing train_set4.csv finish")

print("writing test_set4.csv start")
with open('test_set4.csv', 'w', newline='') as test_csv:
    csv.writer(test_csv).writerows(dataset[split_index:])
print("writing test_set4.csv finish")

writing train_set4.csv start
writing train_set4.csv finish
writing test_set4.csv start
writing test_set4.csv finish
