In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import pandas as pd
import swifter
import librosa
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval

parent = os.path.abspath(os.path.join('..'))
sys.path.append(parent)

import data.misc as misc

  from .autonotebook import tqdm as notebook_tqdm


Get metadatas

In [2]:
class_df = pd.read_csv(
    f"http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv",
    sep=",",
)

eval_df = pd.read_csv(
    "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv",
    sep=", ",
    skiprows=3,
    header=None,
    names=["YTID", "start_seconds", "end_seconds", "positive_labels"],
    engine="python",
)

balanced_train_df = pd.read_csv(
    "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv",
    sep=", ",
    skiprows=3,
    header=None,
    names=["YTID", "start_seconds", "end_seconds", "positive_labels"],
    engine="python",
)

unbalanced_train_df = pd.read_csv(
    "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv",
    sep=", ",
    skiprows=3,
    header=None,
    names=["YTID", "start_seconds", "end_seconds", "positive_labels"],
    engine="python",
)

In [3]:
eval_df["positive_labels"] = eval_df["positive_labels"].apply(literal_eval)
balanced_train_df["positive_labels"] = balanced_train_df["positive_labels"].apply(literal_eval)
unbalanced_train_df["positive_labels"] = unbalanced_train_df["positive_labels"].apply(literal_eval)

In [4]:
def _extract_class_info(row):
    positive_labels = row["positive_labels"].split(",")
    labels_index, labels_name = list(), list()
    for label_code in positive_labels:
        label_index = class_code_to_index.index(label_code)
        label_name = class_code_to_name[label_index]
        labels_index.append(label_index)
        labels_name.append(label_name)

    return pd.Series([labels_index, labels_name], index=['labels_index', 'labels_name'])


class_code_to_index = list(class_df["mid"].values)
class_code_to_name = list(class_df["display_name"].values)

eval_df_ = eval_df.apply(_extract_class_info, axis=1)
eval_df = pd.concat([eval_df, eval_df_], axis=1)

balanced_train_df_ = balanced_train_df.apply(_extract_class_info, axis=1)
balanced_train_df = pd.concat([balanced_train_df, balanced_train_df_], axis=1)

unbalanced_train_df_ = unbalanced_train_df.apply(_extract_class_info, axis=1)
unbalanced_train_df = pd.concat([unbalanced_train_df, unbalanced_train_df_], axis=1)

metadata_path = "/storage11/datasets/audioset/metadata/"
misc.save_df(class_df, os.path.join(metadata_path, "class_labels_indices.csv"))
misc.save_df(eval_df, os.path.join(metadata_path, "eval_segments.csv"))
misc.save_df(balanced_train_df, os.path.join(metadata_path, "balanced_train_segments.csv"))
misc.save_df(unbalanced_train_df, os.path.join(metadata_path, "unbalanced_train_segments.csv"))

In [5]:
eval_df.head()

Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels,labels_index,labels_name
0,--4gqARaEJE,0.0,10.0,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk","[73, 361, 74, 72]","[Domestic animals, pets, Squeak, Dog, Animal]"
1,--BfvyPmVMo,20.0,30.0,/m/03l9g,[419],[Hammer]
2,--U7joUcTCo,0.0,10.0,/m/01b_21,[47],[Cough]
3,--i-y1v8Hy8,0.0,9.0,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005","[137, 0, 33, 34]","[Music, Speech, Female singing, Child singing]"
4,-0BIyqJj9ZU,30.0,40.0,"/m/07rgt08,/m/07sq110,/t/dd00001","[21, 20, 17]","[Chuckle, chortle, Belly laugh, Baby laughter]"


Prepare dataframes for training

In [6]:
CLASSES = list(range(527))

from concurrent.futures import ThreadPoolExecutor, as_completed


def process_audio_file(path):
    try:
        audio, _ = librosa.load(path, sr=None, mono=True)
        if audio.std() > 0:
            return path
        else:
            print(f"Removing {path} due to zero standard deviation.")
    except Exception as e:
        print(f"Removing {path} due to error: {e}")
    os.remove(path)
    return None


def clean_audio_files(audio_paths, max_workers=32):
    valid_audio_paths = list()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_path = {executor.submit(process_audio_file, path): path for path in audio_paths}
        for future in as_completed(future_to_path):
            result = future.result()
            if result:
                valid_audio_paths.append(result)

    return valid_audio_paths


def extend_multi_label_to_one_hot(df, column_name):
    print(" Extending multi label to one hot")
    mlb = MultiLabelBinarizer(classes=CLASSES)
    mlb.fit(df[column_name])
    one_hot_encoded = mlb.transform(df[column_name])
    one_hot_df = pd.DataFrame(data=one_hot_encoded, columns=mlb.classes_)
    df = pd.concat([df, one_hot_df], axis=1)

    return df


def clean_corrupted(root_path):
    print(" Cleaning corrupted files...")
    audios = misc.get_file_list(root_path)
    _ = clean_audio_files(audios)


def build_database_df(df, root_path, ext=".wav", chunk_size=50000):
    def _extract_audio_info(row):
        audio_id = row["YTID"]
        path = os.path.join(root_path, audio_id + ext)
        relative = path.replace(root_path, "")
        basename = os.path.basename(path)
        length_seconds = row["end_seconds"] - row["start_seconds"]
        exists = os.path.exists(path)

        return pd.Series(
            [
                path,
                relative,
                basename,
                length_seconds,
                exists,
            ],
            index=[
                "path",
                "relative",
                "basename",
                "length_seconds",
                "exists",
            ],
        )

    if df.shape[0] < chunk_size:
        df_ = df.swifter.apply(_extract_audio_info, axis=1)
        df = pd.concat([df, df_], axis=1)
    else:
        chunks = list()
        for start in range(0, df.shape[0], chunk_size):
            end = start + chunk_size
            df_slice = df.iloc[start:end]
            df_slice_ = df_slice.swifter.apply(_extract_audio_info, axis=1)
            df_slice = pd.concat([df_slice, df_slice_], axis=1)
            chunks.append(df_slice)
        df = pd.concat(chunks, ignore_index=True)

    df = df[df["exists"] == True].reset_index(drop=True)
    return df


print("Eval-dataframe...")
root_path = "/storage11/datasets/audioset/audios/eval/"
clean_corrupted(root_path)
records_eval_df = build_database_df(eval_df, root_path)
misc.save_df(records_eval_df, os.path.join(root_path, "records.pkl"))
records_eval_df = extend_multi_label_to_one_hot(records_eval_df, "labels_index")
misc.save_df(records_eval_df, os.path.join(root_path, "records-hot1.pkl"))


print("Balanced-dataframe...")
clean_corrupted(root_path)
root_path = "/storage11/datasets/audioset/audios/balanced_train/"
records_balanced_train_df = build_database_df(balanced_train_df, root_path)
misc.save_df(records_balanced_train_df, os.path.join(root_path, "records.pkl"))
records_balanced_train_df = extend_multi_label_to_one_hot(records_balanced_train_df, "labels_index")
misc.save_df(records_balanced_train_df, os.path.join(root_path, "records-hot1.pkl"))


print("Unbalanced-dataframe...")
clean_corrupted(root_path)
root_path = "/storage11/datasets/audioset/audios/unbalanced_train/"
records_unbalanced_train_df = build_database_df(unbalanced_train_df, root_path)
misc.save_df(records_unbalanced_train_df, os.path.join(root_path, "records.pkl"))
records_unbalanced_train_df = extend_multi_label_to_one_hot(
    records_unbalanced_train_df, "labels_index"
)
misc.save_df(records_unbalanced_train_df, os.path.join(root_path, "records-hot1.pkl"))

Eval-dataframe...
 Cleaning corrupted files...


Pandas Apply: 100%|██████████| 20371/20371 [00:04<00:00, 4192.78it/s]


 Extending multi label to one hot
Balanced-dataframe...
 Cleaning corrupted files...


Pandas Apply: 100%|██████████| 22160/22160 [00:05<00:00, 3995.24it/s]


 Extending multi label to one hot
Unbalanced-dataframe...
 Cleaning corrupted files...


Pandas Apply: 100%|██████████| 50000/50000 [00:12<00:00, 3929.42it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:12<00:00, 4004.67it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:12<00:00, 4019.82it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:12<00:00, 3976.68it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:16<00:00, 3040.59it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:15<00:00, 3242.58it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:15<00:00, 3229.22it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:12<00:00, 4033.79it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:15<00:00, 3277.05it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:18<00:00, 2705.65it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:12<00:00, 3993.05it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:15<00:00, 3266.09it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:14<00:00, 3341.44it/s]
Pandas Apply: 100%|██████████| 50000/50000 [00:14<00:00, 3349.28it/s]
Pandas Apply: 100%|█

 Extending multi label to one hot


In [7]:
records_balanced_train_df["source"] = "balanced_train"
records_unbalanced_train_df["source"] = "unbalanced_train"
df = pd.concat([records_balanced_train_df, records_unbalanced_train_df], ignore_index=True)
df = df.drop(["relative"], axis=1)

In [8]:
def _fix_relative(row):
    return row["path"].replace(root_path, "")


root_path = "/storage11/datasets/audioset/audios/"
df["relative"] = df.apply(_fix_relative, axis=1)
misc.save_df(df, os.path.join(root_path, "train-full-records-hot1.pkl"))

In [9]:
df.head()

Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels,labels_index,labels_name,path,basename,length_seconds,exists,...,519,520,521,522,523,524,525,526,source,relative
0,--PJHxphWEs,30.0,40.0,"/m/09x0r,/t/dd00088","[0, 451]","[Speech, Gush]",/storage11/datasets/audioset/audios/balanced_t...,--PJHxphWEs.wav,10.0,True,...,0,0,0,0,0,0,0,0,balanced_train,balanced_train/--PJHxphWEs.wav
1,--aE2O5G5WE,0.0,10.0,"/m/03fwl,/m/04rlf,/m/09x0r","[95, 137, 0]","[Goat, Music, Speech]",/storage11/datasets/audioset/audios/balanced_t...,--aE2O5G5WE.wav,10.0,True,...,0,0,0,0,0,0,0,0,balanced_train,balanced_train/--aE2O5G5WE.wav
2,--aaILOrkII,200.0,210.0,"/m/032s66,/m/073cg4","[427, 431]","[Gunshot, gunfire, Cap gun]",/storage11/datasets/audioset/audios/balanced_t...,--aaILOrkII.wav,10.0,True,...,0,0,0,0,0,0,0,0,balanced_train,balanced_train/--aaILOrkII.wav
3,--ekDLDTUXA,30.0,40.0,"/m/015lz1,/m/07pws3f","[27, 466]","[Singing, Bang]",/storage11/datasets/audioset/audios/balanced_t...,--ekDLDTUXA.wav,10.0,True,...,0,0,0,0,0,0,0,0,balanced_train,balanced_train/--ekDLDTUXA.wav
4,-0DLPzsiXXE,30.0,40.0,"/m/04rlf,/m/07qwdck","[137, 482]","[Music, Ping]",/storage11/datasets/audioset/audios/balanced_t...,-0DLPzsiXXE.wav,10.0,True,...,0,0,0,0,0,0,0,0,balanced_train,balanced_train/-0DLPzsiXXE.wav
