In [24]:
import yaml
import pandas as pd
import glob
import os

In [25]:
with open("../../filepaths.yml", "r") as file:
    filepaths = yaml.safe_load(file)

In [26]:
def get_classes(path: str) -> list[str]:
    class_names = sorted(os.listdir(path))
    return class_names

In [27]:
# create df with x = path to audio file, y = class
def create_dataframe(path: str) -> pd.DataFrame:
    path = f"../../{path}"
    class_names = get_classes(path)
    class_names_dic = {class_names[i]: i for i in range(0, len(class_names))}
    paths = glob.glob(f"{path}/**/*.ogg")
    paths = [x.replace("../../", "") for x in paths]

    df = pd.DataFrame(data={"x": paths, "species": [x.split("/")[-2] for x in paths]})
    df["species"] = df["species"].astype("category")
    df["y"] = df["species"].map(class_names_dic)
    return df

In [28]:
df = pd.DataFrame()

for source in filepaths["birdclefs"]:
    path = filepaths["birdclefs"][source]
    new_df = create_dataframe(path)
    df = pd.concat([df, new_df])

In [29]:
df = df.reset_index(drop=True)

In [31]:
df

Unnamed: 0,x,species,y
0,data/birdclef-2024/train_audio/rutfly6/XC52430...,rutfly6,148
1,data/birdclef-2024/train_audio/rutfly6/XC81798...,rutfly6,148
2,data/birdclef-2024/train_audio/rutfly6/XC81798...,rutfly6,148
3,data/birdclef-2024/train_audio/rutfly6/XC30761...,rutfly6,148
4,data/birdclef-2024/train_audio/rutfly6/XC30751...,rutfly6,148
...,...,...,...
119121,data/birdclef-2021/train_short_audio/chispa/XC...,chispa,95
119122,data/birdclef-2021/train_short_audio/chispa/XC...,chispa,95
119123,data/birdclef-2021/train_short_audio/chispa/XC...,chispa,95
119124,data/birdclef-2021/train_short_audio/chispa/XC...,chispa,95


In [30]:
len(df)

119126

In [8]:
df.head()

Unnamed: 0,x,species,y
0,data/birdclef-2024/train_audio/rutfly6/XC52430...,rutfly6,148
1,data/birdclef-2024/train_audio/rutfly6/XC81798...,rutfly6,148
2,data/birdclef-2024/train_audio/rutfly6/XC81798...,rutfly6,148
3,data/birdclef-2024/train_audio/rutfly6/XC30761...,rutfly6,148
4,data/birdclef-2024/train_audio/rutfly6/XC30751...,rutfly6,148


In [9]:
df["species"].value_counts()

species
barswa     1500
comsan     1500
houspa     1500
bcnher     1296
norcar     1000
           ... 
yebsto1       1
afpkin1       1
whctur2       1
maupar        1
lotcor1       1
Name: count, Length: 926, dtype: int64

In [13]:
df.to_parquet("../../data/processed/files.parquet")