In [65]:
import yaml
import pandas as pd
import glob
import os

In [66]:
with open("../../filepaths.yml", "r") as file:
    filepaths = yaml.safe_load(file)

In [67]:
def get_classes(path: str) -> list[str]:
    class_names = sorted(os.listdir(path))
    return class_names

In [68]:
# create df with x = path to audio file, y = class
def create_dataframe(path: str, datasetName: str) -> pd.DataFrame:
    path = f"../../{path}"
    paths = glob.glob(f"{path}/**/*.ogg")
    paths = [x.replace("../../", "") for x in paths]

    df = pd.DataFrame(
        data={
            "x": paths,
            "species": [x.split("/")[-2] for x in paths],
            "dataset": datasetName,
        }
    )
    return df

In [69]:
df = pd.DataFrame()

for source in filepaths["birdclefs"]:
    path = filepaths["birdclefs"][source]
    new_df = create_dataframe(path, source)
    df = pd.concat([df, new_df])

df["species"] = df["species"].astype("category")
df["y"] = df["species"].cat.codes

In [70]:
# make sure that the y values are unique for each class
assert len(df["species"].unique()) == len(df["y"].unique())

In [71]:
df = df.reset_index(drop=True)

In [72]:
# get number of classes
n_classes = len(df["species"].unique())

In [73]:
len(df)

119126

In [74]:
df.head()

Unnamed: 0,x,species,dataset,y
0,data/birdclef-2024/train_audio/rutfly6/XC52430...,rutfly6,2024,700
1,data/birdclef-2024/train_audio/rutfly6/XC81798...,rutfly6,2024,700
2,data/birdclef-2024/train_audio/rutfly6/XC81798...,rutfly6,2024,700
3,data/birdclef-2024/train_audio/rutfly6/XC30761...,rutfly6,2024,700
4,data/birdclef-2024/train_audio/rutfly6/XC30751...,rutfly6,2024,700


In [75]:
df["species"].value_counts()

species
comsan     1500
barswa     1500
houspa     1500
bcnher     1296
eaywag1    1000
           ... 
yebsto1       1
crefra2       1
afpkin1       1
whctur2       1
lotcor1       1
Name: count, Length: 926, dtype: int64

In [13]:
df.to_parquet("../../data/processed/files.parquet")

## Checking if we have only one example of some class

In [83]:
class_counts = df["species"].value_counts()
invalid_classes = class_counts[class_counts == 1]
df[df["species"].isin(invalid_classes.index.to_list())]

Unnamed: 0,x,species,dataset,y
24792,data/birdclef-2023/train_audio/lotcor1/XC31723...,lotcor1,2023,493
27371,data/birdclef-2023/train_audio/brtcha1/XC12812...,brtcha1,2023,155
28499,data/birdclef-2023/train_audio/crefra2/XC66744...,crefra2,2023,262
28855,data/birdclef-2023/train_audio/whctur2/XC44463...,whctur2,2023,856
31725,data/birdclef-2023/train_audio/yebsto1/XC33785...,yebsto1,2023,904
34849,data/birdclef-2023/train_audio/whhsaw1/XC28926...,whhsaw1,2023,861
38206,data/birdclef-2023/train_audio/afpkin1/XC70486...,afpkin1,2023,13
48751,data/birdclef-2022/train_audio/maupar/XC123887...,maupar,2022,519
