In [2]:
import yaml
import pandas as pd
import glob
import os

In [3]:
def create_dataframe(
    path: str,
    year: str,
) -> pd.DataFrame:
    paths = glob.glob(f"{path}/**/*.ogg")
    return pd.DataFrame(
        data={
            "x": paths,
            "species": [x.split("/")[-2] for x in paths],
            "year": year,
            "file_name": [x.split("/")[-1] for x in paths],
        }
    )

In [4]:
df = pd.DataFrame()
birdclefs = {
    "2024": "../../data/birdclef-2024/train_audio",
    "2023": "../../data/birdclef-2023/train_audio",
    "2022": "../../data/birdclef-2022/train_audio",
    "2021": "../../data/birdclef-2021/train_short_audio",
    "extended": "../../data/xeno-canto-extended",
}

In [13]:
for year in birdclefs:
    path = birdclefs[year]
    new_df = create_dataframe(path, year)
    df = pd.concat([df, new_df]).reset_index(drop=True)

df["species"] = df["species"].astype("category")
# df["y"] = None
# df = pd.merge(df, mapper, on="species", how="left")

df["y"] = df["species"].cat.codes

In [14]:
# delete ../ from x
df["x"] = df["x"].apply(lambda x: x.replace("../", ""))

In [15]:
witout_duplicates = df[~df.duplicated(subset=["species", "file_name"], keep="first")]

In [16]:
df["species"] = df["species"].astype("category")

df["y"] = df["species"].cat.codes

In [17]:
witout_duplicates.to_csv("../../data/processed/train_df.csv", index=False)

In [7]:
fine_tune_df = create_dataframe(birdclefs["2024"], "2024")
fine_tune_df["species"] = fine_tune_df["species"].astype("category")

fine_tune_df["y"] = fine_tune_df["species"].cat.codes
fine_tune_df["x"] = fine_tune_df["x"].apply(lambda x: x.replace("../", ""))
fine_tune_df.to_csv("../../data/processed/fine_tune_df.csv", index=False)

In [75]:
df[df["dubliated"]]

Unnamed: 0,x,species,year,file_name,y,dubliated
24896,../../data/birdclef-2023/train_audio/eaywag1/X...,eaywag1,2023,XC645111.ogg,314,True
24902,../../data/birdclef-2023/train_audio/eaywag1/X...,eaywag1,2023,XC507665.ogg,314,True
24911,../../data/birdclef-2023/train_audio/eaywag1/X...,eaywag1,2023,XC675509.ogg,314,True
24913,../../data/birdclef-2023/train_audio/eaywag1/X...,eaywag1,2023,XC373755.ogg,314,True
24915,../../data/birdclef-2023/train_audio/eaywag1/X...,eaywag1,2023,XC738300.ogg,314,True
...,...,...,...,...,...,...
142901,../../data/xeno-canto-extended/chispa/XC137480...,chispa,extended,XC137480.ogg,231,True
142902,../../data/xeno-canto-extended/chispa/XC149900...,chispa,extended,XC149900.ogg,231,True
142903,../../data/xeno-canto-extended/chispa/XC571801...,chispa,extended,XC571801.ogg,231,True
142906,../../data/xeno-canto-extended/chispa/XC142506...,chispa,extended,XC142506.ogg,231,True


In [13]:
df.to_parquet("../../data/processed/files.parquet")

## Checking if we have only one example of some class

In [83]:
class_counts = df["species"].value_counts()
invalid_classes = class_counts[class_counts == 1]
df[df["species"].isin(invalid_classes.index.to_list())]

Unnamed: 0,x,species,dataset,y
24792,data/birdclef-2023/train_audio/lotcor1/XC31723...,lotcor1,2023,493
27371,data/birdclef-2023/train_audio/brtcha1/XC12812...,brtcha1,2023,155
28499,data/birdclef-2023/train_audio/crefra2/XC66744...,crefra2,2023,262
28855,data/birdclef-2023/train_audio/whctur2/XC44463...,whctur2,2023,856
31725,data/birdclef-2023/train_audio/yebsto1/XC33785...,yebsto1,2023,904
34849,data/birdclef-2023/train_audio/whhsaw1/XC28926...,whhsaw1,2023,861
38206,data/birdclef-2023/train_audio/afpkin1/XC70486...,afpkin1,2023,13
48751,data/birdclef-2022/train_audio/maupar/XC123887...,maupar,2022,519
