#  Import CSV data

In [None]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil

# Locate the project root. This assumes the notebook is executed from a folder
# that sits two levels below the project root.
cwd = Path.cwd()
project_root = cwd.parents[1]

# Load the Great Tit metadata table.
output_metadata_path = project_root.joinpath(
    'Output_metadata', 'GreatTit_metadata', 'final_greatTit_metadata.csv'
)
final_data = pd.read_csv(output_metadata_path)

# The first column is treated as the recording name: label it "filename" and
# append the ".wav" extension to every entry.
first_column = final_data.columns[0]
final_data = final_data.rename(columns={first_column: "filename"})
final_data["filename"] = final_data["filename"].astype(str) + ".wav"

print(final_data.head())

                                  filename     class_id         ID   father  \
0  20201B161_20200418_040000_130451999.wav  20201B161_1  20201B161  te80930   
1   20201B161_20200418_050000_11502511.wav  20201B161_1  20201B161  te80930   
2   20201B161_20200418_050000_13585746.wav  20201B161_1  20201B161  te80930   
3   20201B161_20200418_050000_13982440.wav  20201B161_1  20201B161  te80930   
4   20201B161_20200418_050000_20011659.wav  20201B161_1  20201B161  te80930   

          0         1         2         3         4         5  ...       374  \
0  0.009118 -0.039894  0.019366  0.012394  0.025821  0.041465  ... -0.087559   
1  0.015071 -0.094309  0.012136 -0.006451 -0.007025  0.044730  ... -0.089846   
2  0.012520 -0.037601  0.030848  0.005622  0.046687  0.054802  ... -0.102847   
3  0.017553 -0.055216  0.018768 -0.007323  0.017101  0.049850  ... -0.083391   
4  0.033305 -0.048789  0.009809  0.000716  0.016080  0.029543  ... -0.081079   

        375       376       377       378   

### Create three .csv files (train, val, and test metadata), each keeping the original embedding columns

In [2]:
# Output location. The notebook is expected to run from Notebooks/Preprocess_greatTit,
# i.e. two levels below the project root.
project_root = Path.cwd().parents[1]
output_dir = project_root.joinpath('Output_metadata', 'GreatTit_metadata')
output_dir.mkdir(parents=True, exist_ok=True)

# Fixed seed so the split is reproducible; the lists collect row labels per split.
random_state = 42
train_idx, val_idx, test_idx = [], [], []

# Split the recordings of every individual ("father") on their own, so the
# 80/10/10 proportions are applied within each individual.
for indiv, group in final_data.groupby("father", sort=True):
    idx = group.index

    # First cut: 80% train, 20% held out.
    train, temp = train_test_split(idx, test_size=0.2, random_state=random_state)
    # Second cut: the held-out part is halved into validation and test.
    val, test = train_test_split(temp, test_size=0.5, random_state=random_state)

    train_idx += list(train)
    val_idx += list(val)
    test_idx += list(test)

# Materialise each split as its own DataFrame with a fresh 0..n-1 index.
train_df, val_df, test_df = (
    final_data.loc[rows].reset_index(drop=True)
    for rows in (train_idx, val_idx, test_idx)
)

# Write the three metadata tables into output_dir.
for csv_name, split_df in {
    "train_metadata.csv": train_df,
    "val_metadata.csv": val_df,
    "test_metadata.csv": test_df,
}.items():
    split_df.to_csv(output_dir / csv_name, index=False)

# Report the size of each split.
print(f"Train: {len(train_df)}  •  Val: {len(val_df)}  •  Test: {len(test_df)}")


Train: 59139  •  Val: 7396  •  Test: 7513


### Extract the name of each individual and count how many audio files each one has

In [3]:
# One label per recording, taken from the "father" column and cast to str.
folders = final_data.loc[:, 'father'].astype(str)

# Number of distinct individuals, and recordings per individual
# (value_counts orders the result from most to least frequent).
unique_folders = folders.nunique()
folder_counts = folders.value_counts()

print(f"Number of individuals: {unique_folders}")
print(folder_counts)


Number of individuals: 242
father
vz33280    2389
tv44361    2364
ty69215    1367
vz31728    1124
vz31410    1092
           ... 
te80791      16
vz29148      16
tw41650      15
te80832      15
vz32685      15
Name: count, Length: 242, dtype: int64


### Create the train, test, and val directories and move the audio files into them based on the metadata

For each row of the split metadata, the code:
  - Extracts the file name and the corresponding subfolder (indicated by the "father" field).
  - Ensures the destination subfolder exists.
  - Checks if the file exists in the base folder.
  - Moves the file from the base folder to its corresponding subfolder within the train, test, or validation directory, logging a message if the file isn't found.



In [None]:
# Folder holding the original song files; the split folders live inside it.
base_folder = project_root.joinpath('Original_datasets', 'greatTit_song-files')
train_dir, test_dir, val_dir = (
    base_folder / split for split in ("train", "test", "val")
)

# Create the base folder first, then one folder per split (no error if present).
for folder in (base_folder, train_dir, test_dir, val_dir):
    folder.mkdir(parents=True, exist_ok=True)

def move_files(df, dest_dir, src_dir=None):
    """Move the audio files listed in ``df`` into per-individual subfolders.

    Each file is moved from the source folder to ``dest_dir/<father>/<file name>``.
    The call is safe to repeat: a file that is no longer in the source folder
    but is already present at its destination is skipped silently, so only
    files missing from both places are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata table. The first column holds the audio file name and the
        ``'father'`` column holds the individual ID used as subfolder name.
    dest_dir : str or pathlib.Path
        Root folder of the split (e.g. the train, test, or val folder). It is
        created, together with any missing parents, when needed.
    src_dir : str or pathlib.Path, optional
        Folder that currently contains the audio files. Defaults to the
        notebook-level ``base_folder``.

    Returns
    -------
    None
    """
    # An empty table has nothing to move (and may not even have the columns).
    if df.empty:
        return

    source_root = base_folder if src_dir is None else Path(src_dir)
    dest_root = Path(dest_dir)
    created = set()  # subfolders already ensured, to avoid one mkdir per row

    # Only two columns are needed; zipping them avoids df.iterrows(), which
    # builds a full Series (every embedding column included) for each row.
    for audio_name, father in zip(df.iloc[:, 0], df['father']):
        file_name = str(audio_name)
        # Cast the ID so that non-string values can be used as a path part.
        subfolder = dest_root / str(father)
        if subfolder not in created:
            subfolder.mkdir(parents=True, exist_ok=True)
            created.add(subfolder)

        src_path = source_root / file_name
        dest_path = subfolder / file_name
        if src_path.exists():
            shutil.move(str(src_path), str(dest_path))
        elif dest_path.exists():
            # Already moved by a previous run of this cell.
            continue
        else:
            print(f"❌ File not found: {src_path}")

# Move the files of each split into its folder (train, then test, then val).
for split_df, split_dir in (
    (train_df, train_dir),
    (test_df, test_dir),
    (val_df, val_dir),
):
    move_files(split_df, split_dir)