# **Imports**

In [1]:
from google.colab import drive
import zipfile
import os
import pandas as pd
import numpy as np
import ast
import random
import librosa
import tensorflow as tf
import joblib
drive.mount('/content/drive')
import sys
sys.path.append('/content/drive/MyDrive/Main_Birdclef/scripts')
import birdclef_utils

birdclef_utils.retrieve_and_process_birdclef_data()
birdclef_utils.retrieve_and_process_birdclef_data(zip_filename='ColabUploads.zip')

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully extracted all files from birdclef-2025.zip to /content/data
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully extracted all files from ColabUploads.zip to /content/data


# **Summary**

This notebook generates pseudo labels for bird species classes **not covered** by the BirdNET Analyzer or the Google Bird Vocalization Classifier.

For the train audio recordings, we use a custom model trained with **MobileNet weights** and a **softmax output** over 206 classes. This model predicts the primary assigned bird label, which is assumed to be the most prevalent species in the recording.

For secondary labels, we employ **individual binary classifiers** specifically trained to detect the presence or absence of a particular bird species. These binary models are also based on MobileNet architectures pretrained on ImageNet. This approach is useful because these models are optimized to recognize a single bird species even in recordings with overlapping vocalizations from multiple species.

This two-tiered modeling approach improves pseudo label coverage and accuracy beyond what is achievable with BirdNET and the Google classifier alone.


# **Directories**

In [6]:
main_dir='/content/data/'
main_processed_dir=os.path.join(main_dir,'ColabUploads')
processed_dir=os.path.join(main_processed_dir,'KaggleUploads')

drive_dir='/content/drive/MyDrive'
main_birdclef_dir=os.path.join(drive_dir,'Main_Birdclef')
csv_dir=os.path.join(main_birdclef_dir,'CSVs')
supplemental_files_dir=os.path.join(main_birdclef_dir,'supplemental_files')
models_dir=os.path.join(main_birdclef_dir,'models')

output_dir=csv_dir

# **Main metadata**

In [3]:
df = pd.read_csv(os.path.join(processed_dir,'speech_cleaned_audio_with_duration.csv'), dtype={'primary_label': 'object'})
df['secondary_labels'] = df['secondary_labels'].apply(ast.literal_eval)
df['isOneBird']=df['secondary_labels'].apply(lambda x: True if len(x)==0 or x[0]=='' else False)
taxonomy=pd.read_csv(os.path.join(main_dir,'taxonomy.csv'))

# **Find Non-Birdnet Species and Filenames**:

This code identifies bird species labels and associated audio files that are **not included in the BirdNET primary training classes**.



In [7]:
train_labels_df=pd.read_csv(os.path.join(csv_dir,'birdnet_train_labels_final.csv'))
all_labels=train_labels_df['primary_label'].unique()
all_labels_in_data=df['primary_label'].unique()
labels_to_predict=[l for l in all_labels_in_data if l not in all_labels]
filenames_to_process=df[df['primary_label'].isin(labels_to_predict)]['filename']
print(f'Number of filenames to processes {len(filenames_to_process)}\n')

# Birdnet Generated predictions
print('Species With minimal high confidence samples')
all_labels=df['primary_label'].unique()
for bird in all_labels:
    check=train_labels_df[train_labels_df['filename'].str.contains(bird)]
    check=check[check['confidence']>0.5]
    if len(check)<25:
        print(f'{bird}:{len(check)}')


Number of filenames to processes 931

Species With minimal high confidence samples
1139490:0
1192948:0
1194042:0
126247:0
1346504:0
134933:0
135045:0
1462711:0
1462737:0
1564122:0
21038:0
21116:0
21211:1
22333:0
22973:0
22976:0
24272:0
24292:0
24322:0
41663:0
41778:0
41970:0
42007:0
42087:0
42113:0
46010:0
47067:4
476537:1
476538:0
48124:0
50186:0
517119:0
523060:0
528041:0
52884:0
548639:0
555086:0
555142:0
566513:0
64862:0
65336:12
65344:0
65349:0
65373:0
65419:2
65448:0
65547:0
65962:0
66016:0
66531:4
66578:0
66893:0
67082:0
67252:0
714022:0
715170:0
787625:0
81930:0
868458:0
963335:0
bafibi1:23
grysee1:6
plctan1:19
shghum1:0
turvul:1


In [8]:
class SlidingWindowInferenceGenerator:
    def __init__(
        self,
        file_list,
        sr=32000,
        chunk_duration=5.0,
        step_duration=1.0,
        n_mels=128,
        target_time_length_spectrogram=313,
        normalize_audio=True,
        background_flag=0,
        audio_dir='train_audio'
    ):
        self.file_list = file_list
        self.sr = sr
        self.chunk_duration = chunk_duration
        self.step_duration = step_duration
        self.n_mels = n_mels
        self.target_time_length_spectrogram = target_time_length_spectrogram
        self.normalize_audio = normalize_audio
        self.background_flag = background_flag
        self.audio_dir = audio_dir

    def _pad_or_truncate_audio(self, data, target_length):
        if data.shape[0] < target_length:
            padding = np.zeros((target_length - data.shape[0],), dtype=data.dtype)
            return np.concatenate((data, padding))
        elif data.shape[0] > target_length:
            return data[:target_length]
        else:
            return data

    def _normalize(self, audio):
        peak = np.abs(audio).max()
        if peak > 0:
            return audio / peak
        return audio

    def _pad_or_truncate(self, data, target_length):
        if data.shape[0] < target_length:
            padding = np.zeros((target_length - data.shape[0], data.shape[1]), dtype=data.dtype)
            return np.concatenate((data, padding), axis=0)
        elif data.shape[0] > target_length:
            return data[:target_length]
        else:
            return data

    def _extract_features(self, audio):
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=self.sr, n_mels=self.n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).T

        min_db = -80.0
        max_db = 0.0

        mel_spec_scaled = (mel_spec_db - min_db) / (max_db - min_db)
        mel_spec_scaled = np.clip(mel_spec_scaled, 0.0, 1.0)

        # Pad/truncate to fixed length and add a channel axis
        mel_spec_padded = self._pad_or_truncate(mel_spec_scaled, self.target_time_length_spectrogram)[:, :, np.newaxis]
        # Make 3-channel for EfficientNet
        mel_spec_padded_3_channel = np.repeat(mel_spec_padded, 3, axis=-1)
        return mel_spec_padded_3_channel

    def generate(self):
        for filename in self.file_list:
            try:
                audio, _ = librosa.load(os.path.join(self.audio_dir, filename), sr=self.sr)
            except Exception as e:
                print(f"Error loading {filename}: {e}. Skipping.")
                continue
            if self.normalize_audio:
                audio = self._normalize(audio)
            total_samples = len(audio)
            window_length = int(self.chunk_duration * self.sr)
            step_length = int(self.step_duration * self.sr)
            mel_features = []
            start_times = []
            for start in range(0, total_samples - window_length + 1, step_length):
                end = start + window_length
                chunk = audio[start:end]
                if len(chunk) < window_length:
                    chunk = self._pad_or_truncate_audio(chunk, window_length)
                mel = self._extract_features(chunk)
                mel_features.append(mel)
                start_times.append(start / self.sr)
            mel_features = np.array(mel_features)
            # --- Added check: skip files with no valid chunks ---
            if mel_features.shape[0] == 0:
                print(f"Skipping {filename}: no valid chunks")
                continue
            yield mel_features, filename, start_times


# **Train_Audio: Rare (non_birdnet) Species**
- Get Confidences for labels not in birdnet or GBVC

### General Explanation of This Section:

This code uses a pre-trained softmax bird classification model (built from MobileNet weights) to generate prediction confidences specifically for bird species that are **not included in the BirdNET label set but assigned as primary labels in the BirdCLEF dataset**.

- It either runs inference via a sliding window generator on the audio files with these non-BirdNET primary labels or loads previously saved predictions.

- For each audio chunk, the model predicts probabilities over all possible bird classes in its label encoder.

- It extracts the confidence for the primary bird label assigned to that file.

- The results from all files and chunks are consolidated into a DataFrame including filename, start time, predicted confidence, and label metadata.

- The final DataFrame filters for predictions with confidence greater than 0.5 and saves the results for downstream use.

**In essence:**  
This step leverages a specialized softmax model trained to identify the dominant bird in an audio clip, generating high-confidence predictions for bird species missing from BirdNET’s original classes but labeled as primary birds in BirdCLEF data.


In [12]:
rerun = False
primary_label_mapping=df.set_index('filename')['primary_label'].to_dict()
if rerun:
    file_list = filenames_to_process
    audio_dir = os.path.join(main_dir,'train_audio')
    model_path = os.path.join(models_dir, 'best_model_by_val_loss_softmax.keras')
    label_encoder = joblib.load(os.path.join(supplemental_files_dir,'bird_label_encoder.joblib'))
    model = tf.keras.models.load_model(model_path)

    gen = SlidingWindowInferenceGenerator(
        file_list,
        sr=32000,
        chunk_duration=5.0,
        step_duration=1.0,
        n_mels=128,
        target_time_length_spectrogram=320,
        normalize_audio=True,
        background_flag=0,
        audio_dir=audio_dir
    )

    top_n = 3
    results = []

    for mel_features, filename, start_times in gen.generate():
        preds = model.predict(mel_features)
        class_names = label_encoder.classes_
        primary_label = primary_label_mapping.get(filename, None)
        if primary_label not in class_names:
            print(f"{filename}: primary label '{primary_label}' not in class list. Skipping.")
            continue
        class_idx = np.where(class_names == primary_label)[0][0]
        for i, start_time in enumerate(start_times):
            prob = float(preds[i, class_idx])
            row = {
                "filename": filename,
                "start_time": start_time,
                "primary_label": primary_label,
                "prob": prob
            }
            results.append(row)


    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(output_dir,'sliding_window_preds_softmax.csv'), index=False)
else:
    results_df = pd.read_csv(os.path.join(output_dir,'sliding_window_preds_softmax.csv'))
print(results_df.head())

to_join=df[['filename','isOneBird','secondary_labels']]
merged=pd.merge(results_df,to_join,on='filename')
merged['end_time']=merged['start_time']+5
merged['confidence']=merged['prob']
merged['scientific_name']=None
final_df=merged[['filename','start_time','end_time','confidence','scientific_name','primary_label']]
final_df=final_df[final_df['confidence']>0.5]
final_df.head(20)

final_df.to_csv(os.path.join(output_dir,'non_birdnet_confidences_from_softmax.csv'),index=False)


               filename  start_time primary_label      prob
0  1139490/CSA36385.ogg         0.0       1139490  0.999727
1  1139490/CSA36385.ogg         1.0       1139490  0.997495
2  1139490/CSA36385.ogg         2.0       1139490  0.998011
3  1139490/CSA36385.ogg         3.0       1139490  0.998734
4  1139490/CSA36385.ogg         4.0       1139490  0.791113


Unnamed: 0,filename,start_time,end_time,confidence,scientific_name,primary_label
0,1139490/CSA36385.ogg,0.0,5.0,0.999727,,1139490
1,1139490/CSA36385.ogg,1.0,6.0,0.997495,,1139490
2,1139490/CSA36385.ogg,2.0,7.0,0.998011,,1139490
3,1139490/CSA36385.ogg,3.0,8.0,0.998734,,1139490
4,1139490/CSA36385.ogg,4.0,9.0,0.791113,,1139490
186,1192948/CSA36358.ogg,0.0,5.0,0.996588,,1192948
187,1192948/CSA36358.ogg,1.0,6.0,0.998313,,1192948
188,1192948/CSA36358.ogg,2.0,7.0,0.998524,,1192948
189,1192948/CSA36358.ogg,3.0,8.0,0.998665,,1192948
190,1192948/CSA36358.ogg,4.0,9.0,0.986442,,1192948


# **Confidences for Secondary_Labels for non_birdnet/gvbc labels**

This code block identifies audio files that contain **secondary labels** which are *not* part of the primary label set used in the BirdNET training data.


### Explanation of Cell 1:

**Cell 1:**  
This cell focuses on identifying audio files where the **primary label belongs to the BirdNET training classes** but the file additionally contains **secondary labels that are *not* among BirdNET’s primary classes**.  

In other words, it finds files where the **main bird species is one that BirdNET recognizes**, but there are other species present (as secondary labels) that BirdNET hasn't explicitly modeled. This highlights cases where BirdNET’s coverage is incomplete for the secondary species.


In [13]:
train_labels_df=pd.read_csv(os.path.join(csv_dir,'birdnet_train_labels_final.csv'))
all_labels=train_labels_df['primary_label'].unique()
all_labels_in_data=df['primary_label'].unique()
labels_to_predict=[l for l in all_labels_in_data if l not in all_labels]
print(labels_to_predict)
data_with_secondary=df[~df['isOneBird']]

# Initialize dictionary to hold filename -> list of unmapped secondary labels
files_with_unmapped_labels_dict = {}

# Iterate over rows in data_with_secondary
for idx, row in data_with_secondary.iterrows():
    secondary_labels = row['secondary_labels']

    # Normalize secondary_labels to a list
    if isinstance(secondary_labels, str):
        # Adjust delimiter if needed; here assuming space-separated labels
        secondary_labels = [label.strip() for label in secondary_labels.split() if label.strip()]
    elif isinstance(secondary_labels, list):
        secondary_labels = [label.strip() for label in secondary_labels if label.strip()]
    else:
        secondary_labels = []

    # Find unmapped secondary labels in this row
    unmapped = [label for label in secondary_labels if label in labels_to_predict]

    # If there are any unmapped labels, add to dictionary
    if unmapped:
        files_with_unmapped_labels_dict[row['filename']] = unmapped

# Now you can check:
print()
print(f"Files with at least one unmapped secondary label: {len(files_with_unmapped_labels_dict)}")
print("Examples (filename: unmapped_secondary_labels):")
for fname, labels in list(files_with_unmapped_labels_dict.items())[:10]:
    print(f"{fname}: {labels}")
filenames_mapping_needed = list(files_with_unmapped_labels_dict.keys())
applicable_data = train_labels_df[train_labels_df['filename'].isin(filenames_mapping_needed)]
new_label_rows = []

for idx, row in applicable_data.iterrows():
    filename = row['filename']
    start_time = row['start_time']
    end_time = row['end_time']
    confidence = 1
    scientific_name = None
    for unmapped_label in files_with_unmapped_labels_dict[filename]:
        primary_label = unmapped_label
        new_label_rows.append({
            'filename': filename,
            'start_time': start_time,
            'end_time': end_time,
            'confidence': confidence,
            'scientific_name': scientific_name,
            'primary_label': primary_label
        })

new_df = pd.DataFrame(new_label_rows)

print(new_df.head())


['1139490', '1192948', '1194042', '126247', '1346504', '134933', '135045', '1462711', '1462737', '1564122', '21038', '21116', '21211', '22333', '22973', '22976', '24272', '24292', '24322', '41663', '41778', '41970', '42007', '42087', '42113', '46010', '47067', '476537', '476538', '48124', '50186', '517119', '523060', '528041', '52884', '548639', '555086', '555142', '566513', '64862', '65336', '65344', '65349', '65373', '65419', '65448', '65547', '65962', '66016', '66531', '66578', '66893', '67082', '67252', '714022', '715170', '787625', '81930', '868458', '963335', 'shghum1']

Files with at least one unmapped secondary label: 7
Examples (filename: unmapped_secondary_labels):
126247/XC941297.ogg: ['65448', '22976', '476538']
476538/XC926710.ogg: ['65448']
65349/XC941283.ogg: ['65547']
65448/XC941294.ogg: ['22976']
bkcdon/XC259283.ogg: ['566513']
butsal1/XC259767.ogg: ['566513']
butsal1/XC259771.ogg: ['566513']
               filename  start_time  end_time  confidence scientific_name  \


### Explanation of cell 2:

This cell collects confidence values for **non-BirdNET secondary labels** in files where the **primary label is also a non-BirdNET label**.

1. It reads an existing CSV (`non_birdnet_confidences_from_softmax.csv`) that presumably contains confidence scores for some non-BirdNET labels.

2. Filters that data to only include files that have unmapped secondary labels (`filenames_mapping_needed`).

3. Iterates over each relevant row, and for each unmapped secondary label linked to the file, creates new rows with:
   - filename,
   - segment start and end times,
   - confidence fixed at 1 (note: this confidence is only temporarily set to 1 as a placeholder),
   - scientific_name set as None,
   - and the unmapped label as `primary_label`.

4. These new rows represent confident pseudo-labels for secondary bird classes not originally covered by BirdNET but present as primary labels in some files.

5. Converts the collected rows into a DataFrame (`new_df_two`).

6. Finally, concatenates `new_df` (from previous cell) and `new_df_two` to form `added_data` which aggregates all unmapped secondary label entries with assigned confidences.

This process is key for enriching the dataset with extended labels beyond BirdNET’s original classes, especially where those labels are primary in files but absent from BirdNET training.



In [14]:
secondary_existing=pd.read_csv(os.path.join(output_dir,'non_birdnet_confidences_from_softmax.csv'))
applicable_data_two=secondary_existing[secondary_existing['filename'].isin(filenames_mapping_needed)]
new_rows=[]
for ind, row in applicable_data_two.iterrows():
    fname=row['filename']
    start_time=row['start_time']
    end_time=row['end_time']
    confidence=1
    scientific_name=None
    for label in files_with_unmapped_labels_dict[fname]:
        primary_label=label
        new_rows.append({'filename':fname,
                          'start_time':start_time,
                          'end_time':end_time,
                          'confidence':confidence,
                          'scientific_name':scientific_name,
                          'primary_label':primary_label})


# After your loop completes and new_rows is populated, create DataFrame like this:
new_df_two = pd.DataFrame(new_rows)

# Now new_df contains all the new rows with the unmapped secondary labels assigned confidence=1
print(new_df_two.head())

added_data=pd.concat([new_df,new_df_two])

              filename  start_time  end_time  confidence scientific_name  \
0  126247/XC941297.ogg         0.0       5.0           1            None   
1  126247/XC941297.ogg         0.0       5.0           1            None   
2  126247/XC941297.ogg         0.0       5.0           1            None   
3  126247/XC941297.ogg         1.0       6.0           1            None   
4  126247/XC941297.ogg         1.0       6.0           1            None   

  primary_label  
0         65448  
1         22976  
2        476538  
3         65448  
4         22976  


### Explanation of cell 3:

This cell obtains actual confidence predictions for the unmapped secondary bird labels by running inference with individual binary classification models trained for each bird species.

1. It iterates through each audio file and its list of unmapped secondary labels (`files_with_unmapped_labels_dict`).

2. For each unmapped bird label:
   - Ensures the directory for that bird's model exists.
   - Loads the bird-specific trained binary classification model (`best_model_<primary_bird>.keras`).
   - Loads the matching label encoder for that bird.
   - Checks that the bird label is recognized by the label encoder.
   
3. Creates a sliding window inference generator for the specific audio file to generate mel spectrogram features chunked over time.

4. Runs the bird-specific model prediction on all chunks of the audio file.

5. Collects the output confidence probabilities (`prob`) for each time window corresponding to the bird label.

6. Appends the predictions (filename, start_time, primary label, and probability) to a consolidated results list.

7. Converts this result list into a DataFrame (`results_df_added_secondary`) containing the inferred confidence scores for the unmapped secondary labels based on their respective specialized models.

---

**Summary:**  
This approach uses per-bird binary classifiers to generate precise confidence scores for secondary bird labels that BirdNET’s primary model does not cover, enabling richer and more accurate multi-label predictions.


In [24]:
all_results = []
def ensure_bird_dir(drive_dir, primary_bird):
    bird_dir = os.path.join(drive_dir, primary_bird)
    if not os.path.exists(bird_dir):
        os.makedirs(bird_dir)
    return bird_dir
for filename, unmapped_labels in files_with_unmapped_labels_dict.items():
    for primary_bird in unmapped_labels:
        bird_dir = ensure_bird_dir(models_dir, primary_bird)
        model_save_path = os.path.join(bird_dir, f'best_model_{primary_bird}.keras')

        if not os.path.exists(model_save_path):
            print(f"Model for bird {primary_bird} not found at {model_save_path}, skipping.")
            continue

        # Load the model for this bird
        model = tf.keras.models.load_model(model_save_path)
        label_encoder = joblib.load(os.path.join(drive_dir, f'bird_label_encoder_{primary_bird}.joblib'))


        class_names = label_encoder.classes_
        if primary_bird not in class_names:
            print(f"Primary bird {primary_bird} not in label encoder classes. Skipping.")
            continue

        class_idx = np.where(class_names == primary_bird)[0]
        audio_dir=os.path.join(main_dir,'train_audio')
        # Create generator for this filename only (assumes SlidingWindowInferenceGenerator can accept specific filenames list)
        gen = SlidingWindowInferenceGenerator(
            [filename],
            sr=32000,
            chunk_duration=5.0,
            step_duration=1.0,
            n_mels=128,
            target_time_length_spectrogram=320,
            normalize_audio=True,
            background_flag=0,
            audio_dir=audio_dir
        )

        for mel_features, fn, start_times in gen.generate():
            preds = model.predict(mel_features)
            for i, start_time in enumerate(start_times):
                prob = float(preds[i, class_idx])
                all_results.append({
                    "filename": fn,
                    "start_time": start_time,
                    "primary_label": primary_bird,
                    "prob": prob
                })

# Convert to DataFrame
results_df_added_secondary = pd.DataFrame(all_results)
print(results_df_added_secondary.head())


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step


  prob = float(preds[i, class_idx])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m2s[0m 2s/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1s/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11s/step
              filename  start_time primary_label      prob
0  126247/XC941297.ogg         0.0         65448  0.639798
1  126247/XC941297.ogg         1.0         65448  0.825361
2  126247/XC941297.ogg         2.0         65448  0.840462
3  126247/XC941297.ogg         3.0         65448  0.771428
4  126247/XC941297.ogg         4.0         65448  0.787996


In [27]:
results_df_added_secondary[results_df_added_secondary['filename']=='butsal1/XC259767.ogg']

Unnamed: 0,filename,start_time,primary_label,prob
98,butsal1/XC259767.ogg,0.0,566513,0.328721
99,butsal1/XC259767.ogg,1.0,566513,0.373549
100,butsal1/XC259767.ogg,2.0,566513,0.589227
101,butsal1/XC259767.ogg,3.0,566513,0.722896
102,butsal1/XC259767.ogg,4.0,566513,0.833401
103,butsal1/XC259767.ogg,5.0,566513,0.934386
104,butsal1/XC259767.ogg,6.0,566513,0.776656
105,butsal1/XC259767.ogg,7.0,566513,0.773868
106,butsal1/XC259767.ogg,8.0,566513,0.455823
107,butsal1/XC259767.ogg,9.0,566513,0.153949


### **Apply the confidences to `addeed_data` and save to csv**

In [30]:
# Create a dictionary mapping keys: (filename, primary_label, start_time) to prob value
confidence_mapping = results_df_added_secondary.set_index(['filename', 'primary_label', 'start_time'])['prob'].to_dict()

# Example safe usage function to assign confidence from this dictionary safely:
def safe_get_confidence(row):
    key = (row['filename'], row['primary_label'], row['start_time'])
    if key in confidence_mapping:
        return confidence_mapping[key]
    else:
        print(f"Key not found: {key}")
        return 0  # or a default confidence

# Apply it on your data (e.g., added_data) to add the confidence column
added_data['confidence'] = added_data.apply(safe_get_confidence, axis=1)

def safe_get_confidence(row):
    key = (row['filename'], row['primary_label'], row['start_time'])
    if key in confidence_mapping:
        return confidence_mapping[key]
    else:
        print(f"Key not found: {key}")
        return np.nan  # or some default
added_data['confidence'] = added_data.apply(safe_get_confidence, axis=1)
added_data.to_csv(os.path.join(output_dir,'added_secondary_labels_non_birdnet.csv'),index=False)
added_data

Key not found: ('butsal1/XC259767.ogg', '566513', 30.0)
Key not found: ('butsal1/XC259767.ogg', '566513', 30.0)


Unnamed: 0,filename,start_time,end_time,confidence,scientific_name,primary_label
0,butsal1/XC259767.ogg,0.0,3.0,0.328721,,566513
1,butsal1/XC259767.ogg,3.0,6.0,0.722896,,566513
2,butsal1/XC259767.ogg,9.0,12.0,0.153949,,566513
3,butsal1/XC259767.ogg,12.0,15.0,0.184867,,566513
4,butsal1/XC259767.ogg,15.0,18.0,0.297065,,566513
...,...,...,...,...,...,...
83,65448/XC941294.ogg,0.0,5.0,0.798836,,22976
84,65448/XC941294.ogg,1.0,6.0,0.789860,,22976
85,65448/XC941294.ogg,2.0,7.0,0.561246,,22976
86,65448/XC941294.ogg,3.0,8.0,0.748031,,22976


# **Files Originally Discluded**

This section addresses the issue that the original sliding window inference code did not process audio files shorter than the sliding window length (5 seconds) properly, resulting in missing predictions for these shorter files.

- The variable `unused_files` identifies which audio files from the dataset have not been processed yet by comparing all files with those already used in `final_df`.

- The `SlidingWindowInferenceGenerator` class is a refined version of the original sliding window generator that properly handles:
  - Audio files shorter than the chunk duration by padding the audio to the required length and generating a single chunk starting at time 0.
  - Sliding windows over longer audio files with fixed steps and padding for any last partial chunks.
  - Normalization and mel spectrogram feature extraction consistent with training.

- The generator yields batches of mel spectrogram features along with the filename and start times for the sliding windows.

- The inference loop uses this generator to predict confidence scores for each chunk using the provided model and bird-specific label encoder.

- The predicted probabilities above a threshold (0.5) are recorded with corresponding metadata, including start and end times, confidence scores, and labels.

- The results are saved to a new CSV specifically covering these previously unprocessed shorter or missing files (`non_birdnet_confidences_from_softmax_short_files.csv`).

---

**In summary:**

This improved generator ensures that no audio files—especially those shorter than the usual window length—are excluded during inference, thereby filling gaps the original inference code missed and producing a complete set of confidence predictions for all files.


In [31]:
files_used=final_df['filename'].unique()
all_files=df[df['filename'].isin(filenames_to_process)]['filename'].unique()
unused_files=[f for f in all_files if f not in files_used]
print(f'Number of unused files {len(unused_files)}')

Number of unused files 195


In [35]:
class SlidingWindowInferenceGenerator:
    def __init__(
        self,
        file_list,
        sr=32000,
        chunk_duration=5.0,
        step_duration=1.0,
        n_mels=128,
        target_time_length_spectrogram=320,
        normalize_audio=True,
        background_flag=0,
        audio_dir='train_audio'
    ):
        self.file_list = file_list
        self.sr = sr
        self.chunk_duration = chunk_duration
        self.step_duration = step_duration
        self.n_mels = n_mels
        self.target_time_length_spectrogram = target_time_length_spectrogram
        self.normalize_audio = normalize_audio
        self.background_flag = background_flag
        self.audio_dir = audio_dir

    def _pad_or_truncate_audio(self, data, target_length):
        if data.shape[0] < target_length:
            padding = np.zeros((target_length - data.shape[0],), dtype=data.dtype)
            return np.concatenate((data, padding))
        elif data.shape[0] > target_length:
            return data[:target_length]
        else:
            return data

    def _normalize(self, audio):
        peak = np.abs(audio).max()
        if peak > 0:
            return audio / peak
        return audio

    def _pad_or_truncate(self, data, target_length):
        if data.shape[0] < target_length:
            padding = np.zeros((target_length - data.shape[0], data.shape[1]), dtype=data.dtype)
            return np.concatenate((data, padding), axis=0)
        elif data.shape[0] > target_length:
            return data[:target_length]
        else:
            return data

    def _extract_features(self, audio):
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=self.sr, n_mels=self.n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).T

        min_db = -80.0
        max_db = 0.0

        mel_spec_scaled = (mel_spec_db - min_db) / (max_db - min_db)
        mel_spec_scaled = np.clip(mel_spec_scaled, 0.0, 1.0)

        # Pad/truncate to fixed length and add a channel axis
        mel_spec_padded = self._pad_or_truncate(mel_spec_scaled, self.target_time_length_spectrogram)[:, :, np.newaxis]
        # Make 3-channel for EfficientNet
        mel_spec_padded_3_channel = np.repeat(mel_spec_padded, 3, axis=-1)
        return mel_spec_padded_3_channel

    def generate(self):
        for filename in self.file_list:
            try:
                audio, _ = librosa.load(os.path.join(self.audio_dir, filename), sr=self.sr)
            except Exception as e:
                print(f"Error loading {filename}: {e}. Skipping.")
                continue
            if self.normalize_audio:
                audio = self._normalize(audio)
            total_samples = len(audio)
            window_length = int(self.chunk_duration * self.sr)
            step_length = int(self.step_duration * self.sr)
            mel_features = []
            start_times = []
            if total_samples < window_length:
                # File is shorter than chunk length – pad and use single chunk at t=0
                padded_audio = self._pad_or_truncate_audio(audio, window_length)
                mel = self._extract_features(padded_audio)
                mel_features.append(mel)
                start_times.append(0.0)
            else:
                for start in range(0, total_samples - window_length + 1, step_length):
                    end = start + window_length
                    chunk = audio[start:end]
                    if len(chunk) < window_length:
                        chunk = self._pad_or_truncate_audio(chunk, window_length)
                    mel = self._extract_features(chunk)
                    mel_features.append(mel)
                    start_times.append(start / self.sr)
            mel_features = np.array(mel_features)
            if mel_features.shape[0] == 0:
                print(f"Skipping {filename}: no valid chunks")
                continue
            yield mel_features, filename, start_times

rerun = True
if rerun:
    file_list = unused_files
    audio_dir = os.path.join(main_dir,'train_audio')
    model_path = os.path.join(models_dir, 'best_model_by_val_loss_softmax.keras')
    label_encoder = joblib.load(os.path.join(supplemental_files_dir,'bird_label_encoder.joblib'))
    model = tf.keras.models.load_model(model_path)

    gen = SlidingWindowInferenceGenerator(
        file_list,
        sr=32000,
        chunk_duration=5.0,
        step_duration=1.0,
        n_mels=128,
        target_time_length_spectrogram=320,  # update to your setting
        normalize_audio=True,
        background_flag=0,
        audio_dir=audio_dir
    )

    top_n = 3
    results = []
    for mel_features, filename, start_times in gen.generate():
        preds = model.predict(mel_features)
        class_names = label_encoder.classes_
        primary_label = primary_label_mapping.get(filename, None)
        if primary_label not in class_names:
            print(f"{filename}: primary label '{primary_label}' not in class list. Skipping.")
            continue
        class_idx = np.where(class_names == primary_label)[0][0]
        for i, start_time in enumerate(start_times):
            prob = float(preds[i, class_idx])
            row = {
                "filename": filename,
                "start_time": start_time,
                "primary_label": primary_label,
                "prob": prob
            }
            results.append(row)

    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(output_dir,'sliding_window_preds_softmax_shorts.csv'), index=False)
else:
    results_df = pd.read_csv(os.path.join(csv_dir,'sliding_window_preds_softmax_shorts.csv'))
print(results_df.head())
to_join=df[['filename','isOneBird','secondary_labels']]
merged=pd.merge(results_df,to_join,on='filename')
merged['end_time']=merged['start_time']+5
merged['confidence']=merged['prob']
merged['scientific_name']=None
final_df=merged[['filename','start_time','end_time','confidence','scientific_name','primary_label']]
final_df=final_df[final_df['confidence']>0.5]
final_df.head(20)
final_df.to_csv(os.path.join(output_dir,'non_birdnet_confidences_from_softmax_short_files.csv'),index=False)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 348ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 505ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 910ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

Unnamed: 0,filename,start_time,end_time,confidence,scientific_name,primary_label
419,1564122/CSA34195.ogg,0.0,5.0,0.989044,,1564122
420,1564122/CSA34196.ogg,0.0,5.0,0.997572,,1564122
421,1564122/CSA34197.ogg,0.0,5.0,0.998753,,1564122
422,1564122/CSA34198.ogg,0.0,5.0,0.984496,,1564122
423,1564122/CSA34199.ogg,0.0,5.0,0.996664,,1564122
424,1564122/CSA34200.ogg,0.0,5.0,0.989604,,1564122
425,21116/iNat296867.ogg,0.0,5.0,0.998809,,21116
611,21211/XC896828.ogg,0.0,5.0,0.714082,,21211
612,21211/XC896860.ogg,0.0,5.0,0.959719,,21211
615,21211/XC913839.ogg,0.0,5.0,0.859291,,21211
