# Analyse NZ Bird Dataset 
This script analyses the 'recordings_data.csv' metadata file created when downloading the files. 

Bear in mind that this analyses all of the recordings listed in the metadata file.

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the recordings metadata (see the notebook intro: the CSV is created when
# the recordings are downloaded).
# The first CSV column appears to be the row index that pandas wrote out with the
# data; read with the defaults it comes back as a stray "Unnamed: 0" data column.
# index_col=0 restores it as the frame's index instead.
file_name = "logs/recordings_data.csv"
df = pd.read_csv(file_name, index_col=0)
df.head(15)

Unnamed: 0,id,generic_name,scientific_name,english_name,sex,file_url,length,license,location,bird_type,also,smp,seen,regnr,file,source
0,668542,Ninox,novaeseelandiae,Morepork,uncertain,https://xeno-canto.org/668542/download,37,//creativecommons.org/licenses/by-nc-sa/4.0/,"Whangārei_Heads,_Whangarei_District,_Northland",song,[],48000,no,,https://xeno-canto.org/668542/download,Xeno-Canto
1,638197,Ninox,novaeseelandiae,Morepork,uncertain,https://xeno-canto.org/638197/download,48,//creativecommons.org/licenses/by-nc-sa/4.0/,"Te_Tii,_Far_North_District,_Northland",song,[],48000,unknown,,https://xeno-canto.org/638197/download,Xeno-Canto
2,602419,Ninox,novaeseelandiae,Morepork,uncertain,https://xeno-canto.org/602419/download,20,//creativecommons.org/licenses/by/4.0/,"Marlborough_District,_Marlborough",song,[],44100,no,,https://xeno-canto.org/602419/download,Xeno-Canto
3,405536,Ninox,novaeseelandiae,Morepork,,https://xeno-canto.org/405536/download,20,//creativecommons.org/licenses/by-nc-sa/4.0/,"Charleston,_Buller_District,_West_Coast",song,[],48000,no,,https://xeno-canto.org/405536/download,Xeno-Canto
4,404843,Ninox,novaeseelandiae,Morepork,male,https://xeno-canto.org/404843/download,77,//creativecommons.org/licenses/by-nc-sa/4.0/,"Paringa,_Westland_District,_West_Coast",song,[],48000,no,,https://xeno-canto.org/404843/download,Xeno-Canto
5,354393,Ninox,novaeseelandiae,Morepork,,https://xeno-canto.org/354393/download,57,//creativecommons.org/licenses/by-nc-nd/4.0/,"Mount_Cargill,_Dunedin_City,_Otago",song,[],48000,no,,https://xeno-canto.org/354393/download,Xeno-Canto
6,114338,Ninox,novaeseelandiae,Morepork,,https://xeno-canto.org/114338/download,38,//creativecommons.org/licenses/by-nc-sa/3.0/,"Bullock_Creek_Road,_Paparoa_National_Park,_Bul...",song,[],44100,yes,,https://xeno-canto.org/114338/download,Xeno-Canto
7,33770,Ninox,novaeseelandiae,Morepork,,https://xeno-canto.org/33770/download,29,//creativecommons.org/licenses/by-nc-sa/3.0/,"Pakihi,_Okarito",song,[],44100,unknown,,https://xeno-canto.org/33770/download,Xeno-Canto
8,345370,Ninox,novaeseelandiae,Morepork,,https://xeno-canto.org/345370/download,92,//creativecommons.org/licenses/by-nc-sa/4.0/,"Barrytown,_Buller_District,_West_Coast",song,[],44100,yes,,https://xeno-canto.org/345370/download,Xeno-Canto
9,206471,Ninox,novaeseelandiae,Morepork,,https://xeno-canto.org/206471/download,13,//creativecommons.org/licenses/by-nc-sa/4.0/,"Bullock_Creek_Road,_Paparoa_National_Park,_Bul...",song,[],48000,no,,https://xeno-canto.org/206471/download,Xeno-Canto


In [3]:
# Pull the name and licence columns out of the metadata frame as Series
generic_names, scientific_names, english_names, licenses = (
    df[column]
    for column in ("generic_name", "scientific_name", "english_name", "license")
)

In [4]:
# Scientific names that appear in more than 20 rows of the metadata
scientific_name_counts = scientific_names.value_counts()
is_frequent = scientific_name_counts.gt(20)
scientific_name_counts.loc[is_frequent]

scientific_name
novaeseelandiae    222
melanura            78
albicilla           44
meridionalis        41
macrocephala        39
rufusater           33
lateralis           30
longipes            26
Name: count, dtype: int64

In [5]:
# Distinct English names, in order of first appearance in the metadata
unique_english_names = pd.unique(df["english_name"])
unique_english_names

array(['Morepork', 'Kaka', 'Tui', 'Bellbird', 'Saddleback', 'Whitehead',
       'Tomtit', 'Robin', 'Silvereye'], dtype=object)

In [6]:
# Number of rows per licence URL, most common first
license_counts = licenses.value_counts()
print(license_counts)

license
//creativecommons.org/licenses/by-nc-sa/4.0/    478
//creativecommons.org/licenses/by-nc-sa/3.0/     34
//creativecommons.org/licenses/by-nc-nd/3.0/      7
//creativecommons.org/licenses/by-nc-nd/2.5/      7
//creativecommons.org/licenses/by-nc-nd/4.0/      5
//creativecommons.org/licenses/by/4.0/            4
//creativecommons.org/licenses/by-nc/4.0/         1
Name: count, dtype: int64


In [7]:
# Number of rows per value of the sample-rate ('smp') column, most common first
smp_column = df["smp"]
sample_rates = smp_column.value_counts()
print(sample_rates)

smp
44100    365
48000    169
22050      1
8000       1
Name: count, dtype: int64


In [8]:
# Count the .wav segments stored under each class subfolder (segments/{class}/),
# descending into nested folders.
folder = "segments/"


def count_wav_files(directory):
    """Return how many files under `directory` (recursively) are .wav files.

    The extension is matched case-insensitively, so "clip.WAV" is counted
    alongside "clip.wav".
    """
    return sum(
        1
        for _root, _dirs, file_names in os.walk(directory)
        for file_name_ in file_names
        if file_name_.lower().endswith(".wav")
    )


def count_segments_per_class(segments_dir):
    """Map each class subfolder of `segments_dir` to its .wav file count.

    Only direct subfolders are treated as classes; entries whose name starts
    with '.' or '__' (hidden/system folders) are skipped, as are plain files.
    Returns an empty dict when `segments_dir` is not an existing directory.
    """
    if not os.path.isdir(segments_dir):
        return {}

    counts = {}
    for name in os.listdir(segments_dir):
        if name.startswith((".", "__")):
            continue
        class_folder = os.path.join(segments_dir, name)
        if os.path.isdir(class_folder):
            counts[name] = count_wav_files(class_folder)
    return counts


class_segment_counts = count_segments_per_class(folder)
# Class names are the subfolder names that were actually counted
classes = list(class_segment_counts)

# Print largest class first; equal counts are ordered by class name so the
# listing does not depend on the order os.listdir happens to return.
for class_name, count in sorted(class_segment_counts.items(), key=lambda item: (-item[1], item[0])):
    print(f"Class: {class_name}, Segment Count: {count}")
overall = sum(class_segment_counts.values())
print(f"Overall Segment Count: {overall}")

Class: morepork, Segment Count: 36671
Class: tomtit, Segment Count: 36333
Class: bellbird, Segment Count: 34097
Class: silvereye, Segment Count: 28378
Class: tui, Segment Count: 20156
Class: greywarbler, Segment Count: 16889
Class: kiwi, Segment Count: 7752
Class: robin, Segment Count: 5668
Class: kea, Segment Count: 2877
Class: whitehead, Segment Count: 1926
Class: kokako, Segment Count: 1168
Class: fantail, Segment Count: 1158
Class: kaka, Segment Count: 1149
Class: parakeet, Segment Count: 1099
Class: pukeko, Segment Count: 1080
Class: saddleback, Segment Count: 944
Class: stitchbird, Segment Count: 570
Class: yellowhead, Segment Count: 567
Class: kingfisher, Segment Count: 235
Class: kereru, Segment Count: 201
Overall Segment Count: 198918


In [9]:
# Mean number of segments per class; falls back to 0 when no classes were counted
if class_segment_counts:
    average_segment_count = sum(class_segment_counts.values()) / len(class_segment_counts)
else:
    average_segment_count = 0
print(f"Average Segment Count: {average_segment_count}")

Average Segment Count: 9945.9


In [10]:
# Count the number of files directly inside downloads/ (subfolders are not descended into)
folder = 'downloads/'


def count_files(directory):
    """Return the number of regular files directly inside `directory`.

    Sub-directories are neither counted nor descended into. A directory that
    does not exist is reported as 0 files instead of raising, matching how the
    segment-count cell treats a missing segments/ folder.
    """
    if not os.path.isdir(directory):
        return 0
    # scandir yields entries lazily, so no intermediate list of every file
    # name is built just to take its length.
    with os.scandir(directory) as entries:
        return sum(1 for entry in entries if entry.is_file())


num_files = count_files(folder)
print(f"Number of files in '{folder}': {num_files}")

Number of files in 'downloads/': 195597
