In [2]:
from sklearn.neighbors import NearestNeighbors as NN
import numpy as np
import pandas as pd
import glob, os, random

In [3]:
df_song_species = pd.read_csv('../preprocess_data/species_and_record_url.csv')
species = pd.read_csv('../preprocess_data/species_keys.csv')

In [4]:
emb_ids = []
avg_embs = []
bird_embs = glob.glob('../avg_bird_embeddings/*')
random.shuffle(bird_embs)
for avg_emb_path in bird_embs:
    emb_id = os.path.basename(avg_emb_path).replace('.npy','')
    emb_id = int(emb_id)

    # load d-vector
    avg_emb = np.load(avg_emb_path)

    emb_ids.append(emb_id)
    avg_embs.append(avg_emb)

avg_embs = np.array(avg_embs)

In [5]:
# fit nearest_neighbors
nn = NN(n_neighbors=6, metric='euclidean', n_jobs=-1)
nn.fit(avg_embs)
dists, inds = nn.kneighbors(avg_embs)

In [6]:
data_neighbors = []
count = 0
for neighbor_matches, dist_matches in zip(inds.tolist(), dists.tolist()):
    neighbor_row = []
    for n, dist in zip(range(0, len(neighbor_matches)), dist_matches):
        emb_id = emb_ids[neighbor_matches[n]]

        # get species of embedding
        species_id = df_song_species.loc[df_song_species['recording_id'] == emb_id, 'species_id'].values[0]
        species_name = species.loc[species['species_id'] == species_id, 'common_name'].values[0]
        neighbor_row.append(str(emb_id) + '.wav')
        neighbor_row.append(dist)
        neighbor_row.append(species_name)

    # append neighbors row to main data list
    data_neighbors.append(neighbor_row)

cols = [
    'Orignal_Recording', 'Original_Distance', 'Original_Common_Name',
    '1st_Neighbor_Recording', '1st_Neighbor_Distance', '1st_Neighbor_Common_Name',
    '2nd_Neighbor_Recording', '2nd_Neighbor_Distance', '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Recording', '3rd_Neighbor_Distance', '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Recording', '4th_Neighbor_Distance', '4th_Neighbor_Common_Name',
    '5th_Neighbor_Recording', '5th_Neighbor_Distance', '5th_Neighbor_Common_Name']

df_neighbors = pd.DataFrame(data_neighbors, columns=cols)
df_neighbors.head(20)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
0,32323.wav,0.0,Grey Elaenia,43289.wav,0.309872,Black Redstart,21584.wav,0.347843,Bridled White-eye,32307.wav,0.361734,Short-billed Canastero,54667.wav,0.364576,Grey-collared Becard,64913.wav,0.375718,Red-breasted Nuthatch
1,75536.wav,0.0,Yucatan Jay,43625.wav,0.402583,Common Redstart,43991.wav,0.464861,House Sparrow,1145.wav,0.482391,Clamorous Reed Warbler,33207.wav,0.509303,Grey-fronted Dove,75456.wav,0.510597,Blue-black Grassquit
2,64694.wav,0.0,Identity unknown,64961.wav,0.487772,Redwing,64832.wav,0.487772,Redwing,21946.wav,0.568243,White-throated Towhee,43573.wav,0.58808,Ring Ouzel,43856.wav,0.588646,Identity unknown
3,33404.wav,0.0,Wattled Jacana,33233.wav,0.242899,Blue-necked Jacamar,33405.wav,0.26478,Wattled Jacana,33078.wav,0.325468,Barred Owl,33280.wav,0.327791,Stripe-necked Tody-Tyrant,33210.wav,0.370849,Flame-crested Tanager
4,22500.wav,0.0,Common Cuckoo,22498.wav,0.432251,Common Cuckoo,22497.wav,0.552739,Common Cuckoo,54363.wav,0.588764,Pied-billed Grebe,32894.wav,0.642222,Common Cuckoo,11052.wav,0.673244,White-mantled Barbet
5,32665.wav,0.0,White-breasted Tapaculo,64461.wav,0.0,Green Hermit,64681.wav,0.0,Green Hermit,32995.wav,0.1303,Serra do Mar Tyrant-Manakin,43773.wav,0.441383,Rufous-bellied Thrush,32738.wav,0.524517,Temminck's Seedeater
6,75680.wav,0.0,Tawny Owl,75681.wav,0.383017,Tawny Owl,22490.wav,0.675619,Tawny Owl,43959.wav,0.684558,Tawny Owl,75878.wav,0.68946,European Storm Petrel,53813.wav,0.771804,Tropical Screech Owl
7,64803.wav,0.0,Identity unknown,22847.wav,0.345776,Yellow-browed Warbler,21889.wav,0.345776,Yellow-browed Warbler,818.wav,0.383954,Cabanis's Bunting,54265.wav,0.418559,Zitting Cisticola,33760.wav,0.418559,Zitting Cisticola
8,10955.wav,0.0,African Hill Babbler,306.wav,0.528227,Singing Cisticola,43450.wav,0.548254,Dusky-throated Antshrike,861.wav,0.558986,Black-winged Oriole,1018.wav,0.594777,African Hill Babbler,678.wav,0.596339,Grey Apalis
9,64585.wav,0.0,Golden-crowned Warbler,43686.wav,0.709176,Short-tailed Antthrush,72.wav,0.713998,Siberian Stonechat,64545.wav,0.716248,Meadow Bunting,75820.wav,0.742124,Blue-necked Jacamar,10907.wav,0.743617,White-crowned Shama


In [7]:
df_neighbors.sort_values(['1st_Neighbor_Distance'], ascending=[True]).head(20)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
1121,21878.wav,0.0,Dusky Warbler,22260.wav,0.0,Mistle Thrush,1164.wav,0.269473,White-eyed Vireo,54145.wav,0.494513,Surf Scoter,43062.wav,0.579161,Soundscape,10786.wav,0.586785,Fernandina's Flicker
2599,32327.wav,0.0,Thorn-tailed Rayadito,32328.wav,0.0,Thorn-tailed Rayadito,32302.wav,0.187475,Puna Canastero,54151.wav,0.222607,Yellow-bridled Finch,43988.wav,0.223795,Greenish Elaenia,32375.wav,0.255573,Scale-throated Earthcreeper
402,33284.wav,0.0,Rufous-breasted Hermit,33034.wav,0.0,Plain Pigeon,54210.wav,0.514941,Western Barn Owl,1003.wav,0.514941,Yellow-browed Shrike-Vireo,64767.wav,0.514941,Buffy Tuftedcheek,11439.wav,0.518499,Grey-hooded Attila
2099,22052.wav,0.0,Greenish Elaenia,11791.wav,0.0,Greenish Elaenia,33016.wav,0.420234,Black-tailed Gnatcatcher,75918.wav,0.491456,Sardinian Warbler,1179.wav,0.492126,House Crow,32426.wav,0.507906,Yellow-fronted Woodpecker
2600,21711.wav,0.0,Klages's Antbird,1106.wav,0.0,Black-chested Fruiteater,1109.wav,0.0,Black-chested Fruiteater,21727.wav,0.240314,Orange-bellied Euphonia,21691.wav,0.396717,Guttulate Foliage-gleaner,21766.wav,0.437228,Sunbittern
2605,64883.wav,0.0,Identity unknown,64884.wav,0.0,Identity unknown,64581.wav,0.0,Identity unknown,11715.wav,0.275366,Many-colored Rush Tyrant,64530.wav,0.328848,Solitary Cacique,75623.wav,0.389817,Chukar Partridge
397,33278.wav,0.0,White-bellied Hummingbird,22246.wav,0.0,Tropical Gnatcatcher,11206.wav,0.0,Tropical Gnatcatcher,21720.wav,0.366909,Yellow-bellied Elaenia,22268.wav,0.469032,Southern Lapwing,21742.wav,0.4787,Tropical Screech Owl
1047,43854.wav,0.0,Identity unknown,43741.wav,0.0,Identity unknown,43871.wav,0.489304,Ashy Woodswallow,43740.wav,0.498447,Eurasian Hoopoe,43983.wav,0.508817,Identity unknown,43501.wav,0.585495,Identity unknown
2087,43250.wav,0.0,Great Spotted Woodpecker,64745.wav,0.0,Great Spotted Woodpecker,1039.wav,0.516036,Sardinian Warbler,44003.wav,0.53958,Common Nightingale,32406.wav,0.58406,Robust Woodpecker,33015.wav,0.587453,Black-tailed Gnatcatcher
394,1103.wav,0.0,Black-chested Fruiteater,12097.wav,0.0,Brown-headed Cowbird,1108.wav,0.0,Black-chested Fruiteater,1102.wav,0.0,Black-chested Fruiteater,1084.wav,0.462116,White-browed Hemispingus,11650.wav,0.534631,Urrao Antpitta


In [19]:
orig_and_first = df_neighbors[[
    'Original_Common_Name', 
    '1st_Neighbor_Common_Name', 
    '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Common_Name',
    '5th_Neighbor_Common_Name'
]].values

num_matches = np.where(
    (orig_and_first[:, 0] == orig_and_first[:, 1]) 
    | (orig_and_first[:, 0] == orig_and_first[:, 2])
    | (orig_and_first[:, 0] == orig_and_first[:, 3])
    | (orig_and_first[:, 0] == orig_and_first[:, 4])
    | (orig_and_first[:, 0] == orig_and_first[:, 5])
)
top_5_accuracy = num_matches[0].shape[0] / orig_and_first.shape[0]
top_5_accuracy

0.3790893760539629