In [21]:
from sklearn.neighbors import NearestNeighbors as NN
import numpy as np
import pandas as pd
import glob, os, random

In [22]:
# Species lookup table; a later cell reads its 'species_id' and
# 'common_name' columns to turn a species id into a display name.
species = pd.read_csv('../preprocess_data/species_keys.csv')
# Per-recording metadata; a later cell reads its 'recording_id' and
# 'species_id' columns to find the species of each embedding.
df_song_species = pd.read_csv('../preprocess_data/species_and_record_url.csv')

In [23]:
# Load every averaged embedding (d-vector) from disk and record which
# recording id each row belongs to. After this cell:
#   emb_ids  - list of int recording ids, one per embedding file
#   avg_embs - numpy array with the embeddings stacked along axis 0,
#              row i belonging to emb_ids[i]

# Fixed seed so the shuffled row order is the same on every run. The order
# matters downstream: it decides DataFrame row positions and how the
# nearest-neighbour search breaks ties between equidistant points.
SHUFFLE_SEED = 0

# Match only .npy files so stray files in the directory cannot break int()
# or np.load. glob order is filesystem-dependent, so sort before shuffling,
# and shuffle with a private RNG to leave the global `random` state alone.
bird_embs = sorted(glob.glob('../avg_bird_embeddings/*.npy'))
if not bird_embs:
    raise FileNotFoundError(
        "No .npy embedding files found in '../avg_bird_embeddings/'")
random.Random(SHUFFLE_SEED).shuffle(bird_embs)

emb_ids = []
avg_embs = []
for avg_emb_path in bird_embs:
    # The file stem is the integer recording id, e.g. '.../22201.npy' -> 22201.
    # splitext strips only the extension, unlike str.replace('.npy', '').
    stem, _ext = os.path.splitext(os.path.basename(avg_emb_path))
    emb_ids.append(int(stem))

    # load d-vector
    avg_embs.append(np.load(avg_emb_path))

# np.stack raises if the embeddings do not all share one shape, instead of
# quietly producing a ragged/object array.
avg_embs = np.stack(avg_embs)

In [24]:
# Build a Euclidean nearest-neighbour index over all averaged embeddings and
# query it with those same embeddings. Six matches are requested per row,
# which lines up with the 'Original' + five neighbour column groups that the
# results table uses. n_jobs=-1 asks scikit-learn to use all available cores.
nn = NN(metric='euclidean', n_jobs=-1, n_neighbors=6).fit(avg_embs)
dists, inds = nn.kneighbors(avg_embs)

In [25]:
# Build one table row per query recording: the recording itself followed by
# its nearest neighbours, each described by three values
# (wav file name, Euclidean distance, species common name).

# Resolve recording_id -> species_id -> common_name through dictionaries that
# are built once, rather than boolean-mask scanning both DataFrames for every
# single neighbour. drop_duplicates keeps the first occurrence of each key,
# which is the same row the previous `.values[0]` lookup selected.
species_id_by_recording = (
    df_song_species.drop_duplicates(subset='recording_id')
    .set_index('recording_id')['species_id']
    .to_dict()
)
common_name_by_species = (
    species.drop_duplicates(subset='species_id')
    .set_index('species_id')['common_name']
    .to_dict()
)

data_neighbors = []
for query_idx, (neighbor_matches, dist_matches) in enumerate(
        zip(inds.tolist(), dists.tolist())):
    matches = list(zip(neighbor_matches, dist_matches))

    # Row `query_idx` of inds/dists belongs to embedding `query_idx`, because
    # the index was queried with the very array it was fitted on. When several
    # recordings have identical embeddings (distance 0.0), the search may list
    # a duplicate before the query itself, or leave the query out entirely.
    # Pin the query to the 'Original' slot and keep the remaining matches in
    # their distance order so the 'Original' columns always describe the
    # recording being queried.
    own_dist = next((dist for idx, dist in matches if idx == query_idx), 0.0)
    others = [(idx, dist) for idx, dist in matches if idx != query_idx]
    # If the query was missing from the matches, drop the farthest one so the
    # row keeps its fixed width.
    ordered = [(query_idx, own_dist)] + others[:len(matches) - 1]

    neighbor_row = []
    for emb_idx, dist in ordered:
        emb_id = emb_ids[emb_idx]

        # get species of embedding
        species_name = common_name_by_species[species_id_by_recording[emb_id]]
        neighbor_row.extend([str(emb_id) + '.wav', dist, species_name])

    # append neighbors row to main data list
    data_neighbors.append(neighbor_row)

# NOTE: 'Orignal_Recording' is misspelled, but it is kept as-is because the
# column name is part of this DataFrame's schema and other code may rely on it.
cols = [
    'Orignal_Recording', 'Original_Distance', 'Original_Common_Name',
    '1st_Neighbor_Recording', '1st_Neighbor_Distance', '1st_Neighbor_Common_Name',
    '2nd_Neighbor_Recording', '2nd_Neighbor_Distance', '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Recording', '3rd_Neighbor_Distance', '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Recording', '4th_Neighbor_Distance', '4th_Neighbor_Common_Name',
    '5th_Neighbor_Recording', '5th_Neighbor_Distance', '5th_Neighbor_Common_Name']

df_neighbors = pd.DataFrame(data_neighbors, columns=cols)
df_neighbors.head(20)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
0,22201.wav,0.0,Violet-tailed Sylph,22200.wav,0.0,Violet-tailed Sylph,64658.wav,0.467946,Archbold's Bowerbird,43077.wav,0.525612,Hairy-crested Antbird,21635.wav,0.58108,Screaming Piha,1305.wav,0.59073,Manu Antbird
1,32350.wav,0.0,Firewood-gatherer,32376.wav,0.284788,Scale-throated Earthcreeper,32351.wav,0.380165,Firewood-gatherer,32306.wav,0.417633,Sharp-billed Canastero,33124.wav,0.436544,Shiny Cowbird,32692.wav,0.43941,American Golden Plover
2,43778.wav,0.0,Greater Hoopoe-Lark,22441.wav,0.0,Eurasian Woodcock,22440.wav,0.0,Eurasian Woodcock,53981.wav,0.313688,Snow Goose,54364.wav,0.419102,Pied-billed Grebe,54365.wav,0.47589,Pied-billed Grebe
3,96.wav,0.0,Yellow-spotted Barbet,554.wav,0.425935,Little Grey Greenbul,398.wav,0.49187,Piping Hornbill,93.wav,0.495489,Yellow-spotted Barbet,94.wav,0.514777,Yellow-spotted Barbet,738.wav,0.523464,Eastern Bearded Greenbul
4,1351.wav,0.0,Pink Robin,1352.wav,0.332617,Pink Robin,22532.wav,0.358297,Eurasian Bullfinch,1245.wav,0.35962,Tasmanian Scrubwren,1247.wav,0.456341,Tasmanian Nativehen,1383.wav,0.557349,Pallid Cuckoo
5,54617.wav,0.0,Southern Hyliota,11204.wav,0.471901,Tropical Gnatcatcher,11885.wav,0.472882,Large Elaenia,21540.wav,0.474263,Brown-throated Parakeet,21490.wav,0.482784,Crescent-chested Puffbird,11708.wav,0.488202,Rusty-fronted Tody-Flycatcher
6,75701.wav,0.0,Grey Apalis,65024.wav,0.296527,Red-faced Crombec,75702.wav,0.318354,Grey Apalis,624.wav,0.536302,Eurasian Blackcap,22063.wav,0.571737,Pale-legged Warbler,43886.wav,0.574953,Pine Warbler
7,12009.wav,0.0,Red-throated Caracara,12007.wav,0.482483,Red-throated Caracara,22712.wav,0.542972,Red-throated Caracara,53664.wav,0.62091,Red-throated Caracara,33194.wav,0.649638,Red-throated Caracara,65091.wav,0.698926,Variable Oriole
8,32864.wav,0.0,Willis's Antbird,22719.wav,0.30439,Chestnut-crowned Foliage-gleaner,22722.wav,0.349674,Short-tailed Parrot,22234.wav,0.444711,Brazilian Tanager,32917.wav,0.539753,Bright-rumped Yellow Finch,22745.wav,0.5801,Harris's Hawk
9,43266.wav,0.0,Identity unknown,65168.wav,0.0,Identity unknown,53821.wav,0.583515,Ladder-backed Woodpecker,64802.wav,0.598267,Blackcap Babbler,53834.wav,0.615108,Common Yellowthroat,53933.wav,0.632619,Western Grebe


In [27]:
# Show the 20 rows whose first-listed neighbour is closest (smallest
# '1st_Neighbor_Distance' first); a distance of 0.0 means that neighbour's
# embedding is identical to the query's.
closest_first = df_neighbors.sort_values(by='1st_Neighbor_Distance', ascending=True)
closest_first.head(20)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
0,22201.wav,0.0,Violet-tailed Sylph,22200.wav,0.0,Violet-tailed Sylph,64658.wav,0.467946,Archbold's Bowerbird,43077.wav,0.525612,Hairy-crested Antbird,21635.wav,0.58108,Screaming Piha,1305.wav,0.59073,Manu Antbird
1036,64544.wav,0.0,Meadow Bunting,64403.wav,0.0,Meadow Bunting,64887.wav,0.597758,Red-throated Caracara,43098.wav,0.75383,Black-headed Antthrush,54448.wav,0.753871,Stripe-tailed Hummingbird,43206.wav,0.76915,Black-headed Antthrush
1046,22571.wav,0.0,Pileated Woodpecker,22570.wav,0.0,Pileated Woodpecker,22569.wav,0.533706,Red-cockaded Woodpecker,21889.wav,0.556919,Yellow-browed Warbler,22847.wav,0.556919,Yellow-browed Warbler,43890.wav,0.576134,Golden Babbler
1051,1097.wav,0.0,Barred Fruiteater,65251.wav,0.0,Slaty-capped Shrike-Vireo,1098.wav,0.0,Barred Fruiteater,21898.wav,0.394306,Boucard's Wren,53779.wav,0.395833,Horned Lark,54400.wav,0.447997,Bewick's Wren
1056,64745.wav,0.0,Great Spotted Woodpecker,43250.wav,0.0,Great Spotted Woodpecker,1039.wav,0.516036,Sardinian Warbler,44003.wav,0.53958,Common Nightingale,32406.wav,0.58406,Robust Woodpecker,33015.wav,0.587453,Black-tailed Gnatcatcher
1060,75671.wav,0.0,Orange-billed Sparrow,76088.wav,0.0,Orange-billed Sparrow,32309.wav,0.424729,Short-billed Canastero,32482.wav,0.486719,White-bellied Hummingbird,33279.wav,0.486719,White-bellied Hummingbird,43721.wav,0.491272,Identity unknown
1061,64626.wav,0.0,Goldcrest,64362.wav,0.0,Goldcrest,11849.wav,0.294859,Brown-breasted Bamboo Tyrant,33696.wav,0.431178,Orange-backed Troupial,33646.wav,0.453343,Black-throated Antbird,12149.wav,0.479853,Plain-crested Elaenia
1063,43029.wav,0.0,Identity unknown,64399.wav,0.0,Identity unknown,33609.wav,0.45361,Inca Wren,22075.wav,0.61702,Spanish Sparrow,22074.wav,0.633681,Spanish Sparrow,33649.wav,0.639623,Streaked Saltator
1079,10829.wav,0.0,Eurasian Bullfinch,22248.wav,0.0,Eurasian Bullfinch,43056.wav,0.408742,Redwing,64655.wav,0.537294,Great Spotted Woodpecker,10835.wav,0.537294,Great Spotted Woodpecker,43017.wav,0.565299,Boreal Owl
1081,64572.wav,0.0,Guianan Puffbird,53649.wav,0.0,Black-billed Wood Dove,75891.wav,0.235705,Black-headed Oriole,43010.wav,0.396025,Identity unknown,33687.wav,0.463837,Temminck's Stint,53755.wav,0.487577,Whooper Swan
