In [57]:
from sklearn.neighbors import NearestNeighbors as NN
from sklearn.neighbors import NearestCentroid as NC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from xgboost import XGBClassifier as XGBC
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import pandas as pd
import glob, os, random

In [58]:
# Read the two lookup tables used throughout the notebook:
#   df_song_species - queried below by 'recording_id' to obtain 'species_id'
#   species         - queried below by 'species_id' to obtain 'common_name'
df_song_species, species = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../preprocess_data/species_and_record_url.csv',
        '../preprocess_data/species_keys.csv',
    )
)

In [69]:
# Load every saved embedding file together with the species id of its
# recording. Produces three parallel sequences:
#   emb_ids  - recording ids parsed from the file names
#   avg_embs - array holding the loaded embeddings
#   labels   - array of species ids, aligned with emb_ids / avg_embs

# Fixed seed so the (shuffled) sample order is identical on every run.
SHUFFLE_SEED = 0

# Build the recording -> species lookup once instead of scanning the whole
# table for every file. drop_duplicates keeps the first row per recording,
# which is the row the previous `.values[0]` lookup selected.
species_by_recording = (
    df_song_species
    .drop_duplicates(subset='recording_id')
    .set_index('recording_id')['species_id']
    .to_dict()
)

emb_ids = []
avg_embs = []
labels = []

# glob order depends on the filesystem, so sort before the seeded shuffle;
# a private Random instance avoids touching the global random state.
bird_embs = sorted(glob.glob('../avg_bird_embeddings/*'))
random.Random(SHUFFLE_SEED).shuffle(bird_embs)

for avg_emb_path in bird_embs:
    # file name is "<recording_id>.npy"
    emb_id = int(os.path.basename(avg_emb_path).replace('.npy', ''))

    # fail with an explicit message rather than an opaque IndexError
    if emb_id not in species_by_recording:
        raise KeyError(
            'recording_id %d (%s) has no row in df_song_species'
            % (emb_id, avg_emb_path)
        )

    emb_ids.append(emb_id)
    avg_embs.append(np.load(avg_emb_path))  # load d-vector
    labels.append(species_by_recording[emb_id])

avg_embs = np.array(avg_embs)
labels = np.array(labels)
print('Number of Unique Species:', np.unique(labels).shape[0])
print('Samps per Species:', Counter(labels))

Number of Unique Species: 889
Samps per Species: Counter({4017: 141, 7439: 59, 3376: 39, 2833: 30, 3981: 25, 3533: 18, 7241: 17, 6615: 16, 556: 16, 2872: 16, 9191: 16, 6940: 16, 6392: 13, 7343: 13, 4591: 12, 9336: 12, 3596: 12, 1111: 12, 7948: 12, 7156: 12, 3714: 12, 9061: 11, 3372: 11, 8229: 11, 7479: 11, 2711: 11, 4962: 11, 64: 10, 2256: 10, 7248: 10, 1164: 10, 9249: 10, 1501: 10, 9270: 10, 8545: 9, 7953: 9, 8793: 9, 2294: 9, 4882: 9, 6556: 9, 8235: 9, 7669: 9, 3524: 9, 7357: 9, 7373: 9, 1183: 9, 8228: 9, 3628: 9, 2284: 9, 3194: 9, 3526: 9, 786: 8, 8129: 8, 9073: 8, 3845: 8, 735: 8, 1171: 8, 6358: 8, 4277: 8, 2835: 8, 6189: 8, 7598: 8, 1737: 8, 4476: 8, 7834: 8, 4482: 8, 3319: 8, 7920: 7, 1097: 7, 3544: 7, 3327: 7, 1210: 7, 575: 7, 3120: 7, 1575: 7, 6008: 7, 9349: 7, 5829: 7, 6280: 7, 6323: 7, 6057: 7, 5936: 7, 6425: 7, 7615: 7, 4380: 7, 8410: 7, 5207: 7, 8494: 7, 6831: 7, 8619: 6, 9201: 6, 530: 6, 7433: 6, 2117: 6, 8045: 6, 1010: 6, 9364: 6, 6822: 6, 1311: 6, 1588: 6, 6751: 6, 2071:

In [60]:
# Fit a nearest-neighbour index on the embeddings and find, for every
# recording, its N_NEIGHBORS closest *other* recordings.
#
# Querying the index with the training matrix itself (kneighbors(avg_embs))
# does not guarantee that a point comes back as its own first neighbour: when
# several recordings have identical embeddings (distance 0.0) any of them may
# be listed first, so column 0 could name a different recording and the point
# itself could be counted as one of its own neighbours. Calling kneighbors()
# without X excludes each indexed point from its own result, so the "self"
# column is prepended explicitly afterwards.
#
# Result (same layout as before):
#   inds[:, 0]  - index of the query recording itself
#   inds[:, 1:] - indices of its N_NEIGHBORS nearest other recordings
#   dists       - matching euclidean distances (column 0 is always 0.0)
N_NEIGHBORS = 5

nn = NN(n_neighbors=N_NEIGHBORS + 1, metric='euclidean', n_jobs=-1)
nn.fit(avg_embs)
neighbor_dists, neighbor_inds = nn.kneighbors(n_neighbors=N_NEIGHBORS)

self_inds = np.arange(
    neighbor_inds.shape[0], dtype=neighbor_inds.dtype
).reshape(-1, 1)
self_dists = np.zeros((neighbor_dists.shape[0], 1), dtype=neighbor_dists.dtype)

inds = np.hstack([self_inds, neighbor_inds])
dists = np.hstack([self_dists, neighbor_dists])

In [61]:
# Build one table row per recording: for the entry in column 0 of `inds` and
# for each following neighbour, store the wav file name, the distance and the
# species common name (three columns per entry).

# species_id -> common_name, built once. drop_duplicates keeps the first row
# per species, the same row a `.values[0]` lookup on the table would return.
common_name_by_species = (
    species
    .drop_duplicates(subset='species_id')
    .set_index('species_id')['common_name']
    .to_dict()
)

data_neighbors = []
for neighbor_matches, dist_matches in zip(inds.tolist(), dists.tolist()):
    neighbor_row = []
    for neighbor_idx, dist in zip(neighbor_matches, dist_matches):
        emb_id = emb_ids[neighbor_idx]

        # `labels` is aligned with `emb_ids`, so the species id is already
        # known and no scan of df_song_species is needed.
        species_name = common_name_by_species[labels[neighbor_idx]]

        neighbor_row.extend([str(emb_id) + '.wav', dist, species_name])

    # append neighbors row to main data list
    data_neighbors.append(neighbor_row)

# NOTE: 'Orignal_Recording' is misspelled; the name is kept unchanged so that
# any code referring to this column keeps working.
cols = [
    'Orignal_Recording', 'Original_Distance', 'Original_Common_Name',
    '1st_Neighbor_Recording', '1st_Neighbor_Distance', '1st_Neighbor_Common_Name',
    '2nd_Neighbor_Recording', '2nd_Neighbor_Distance', '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Recording', '3rd_Neighbor_Distance', '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Recording', '4th_Neighbor_Distance', '4th_Neighbor_Common_Name',
    '5th_Neighbor_Recording', '5th_Neighbor_Distance', '5th_Neighbor_Common_Name']

# The column list is written for exactly six entries per row.
assert len(cols) == 3 * inds.shape[1], 'cols does not match the number of neighbours'

df_neighbors = pd.DataFrame(data_neighbors, columns=cols)
df_neighbors.head(10)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
0,22498.wav,0.0,Common Cuckoo,22497.wav,0.497725,Common Cuckoo,32894.wav,0.504404,Common Cuckoo,22500.wav,0.524576,Common Cuckoo,21577.wav,0.6774,Eurasian Bullfinch,43970.wav,0.6774,Eurasian Bullfinch
1,22569.wav,0.0,Red-cockaded Woodpecker,11482.wav,0.322953,White-bibbed Manakin,11481.wav,0.322953,White-bibbed Manakin,11480.wav,0.322953,White-bibbed Manakin,12048.wav,0.322953,White-tailed Nuthatch,22570.wav,0.498126,Pileated Woodpecker
2,53635.wav,0.0,Rusty-fronted Tody-Flycatcher,1283.wav,0.483253,Spangled Drongo,21533.wav,0.492693,Rio Branco Antbird,1282.wav,0.500347,Spangled Drongo,33556.wav,0.543564,Speckled Chachalaca,54554.wav,0.649211,Yellow-crested Woodpecker
3,53820.wav,0.0,White-eyed Vireo,53787.wav,0.316916,Caspian Tern,53786.wav,0.40688,Caspian Tern,53834.wav,0.455478,Common Yellowthroat,53812.wav,0.460618,Northern Mockingbird,22232.wav,0.460618,Arctic Warbler
4,398.wav,0.0,Piping Hornbill,738.wav,0.226663,Eastern Bearded Greenbul,75867.wav,0.226663,Fire-breasted Flowerpecker,94.wav,0.233958,Yellow-spotted Barbet,93.wav,0.264513,Yellow-spotted Barbet,95.wav,0.308983,Yellow-spotted Barbet
5,64959.wav,0.0,House Sparrow,64961.wav,0.478789,Redwing,64832.wav,0.478789,Redwing,43401.wav,0.651307,Black-winged Stilt,75091.wav,0.651307,Black-winged Stilt,43692.wav,0.670663,House Sparrow
6,21892.wav,0.0,Tropical Screech Owl,43205.wav,0.434623,Tropical Screech Owl,32550.wav,0.480303,Tropical Screech Owl,32548.wav,0.480804,Tropical Screech Owl,32547.wav,0.521826,Tropical Screech Owl,76022.wav,0.556843,Tropical Screech Owl
7,33143.wav,0.0,European Greenfinch,625.wav,0.445147,Johanna's Sunbird,64624.wav,0.594906,Dusky Warbler,12198.wav,0.610507,Dusky Warbler,21609.wav,0.610843,Erckel's Francolin,11991.wav,0.614526,Yellow-browed Warbler
8,21473.wav,0.0,White-necked Puffbird,321.wav,0.398312,Rufous-vented Paradise Flycatcher,319.wav,0.452216,Rufous-vented Paradise Flycatcher,785.wav,0.479017,Chestnut-breasted Nigrita,320.wav,0.491552,Rufous-vented Paradise Flycatcher,206.wav,0.497904,Western Nicator
9,64522.wav,0.0,Long-tailed Tit,64986.wav,0.0,Long-tailed Tit,64895.wav,0.0,Long-tailed Tit,43526.wav,0.499681,Great Tit,15.wav,0.50048,Long-tailed Tit,32951.wav,0.523373,Long-tailed Tit


In [62]:
# Show the ten rows whose first neighbour is closest (smallest distance first).
df_neighbors.sort_values(by='1st_Neighbor_Distance', ascending=True).head(10)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
1522,22340.wav,0.0,Western Nicator,22335.wav,0.0,Western Nicator,22321.wav,0.105565,Western Nicator,22319.wav,0.149552,Western Nicator,22338.wav,0.189703,Grey-backed Camaroptera,22339.wav,0.261559,Grey-backed Camaroptera
389,43700.wav,0.0,Great Tit,43378.wav,0.0,Great Tit,53764.wav,0.487504,Buff-bellied Pipit,54439.wav,0.50817,Lesser Goldfinch,54056.wav,0.520842,Lesser Goldfinch,864.wav,0.561814,Black-winged Oriole
914,11835.wav,0.0,Palm Tanager,685.wav,0.0,Black-fronted Tyrannulet,11626.wav,0.0,Palm Tanager,696.wav,0.545444,White-chested Emerald,1305.wav,0.665196,Manu Antbird,22107.wav,0.715074,Mountain Leaf Warbler
387,22248.wav,0.0,Eurasian Bullfinch,10829.wav,0.0,Eurasian Bullfinch,43056.wav,0.483096,Redwing,64655.wav,0.48664,Great Spotted Woodpecker,10835.wav,0.48664,Great Spotted Woodpecker,64829.wav,0.541591,Eurasian Bullfinch
1650,54194.wav,0.0,Eurasian Blackcap,33068.wav,0.0,Eurasian Blackcap,33065.wav,0.212476,Eurasian Blackcap,33067.wav,0.222613,Eurasian Blackcap,33066.wav,0.27585,Eurasian Blackcap,29.wav,0.416855,Common Nightingale
918,21575.wav,0.0,Costa's Hummingbird,53806.wav,0.0,Carolina Wren,33011.wav,0.0,Costa's Hummingbird,54073.wav,0.0,Costa's Hummingbird,75746.wav,0.543787,Carolina Wren,53779.wav,0.546697,Horned Lark
922,65059.wav,0.0,Little Spiderhunter,22492.wav,0.0,Hazel Grouse,65058.wav,0.0,Little Spiderhunter,43448.wav,0.0,Dusky-throated Antshrike,43450.wav,0.256313,Dusky-throated Antshrike,650.wav,0.350494,Grey-backed Camaroptera
383,33493.wav,0.0,Pacific Hornero,11713.wav,0.0,Many-colored Rush Tyrant,33491.wav,0.269623,Pacific Hornero,33492.wav,0.290077,Pacific Hornero,33366.wav,0.373469,D'Orbigny's Chat-Tyrant,76009.wav,0.422689,Pacific Hornero
923,33521.wav,0.0,Streak-backed Canastero,33522.wav,0.0,Streak-backed Canastero,33519.wav,0.443539,Streak-backed Canastero,33410.wav,0.453039,Rufous-webbed Bush Tyrant,32917.wav,0.53245,Bright-rumped Yellow Finch,33649.wav,0.53245,Streaked Saltator
1649,64930.wav,0.0,Soundscape,64434.wav,0.0,Soundscape,65078.wav,0.259089,Soundscape,64439.wav,0.259089,Soundscape,64437.wav,0.259089,Soundscape,64438.wav,0.280885,Soundscape


In [63]:
# Top-1 accuracy: fraction of rows whose first neighbour carries the same
# common name as the row's original recording.
orig_and_first = df_neighbors[
    ['Original_Common_Name', '1st_Neighbor_Common_Name']
].to_numpy()

query_names = orig_and_first[:, 0]
first_names = orig_and_first[:, 1]

# tuple of index arrays for the rows where both names agree
num_matches = np.nonzero(query_names == first_names)
top_1_accuracy = len(num_matches[0]) / len(orig_and_first)
print('Top 1 Accuracy:', top_1_accuracy)

Top 1 Accuracy: 0.4837328767123288


In [64]:
# Top-5 accuracy: fraction of rows where at least one of the five neighbour
# columns carries the same common name as the row's original recording.
name_columns = ['Original_Common_Name'] + [
    rank + '_Neighbor_Common_Name'
    for rank in ('1st', '2nd', '3rd', '4th', '5th')
]
orig_and_first = df_neighbors[name_columns].to_numpy()

# Compare column 0 against each of the neighbour columns in one broadcasted
# operation, then mark the rows with any agreement.
hit_mask = (orig_and_first[:, 1:] == orig_and_first[:, :1]).any(axis=1)

# tuple of index arrays for the rows with at least one matching neighbour
num_matches = np.nonzero(hit_mask)
top_5_accuracy = len(num_matches[0]) / len(orig_and_first)
print('Top 5 Accuracy:', top_5_accuracy)

Top 5 Accuracy: 0.6592465753424658


In [65]:
# Hold out 20% of the recordings for evaluating the classifiers below.
# A fixed random_state makes the split - and therefore every accuracy printed
# afterwards - identical on each run.
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = train_test_split(
    avg_embs, labels, test_size=0.2, random_state=SPLIT_SEED
)

## Nearest Centroid Classification (nearest species)

In [66]:
# Nearest-centroid baseline: fit on the training split and report the score
# on the held-out split.
nc = NC().fit(x_train, y_train)
nc_accuracy = nc.score(x_test, y_test)
print('NC Accuracy:', nc_accuracy)


NC Accuracy: 0.38247863247863245


## K Neighbors Classifier

In [67]:
# 1-nearest-neighbour classifier: fit on the training split and report the
# score on the held-out split.
knc = KNC(n_neighbors=1, n_jobs=-1).fit(x_train, y_train)
knc_accuracy = knc.score(x_test, y_test)
print('KNC Accuracy:', knc_accuracy)

KNC Accuracy: 0.4444444444444444


## XGB Classifier

In [68]:
# Gradient-boosted trees on the embeddings.
#
# Current XGBoost releases require class labels to be the consecutive
# integers 0..n_classes-1, while y_train holds raw species ids. Encode the
# training labels as positions in the sorted array of species seen in
# training, and map the predictions back to species ids before scoring, so
# y_test can be compared unchanged (species absent from training simply
# count as misses).
#
# NOTE: this is a multi-class fit over every species present in y_train, so
# it can take a long time; n_jobs=-1 uses all cores, like the cells above.
classes, y_train_encoded = np.unique(y_train, return_inverse=True)

xgbc = XGBC(n_estimators=100, n_jobs=-1)
xgbc.fit(x_train, y_train_encoded)

# predicted class positions -> original species ids
predicts = classes[np.asarray(xgbc.predict(x_test)).astype(int)]
print('XGBC Accuracy:', accuracy_score(y_test, predicts))

KeyboardInterrupt: 