In [139]:
from sklearn.neighbors import NearestNeighbors as NN
from sklearn.neighbors import NearestCentroid as NC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from xgboost import XGBClassifier as XGBC
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import pandas as pd
import glob, os, random

In [140]:
df_song_species = pd.read_csv('../preprocess_data/species_and_record_url.csv')
species = pd.read_csv('../preprocess_data/species_keys.csv')

In [141]:
emb_ids = []
avg_embs = []
labels = []
#bird_embs = glob.glob('../avg_bird_embeddings/*')
bird_embs = glob.glob('../avg_test_bird_embs/*')
random.shuffle(bird_embs)
for avg_emb_path in bird_embs:
    emb_id = os.path.basename(avg_emb_path).replace('.npy','')
    emb_id = int(emb_id)

    # load d-vector
    avg_emb = np.load(avg_emb_path)
    
    # get species id for embedding
    label = df_song_species.loc[df_song_species['recording_id'] == emb_id, 'species_id'].values[0]

    emb_ids.append(emb_id)
    avg_embs.append(avg_emb)
    labels.append(label)
    
#avg_embs = np.array(avg_embs)
#labels = np.array(labels)
label_counts = dict(Counter(labels))
print('Number of Unique Species:', len(set(labels)))

Number of Unique Species: 597


In [142]:
# only fit nearest neighbors with more than one neighbor
new_avg_embs = []
new_emb_ids = []
new_labels = []
for idx in range(len(labels)):
    label = labels[idx]
    if label_counts[label] > 1:
        # remove emb_id, avg_emb, and label
        new_emb_ids.append(emb_ids[idx])
        new_avg_embs.append(avg_embs[idx])
        new_labels.append(labels[idx])
        
avg_embs = new_avg_embs
emb_ids = new_emb_ids
labels = new_labels

avg_embs = np.array(avg_embs)
labels = np.array(labels)
label_counts = dict(Counter(labels))
print('Number of Unique Species w/ 2+ samples:', len(set(labels)))

# fit nearest_neighbors
nn = NN(n_neighbors=6, metric='euclidean', n_jobs=-1)
nn.fit(avg_embs)
dists, inds = nn.kneighbors(avg_embs)


#nn = NN(n_neighbors=6, metric='euclidean', n_jobs=-1)


Number of Unique Species w/ 2+ samples: 223


In [143]:
data_neighbors = []
count = 0
for neighbor_matches, dist_matches in zip(inds.tolist(), dists.tolist()):
    neighbor_row = []
    for n, dist in zip(range(0, len(neighbor_matches)), dist_matches):
        emb_id = emb_ids[neighbor_matches[n]]

        # get species of embedding
        species_id = df_song_species.loc[df_song_species['recording_id'] == emb_id, 'species_id'].values[0]
        species_name = species.loc[species['species_id'] == species_id, 'common_name'].values[0]
        neighbor_row.append(str(emb_id) + '.wav')
        neighbor_row.append(dist)
        neighbor_row.append(species_name)

    # append neighbors row to main data list
    data_neighbors.append(neighbor_row)

cols = [
    'Orignal_Recording', 'Original_Distance', 'Original_Common_Name',
    '1st_Neighbor_Recording', '1st_Neighbor_Distance', '1st_Neighbor_Common_Name',
    '2nd_Neighbor_Recording', '2nd_Neighbor_Distance', '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Recording', '3rd_Neighbor_Distance', '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Recording', '4th_Neighbor_Distance', '4th_Neighbor_Common_Name',
    '5th_Neighbor_Recording', '5th_Neighbor_Distance', '5th_Neighbor_Common_Name'
]

df_neighbors = pd.DataFrame(data_neighbors, columns=cols)
df_neighbors.head(10)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
0,54873.wav,0.0,House Wren,22940.wav,0.537388,Rufous-capped Brushfinch,33860.wav,0.566648,Western Yellow Wagtail,33859.wav,0.582542,Western Yellow Wagtail,33858.wav,0.624866,Western Yellow Wagtail,22933.wav,0.643122,Carolina Wren
1,33767.wav,0.0,House Sparrow,33772.wav,0.398894,Common Redshank,33769.wav,0.398894,Common Redshank,44113.wav,0.658956,Brown-crowned Tchagra,76119.wav,0.658956,Brown-crowned Tchagra,33770.wav,0.66411,Common Redshank
2,65262.wav,0.0,Brown Creeper,65274.wav,0.337954,Brown-headed Cowbird,65295.wav,0.446761,Yellow-breasted Chat,76231.wav,0.454823,Common Yellowthroat,65273.wav,0.456169,Brown-headed Cowbird,44158.wav,0.480242,Eurasian Blackcap
3,44049.wav,0.0,Grey Bush Chat,54829.wav,0.367517,Northern Masked Weaver,54828.wav,0.367517,Northern Masked Weaver,76117.wav,0.521951,Black Kite,12492.wav,0.546764,Lark-like Brushrunner,54827.wav,0.56667,Northern Masked Weaver
4,65261.wav,0.0,Brown Creeper,65315.wav,0.272575,Identity unknown,65383.wav,0.272575,Identity unknown,54886.wav,0.306055,Common Yellowthroat,54885.wav,0.312872,Common Yellowthroat,65401.wav,0.514853,Brambling
5,1472.wav,0.0,Black-billed Treehunter,1562.wav,0.454026,Summer Tanager,1448.wav,0.459597,Rufous-naped Greenlet,1441.wav,0.467221,Streaked Xenops,1471.wav,0.492997,Blue-fronted Lancebill,1392.wav,0.523355,Marble-faced Bristle Tyrant
6,12487.wav,0.0,Sooty Antbird,12600.wav,0.722142,Ecuadorian Thrush,1582.wav,0.723567,Ochre-breasted Antpitta,12510.wav,0.752595,White-eyed Foliage-gleaner,12562.wav,0.761531,Slate-colored Grosbeak,54832.wav,0.798817,Village Weaver
7,54898.wav,0.0,Spotted Wren,54892.wav,0.0,Spotted Wren,54890.wav,0.0,Spotted Wren,1526.wav,0.486215,Lesser Spotted Woodpecker,12410.wav,0.488172,Barred Forest Falcon,65285.wav,0.514222,Soundscape
8,23117.wav,0.0,Stout-billed Cinclodes,23119.wav,0.455343,Stout-billed Cinclodes,23084.wav,0.602766,Slaty Spinetail,23082.wav,0.602766,Slaty Spinetail,44124.wav,0.620651,Bronzed Drongo,76103.wav,0.684277,Laughing Falcon
9,33858.wav,0.0,Western Yellow Wagtail,22940.wav,0.22927,Rufous-capped Brushfinch,33859.wav,0.300676,Western Yellow Wagtail,33861.wav,0.310218,Western Yellow Wagtail,33860.wav,0.351429,Western Yellow Wagtail,44177.wav,0.493592,Song Sparrow


In [144]:
df_neighbors.sort_values(['1st_Neighbor_Distance'], ascending=[True]).head(10)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
97,44181.wav,0.0,Soundscape,44180.wav,0.0,Soundscape,65285.wav,0.0,Soundscape,65283.wav,0.16997,Soundscape,65264.wav,0.192689,Brown-headed Cowbird,65271.wav,0.192689,Brown-headed Cowbird
112,12519.wav,0.0,White-collared Foliage-gleaner,22889.wav,0.0,White-collared Foliage-gleaner,76277.wav,0.502856,Black Tern,1511.wav,0.521183,Common Blackbird,76276.wav,0.529262,Black Tern,12522.wav,0.543009,White-collared Foliage-gleaner
510,65315.wav,0.0,Identity unknown,65383.wav,0.0,Identity unknown,65261.wav,0.272575,Brown Creeper,54886.wav,0.387521,Common Yellowthroat,54885.wav,0.403036,Common Yellowthroat,65401.wav,0.564196,Brambling
505,65402.wav,0.0,Large Scimitar Babbler,65404.wav,0.0,Large Scimitar Babbler,76314.wav,0.0,White-tailed Eagle,44161.wav,0.520168,Common Pheasant,44152.wav,0.54649,Great Crested Grebe,44115.wav,0.649783,Yellow-legged Gull
502,12519.wav,0.0,White-collared Foliage-gleaner,22889.wav,0.0,White-collared Foliage-gleaner,76277.wav,0.502856,Black Tern,1511.wav,0.521183,Common Blackbird,76276.wav,0.529262,Black Tern,12522.wav,0.543009,White-collared Foliage-gleaner
496,54774.wav,0.0,Brambling,44031.wav,0.0,Buff-barred Warbler,44189.wav,0.442689,White-browed Shortwing,44196.wav,0.521934,White-browed Shortwing,44193.wav,0.593567,Ashy-throated Warbler,44022.wav,0.66726,Pygmy Cupwing
214,76308.wav,0.0,Common Yellowthroat,12540.wav,0.0,Common Yellowthroat,12492.wav,0.624015,Lark-like Brushrunner,44054.wav,0.666632,Japanese Tit,12377.wav,0.701203,Grey-breasted Wood Wren,44211.wav,0.72331,Redwing
124,22943.wav,0.0,Yellow-rumped Tinkerbird,22969.wav,0.0,Yellow-rumped Tinkerbird,54798.wav,0.696961,Yellow-rumped Tinkerbird,22908.wav,0.724209,White-browed Robin-Chat,22946.wav,0.735028,Black-and-white-casqued Hornbill,22909.wav,0.767136,White-browed Robin-Chat
513,44107.wav,0.0,Common Firecrest,44105.wav,0.0,Common Firecrest,1482.wav,0.396866,Turquoise Jay,12603.wav,0.453545,Araucaria Tit-Spinetail,65294.wav,0.501653,Yellow-breasted Chat,44157.wav,0.506297,Eurasian Blackcap
125,22943.wav,0.0,Yellow-rumped Tinkerbird,22969.wav,0.0,Yellow-rumped Tinkerbird,54798.wav,0.696961,Yellow-rumped Tinkerbird,22908.wav,0.724209,White-browed Robin-Chat,22946.wav,0.735028,Black-and-white-casqued Hornbill,22909.wav,0.767136,White-browed Robin-Chat


In [145]:
orig_and_first = df_neighbors[[
    'Original_Common_Name', 
    '1st_Neighbor_Common_Name'
]].values

num_matches = np.where(
    (orig_and_first[:, 0] == orig_and_first[:, 1]) 
)
top_1_accuracy = num_matches[0].shape[0] / orig_and_first.shape[0]
print('Top 1 Accuracy:', top_1_accuracy)

Top 1 Accuracy: 0.33152909336941816


In [146]:
orig_and_first = df_neighbors[[
    'Original_Common_Name', 
    '1st_Neighbor_Common_Name', 
    '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Common_Name',
    '5th_Neighbor_Common_Name'
]].values

num_matches = np.where(
    (orig_and_first[:, 0] == orig_and_first[:, 1]) 
    | (orig_and_first[:, 0] == orig_and_first[:, 2])
    | (orig_and_first[:, 0] == orig_and_first[:, 3])
    | (orig_and_first[:, 0] == orig_and_first[:, 4])
    | (orig_and_first[:, 0] == orig_and_first[:, 5])
)
top_5_accuracy = num_matches[0].shape[0] / orig_and_first.shape[0]
print('Top 5 Accuracy:', top_5_accuracy)

Top 5 Accuracy: 0.44248985115020295


In [147]:
#tsne_embs = TSNE(n_components=2, perplexity=50).fit_transform(avg_embs)
x_train, x_test, y_train, y_test = train_test_split(avg_embs, labels, test_size=0.2)

## Nearest Centroid Classification (nearest species)

In [148]:
#tsne_embs = TSNE(n_components=2).fit_transform(avg_embs)
nc = NC()
nc.fit(x_train, y_train)
print('NC Accuracy:', nc.score(x_test, y_test))


NC Accuracy: 0.25


## K Neighbors Classifier

In [149]:
knc = KNC(n_neighbors = 1, n_jobs=-1)
knc.fit(x_train, y_train)
print('KNC Accuracy:', knc.score(x_test, y_test))

KNC Accuracy: 0.36486486486486486


## XGB Classifier

In [150]:
#xgbc = XGBC(n_estimators=10)
#xgbc.fit(x_train, y_train)
#predicts = xgbc.predict(x_test)
#print('XGBC Accuracy:', accuracy_score(y_test, predicts))