In [64]:
from sklearn.neighbors import NearestNeighbors as NN
from sklearn.neighbors import NearestCentroid as NC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from xgboost import XGBClassifier as XGBC
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import pandas as pd
import glob, os, random

In [55]:
df_song_species = pd.read_csv('../preprocess_data/species_and_record_url.csv')
species = pd.read_csv('../preprocess_data/species_keys.csv')

In [56]:
emb_ids = []
avg_embs = []
labels = []
common_names = []
#bird_embs = glob.glob('../avg_bird_embeddings/*')
bird_embs = glob.glob('../avg_test_bird_embs/*')
random.shuffle(bird_embs)
for avg_emb_path in bird_embs:
    emb_id = os.path.basename(avg_emb_path).replace('.npy','')
    emb_id = int(emb_id)

    # load d-vector
    avg_emb = np.load(avg_emb_path)
    
    # get species id for embedding
    label = df_song_species.loc[df_song_species['recording_id'] == emb_id, 'species_id'].values[0]
    common_name = species.loc[species['species_id'] == label, 'common_name'].values[0]
    
    emb_ids.append(emb_id)
    avg_embs.append(avg_emb)
    labels.append(label)
    common_names.append(common_name)
    
#avg_embs = np.array(avg_embs)
#labels = np.array(labels)
label_counts = dict(Counter(labels))
print('Number of Unique Species:', len(set(labels)))

Number of Unique Species: 1322


In [57]:
# only fit nearest neighbors with more than one neighbor
new_avg_embs = []
new_emb_ids = []
new_labels = []
new_common_names = []
for idx in range(len(labels)):
    label = labels[idx]
    if label_counts[label] > 1:
        # remove emb_id, avg_emb, and label
        new_emb_ids.append(emb_ids[idx])
        new_avg_embs.append(avg_embs[idx])
        new_labels.append(labels[idx])
        new_common_names.append(common_names[idx])
        
avg_embs = new_avg_embs
emb_ids = new_emb_ids
labels = new_labels
common_names = new_common_names

avg_embs = np.array(avg_embs)
labels = np.array(labels)
label_counts = dict(Counter(labels))
print('Number of Unique Species w/ 2+ samples:', len(set(labels)))

# fit nearest_neighbors
nn = NN(n_neighbors=6, metric='euclidean', n_jobs=-1)
nn.fit(avg_embs)
dists, inds = nn.kneighbors(avg_embs)


#nn = NN(n_neighbors=6, metric='euclidean', n_jobs=-1)


Number of Unique Species w/ 2+ samples: 934


In [58]:
data_neighbors = []
count = 0
for neighbor_matches, dist_matches in zip(inds.tolist(), dists.tolist()):
    neighbor_row = []
    for n, dist in zip(range(0, len(neighbor_matches)), dist_matches):
        emb_id = emb_ids[neighbor_matches[n]]

        # get species of embedding
        species_id = df_song_species.loc[df_song_species['recording_id'] == emb_id, 'species_id'].values[0]
        species_name = species.loc[species['species_id'] == species_id, 'common_name'].values[0]
        neighbor_row.append(str(emb_id) + '.wav')
        neighbor_row.append(dist)
        neighbor_row.append(species_name)

    # append neighbors row to main data list
    data_neighbors.append(neighbor_row)

cols = [
    'Orignal_Recording', 'Original_Distance', 'Original_Common_Name',
    '1st_Neighbor_Recording', '1st_Neighbor_Distance', '1st_Neighbor_Common_Name',
    '2nd_Neighbor_Recording', '2nd_Neighbor_Distance', '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Recording', '3rd_Neighbor_Distance', '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Recording', '4th_Neighbor_Distance', '4th_Neighbor_Common_Name',
    '5th_Neighbor_Recording', '5th_Neighbor_Distance', '5th_Neighbor_Common_Name'
]

df_neighbors = pd.DataFrame(data_neighbors, columns=cols)
df_neighbors.head(10)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
0,56856.wav,0.0,Bronze-winged Jacana,35325.wav,0.446957,Bronze-winged Jacana,23660.wav,0.446957,Baird's Trogon,2118.wav,0.490118,Bronze-winged Jacana,2875.wav,0.566925,Red-billed Firefinch,46010.wav,0.598444,Great Tit
1,36014.wav,0.0,Great Crested Grebe,36017.wav,0.217353,Great Crested Grebe,36013.wav,0.255859,Great Crested Grebe,36016.wav,0.283269,Great Crested Grebe,36009.wav,0.315527,Great Crested Grebe,36012.wav,0.34938,Great Crested Grebe
2,78845.wav,0.0,Little Grebe,78842.wav,0.0,Little Grebe,36383.wav,0.0,Little Grebe,14016.wav,0.527031,Yellow-rumped Cacique,14674.wav,0.527031,Yellow-rumped Cacique,36387.wav,0.563986,Little Grebe
3,25022.wav,0.0,Carolina Wren,76693.wav,0.0,Crested Finchbill,76753.wav,0.227812,Striated Bulbul,76765.wav,0.256554,Blyth's Shrike-babbler,76690.wav,0.277393,Crested Finchbill,76998.wav,0.277761,Black-naped Monarch
4,78047.wav,0.0,White-mantled Barbet,26889.wav,0.343451,White-mantled Barbet,55947.wav,0.359782,Eurasian Woodcock,14326.wav,0.417052,Reed Parrotbill,76224.wav,0.449918,Black-striped Sparrow,14541.wav,0.451164,Eurasian Woodcock
5,3920.wav,0.0,Common Emerald Dove,4223.wav,0.315307,Common Emerald Dove,2906.wav,0.492322,Common Emerald Dove,2905.wav,0.508529,Common Emerald Dove,34578.wav,0.5233,Band-rumped Storm Petrel,2907.wav,0.544214,Common Emerald Dove
6,16172.wav,0.0,Common Myna,16113.wav,0.314733,Purple Sunbird,16218.wav,0.336092,Indian Grey Hornbill,22934.wav,0.344937,Carolina Wren,44858.wav,0.344937,Carolina Wren,15921.wav,0.367072,Streaked Laughingthrush
7,67505.wav,0.0,Identity unknown,34007.wav,0.468908,Wild Turkey,45103.wav,0.517739,European Greenfinch,34008.wav,0.534261,Wild Turkey,33943.wav,0.539551,Sardinian Warbler,2624.wav,0.590985,Identity unknown
8,23671.wav,0.0,White-eyed Tody-Tyrant,23670.wav,0.27571,White-eyed Tody-Tyrant,23744.wav,0.451426,Dusky-throated Antshrike,23677.wav,0.488299,Cobalt-winged Parakeet,23132.wav,0.492844,Spillmann's Tapaculo,25351.wav,0.515678,Cobalt-winged Parakeet
9,46110.wav,0.0,Identity unknown,46124.wav,0.401735,Identity unknown,56320.wav,0.417063,Great Tit,55956.wav,0.417063,Great Tit,34470.wav,0.422703,Desert Lark,4346.wav,0.441803,Great Tit


In [59]:
df_neighbors.sort_values(['1st_Neighbor_Distance'], ascending=[True]).head(10)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
2803,65773.wav,0.0,Identity unknown,45494.wav,0.0,Identity unknown,45496.wav,0.411122,Identity unknown,36209.wav,0.444135,Western Osprey,54915.wav,0.476907,Western Osprey,45877.wav,0.52984,Eurasian Bullfinch
1447,35344.wav,0.0,Himalayan Monal,35346.wav,0.0,Himalayan Monal,35347.wav,0.364964,Himalayan Monal,35342.wav,0.441411,Himalayan Monal,35343.wav,0.457267,Himalayan Monal,35345.wav,0.473542,Himalayan Monal
6024,4129.wav,0.0,Orange-bellied Leafbird,77170.wav,0.0,Orange-bellied Leafbird,77106.wav,0.407004,Ashy Drongo,76908.wav,0.418339,Small Niltava,76911.wav,0.418339,Small Niltava,35051.wav,0.432771,White-tailed Nuthatch
3508,2916.wav,0.0,Long-billed Spiderhunter,2917.wav,0.0,Long-billed Spiderhunter,2918.wav,0.0,Long-billed Spiderhunter,2589.wav,0.316686,Blue Nuthatch,2590.wav,0.410059,Blue Nuthatch,12612.wav,0.489137,Brown-capped Laughingthrush
1441,4517.wav,0.0,White-bellied Hummingbird,24509.wav,0.0,White-bellied Hummingbird,4301.wav,0.485694,Lesser Masked Weaver,34397.wav,0.537876,Black-headed Oriole,3738.wav,0.602269,Sooty Antbird,1745.wav,0.622592,Black-billed Amazon
1440,57045.wav,0.0,Common Yellowthroat,12563.wav,0.0,Common Yellowthroat,46087.wav,0.243713,Green-tailed Towhee,55426.wav,0.258096,Pygmy Nuthatch,55607.wav,0.290162,Townsend's Solitaire,57072.wav,0.359795,Lesser Goldfinch
1439,66144.wav,0.0,Scale-throated Earthcreeper,56385.wav,0.0,Scale-throated Earthcreeper,56386.wav,0.37909,Scale-throated Earthcreeper,56834.wav,0.387643,Great Shrike-Tyrant,56384.wav,0.39313,Scale-throated Earthcreeper,33796.wav,0.409027,Eurasian Blackcap
1437,24264.wav,0.0,Streak-capped Treehunter,27023.wav,0.0,Streak-capped Treehunter,13790.wav,0.0,Song Wren,66687.wav,0.501372,Yungas Manakin,67328.wav,0.527308,Ochre-bellied Flycatcher,35734.wav,0.527308,Ochre-bellied Flycatcher
1433,2967.wav,0.0,Green-headed Oriole,12604.wav,0.0,Ashy-throated Warbler,2966.wav,0.317002,Green-headed Oriole,33871.wav,0.567647,Broad-billed Motmot,67027.wav,0.578671,Dusky-headed Parakeet,15784.wav,0.578671,White-winged Diuca Finch
1432,3817.wav,0.0,Rufous-winged Sunbird,3868.wav,0.0,Rufous-winged Sunbird,3829.wav,0.0,Rufous-winged Sunbird,3820.wav,0.0,Rufous-winged Sunbird,78631.wav,0.275498,Abyssinian Wheatear,24423.wav,0.295907,Carolina Wren


In [60]:
orig_and_first = df_neighbors[[
    'Original_Common_Name', 
    '1st_Neighbor_Common_Name'
]].values

num_matches = np.where(
    (orig_and_first[:, 0] == orig_and_first[:, 1]) 
)
top_1_accuracy = num_matches[0].shape[0] / orig_and_first.shape[0]
print('Top 1 Accuracy:', top_1_accuracy)

Top 1 Accuracy: 0.5343465951423036


In [61]:
orig_and_first = df_neighbors[[
    'Original_Common_Name', 
    '1st_Neighbor_Common_Name', 
    '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Common_Name',
    '5th_Neighbor_Common_Name'
]].values

num_matches = np.where(
    (orig_and_first[:, 0] == orig_and_first[:, 1]) 
    | (orig_and_first[:, 0] == orig_and_first[:, 2])
    | (orig_and_first[:, 0] == orig_and_first[:, 3])
    | (orig_and_first[:, 0] == orig_and_first[:, 4])
    | (orig_and_first[:, 0] == orig_and_first[:, 5])
)
top_5_accuracy = num_matches[0].shape[0] / orig_and_first.shape[0]
print('Top 5 Accuracy:', top_5_accuracy)

Top 5 Accuracy: 0.689316048278945


In [62]:
#tsne_embs = TSNE(n_components=2, perplexity=50).fit_transform(avg_embs)
x_train, x_test, y_train, y_test = train_test_split(avg_embs, labels, test_size=0.2)

## Nearest Centroid Classification (nearest species)

In [63]:
#tsne_embs = TSNE(n_components=2).fit_transform(avg_embs)
nc = NC()
nc.fit(x_train, y_train)
y_pred = nc.predict(x_test)
y_pred_proba = nc.predict_proba(x_test)
print('NC Accuracy:', nc.score(x_test, y_test))
print(classification_report(y_test, y_pred, labels=labels, target_names=common_names))

NC Accuracy: 0.40208488458674607
                                   precision    recall  f1-score   support

             Bronze-winged Jacana       1.00      1.00      1.00         1
              Great Crested Grebe       0.75      0.60      0.67         5
                     Little Grebe       0.50      0.12      0.20         8
                Crested Finchbill       0.00      0.00      0.00         1
             White-mantled Barbet       0.00      0.00      0.00         0
              Common Emerald Dove       1.00      1.00      1.00         1
                      Common Myna       0.67      0.57      0.62         7
                 Identity unknown       0.33      0.01      0.02        87
           White-eyed Tody-Tyrant       0.00      0.00      0.00         2
                 Identity unknown       0.33      0.01      0.02        87
         Ladder-backed Woodpecker       0.50      0.33      0.40         3
              Pileated Woodpecker       0.00      0.00      0.00  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## K Neighbors Classifier

In [66]:
knc = KNC(n_neighbors = 1, n_jobs=-1)
knc.fit(x_train, y_train)
y_pred = knc.predict(x_test)
print('KNC Accuracy:', knc.score(x_test, y_test))
print(classification_report(y_test, y_pred, labels=labels, target_names=common_names))

KNC Accuracy: 0.5137751303052867
                                   precision    recall  f1-score   support

             Bronze-winged Jacana       1.00      1.00      1.00         1
              Great Crested Grebe       0.67      0.40      0.50         5
                     Little Grebe       0.62      1.00      0.76         8
                Crested Finchbill       0.00      0.00      0.00         1
             White-mantled Barbet       0.00      0.00      0.00         0
              Common Emerald Dove       1.00      1.00      1.00         1
                      Common Myna       0.00      0.00      0.00         7
                 Identity unknown       0.68      0.66      0.67        87
           White-eyed Tody-Tyrant       0.00      0.00      0.00         2
                 Identity unknown       0.68      0.66      0.67        87
         Ladder-backed Woodpecker       0.40      0.67      0.50         3
              Pileated Woodpecker       0.00      0.00      0.00  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## XGB Classifier

In [49]:
#xgbc = XGBC(n_estimators=10)
#xgbc.fit(x_train, y_train)
#predicts = xgbc.predict(x_test)
#print('XGBC Accuracy:', accuracy_score(y_test, predicts))