In [127]:
from sklearn.neighbors import NearestNeighbors as NN
from sklearn.neighbors import NearestCentroid as NC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from xgboost import XGBClassifier as XGBC
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import pandas as pd
import glob, os, random

In [128]:
df_song_species = pd.read_csv('../preprocess_data/species_and_record_url.csv')
species = pd.read_csv('../preprocess_data/species_keys.csv')

In [129]:
emb_ids = []
avg_embs = []
labels = []
bird_embs = glob.glob('../avg_bird_embeddings/*')
random.shuffle(bird_embs)
for avg_emb_path in bird_embs:
    emb_id = os.path.basename(avg_emb_path).replace('.npy','')
    emb_id = int(emb_id)

    # load d-vector
    avg_emb = np.load(avg_emb_path)
    
    # get species id for embedding
    label = df_song_species.loc[df_song_species['recording_id'] == emb_id, 'species_id'].values[0]

    emb_ids.append(emb_id)
    avg_embs.append(avg_emb)
    labels.append(label)
    
#avg_embs = np.array(avg_embs)
#labels = np.array(labels)
label_counts = dict(Counter(labels))
print('Number of Unique Species:', len(set(labels)))

Number of Unique Species: 2310


In [130]:
# only fit nearest neighbors with more than one neighbor
new_avg_embs = []
new_emb_ids = []
new_labels = []
for idx in range(len(labels)):
    label = labels[idx]
    if label_counts[label] > 1:
        # remove emb_id, avg_emb, and label
        new_emb_ids.append(emb_ids[idx])
        new_avg_embs.append(avg_embs[idx])
        new_labels.append(labels[idx])
        
avg_embs = new_avg_embs
emb_ids = new_emb_ids
labels = new_labels

avg_embs = np.array(avg_embs)
labels = np.array(labels)
label_counts = dict(Counter(labels))
print('Number of Unique Species w/ 2+ samples:', len(set(labels)))

# fit nearest_neighbors
nn = NN(n_neighbors=6, metric='euclidean', n_jobs=-1)
nn.fit(avg_embs)
dists, inds = nn.kneighbors(avg_embs)


#nn = NN(n_neighbors=6, metric='euclidean', n_jobs=-1)


Number of Unique Species w/ 2+ samples: 1302


In [131]:
data_neighbors = []
count = 0
for neighbor_matches, dist_matches in zip(inds.tolist(), dists.tolist()):
    neighbor_row = []
    for n, dist in zip(range(0, len(neighbor_matches)), dist_matches):
        emb_id = emb_ids[neighbor_matches[n]]

        # get species of embedding
        species_id = df_song_species.loc[df_song_species['recording_id'] == emb_id, 'species_id'].values[0]
        species_name = species.loc[species['species_id'] == species_id, 'common_name'].values[0]
        neighbor_row.append(str(emb_id) + '.wav')
        neighbor_row.append(dist)
        neighbor_row.append(species_name)

    # append neighbors row to main data list
    data_neighbors.append(neighbor_row)

cols = [
    'Orignal_Recording', 'Original_Distance', 'Original_Common_Name',
    '1st_Neighbor_Recording', '1st_Neighbor_Distance', '1st_Neighbor_Common_Name',
    '2nd_Neighbor_Recording', '2nd_Neighbor_Distance', '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Recording', '3rd_Neighbor_Distance', '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Recording', '4th_Neighbor_Distance', '4th_Neighbor_Common_Name',
    '5th_Neighbor_Recording', '5th_Neighbor_Distance', '5th_Neighbor_Common_Name'
]

df_neighbors = pd.DataFrame(data_neighbors, columns=cols)
df_neighbors.head(10)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
0,64427.wav,0.0,Brown-crested Flycatcher,64415.wav,0.191742,Brown-crested Flycatcher,64422.wav,0.224408,Brown-crested Flycatcher,64421.wav,0.230808,Brown-crested Flycatcher,64425.wav,0.23804,Brown-crested Flycatcher,64412.wav,0.239576,Brown-crested Flycatcher
1,53787.wav,0.0,Caspian Tern,53820.wav,0.316916,White-eyed Vireo,22232.wav,0.412227,Arctic Warbler,53786.wav,0.434734,Caspian Tern,53633.wav,0.462346,Caspian Tern,53834.wav,0.466025,Common Yellowthroat
2,947.wav,0.0,Banded Prinia,948.wav,0.357594,Banded Prinia,331.wav,0.38845,Blue-billed Malimbe,690.wav,0.4034,Green Longtail,831.wav,0.419271,Brown-throated Wattle-eye,537.wav,0.430915,Mountain Sooty Boubou
3,1365.wav,0.0,Eurasian Three-toed Woodpecker,1367.wav,0.105924,Eurasian Three-toed Woodpecker,1366.wav,0.109703,Eurasian Three-toed Woodpecker,1364.wav,0.17026,Eurasian Three-toed Woodpecker,43050.wav,0.351381,Eurasian Three-toed Woodpecker,22365.wav,0.3702,Eurasian Three-toed Woodpecker
4,202.wav,0.0,Black-winged Oriole,862.wav,0.210151,Black-winged Oriole,1019.wav,0.23799,African Hill Babbler,842.wav,0.243252,Brown-crowned Tchagra,182.wav,0.275837,White-headed Wood Hoopoe,838.wav,0.28055,Brown-throated Wattle-eye
5,11265.wav,0.0,Sharpbill,11264.wav,0.501693,Sharpbill,11263.wav,0.635595,Sharpbill,11260.wav,0.636026,Chestnut-crowned Becard,32206.wav,0.636665,Chestnut-capped Blackbird,65171.wav,0.667607,House Sparrow
6,33528.wav,0.0,Sharp-billed Canastero,32305.wav,0.0,Sharp-billed Canastero,65078.wav,0.458395,Soundscape,64437.wav,0.458395,Soundscape,64439.wav,0.458395,Soundscape,64434.wav,0.519355,Soundscape
7,75802.wav,0.0,Brown-flanked Bush Warbler,75642.wav,0.569593,Brown-flanked Bush Warbler,22019.wav,0.609358,Yellow-breasted Chat,21997.wav,0.709382,Streak-breasted Treehunter,22001.wav,0.713102,Large-footed Finch,21550.wav,0.730697,Tawny-faced Gnatwren
8,75405.wav,0.0,Common Redshank,33681.wav,0.0,Common Redshank,43400.wav,0.0,Common Redshank,75404.wav,0.184107,Common Redshank,75403.wav,0.232951,Common Redshank,75771.wav,0.250691,Common Redshank
9,65177.wav,0.0,Yellow-bellied Elaenia,12169.wav,0.376486,Yellow-bellied Elaenia,64898.wav,0.3957,Yellow-chinned Spinetail,11882.wav,0.454865,Tawny-crowned Pygmy Tyrant,32396.wav,0.491967,Scimitar-billed Woodcreeper,32807.wav,0.50342,Aplomado Falcon


In [132]:
df_neighbors.sort_values(['1st_Neighbor_Distance'], ascending=[True]).head(10)

Unnamed: 0,Orignal_Recording,Original_Distance,Original_Common_Name,1st_Neighbor_Recording,1st_Neighbor_Distance,1st_Neighbor_Common_Name,2nd_Neighbor_Recording,2nd_Neighbor_Distance,2nd_Neighbor_Common_Name,3rd_Neighbor_Recording,3rd_Neighbor_Distance,3rd_Neighbor_Common_Name,4th_Neighbor_Recording,4th_Neighbor_Distance,4th_Neighbor_Common_Name,5th_Neighbor_Recording,5th_Neighbor_Distance,5th_Neighbor_Common_Name
4695,21675.wav,0.0,Indigo Flowerpiercer,10998.wav,0.0,Indigo Flowerpiercer,10999.wav,0.0,Indigo Flowerpiercer,21677.wav,0.123713,Indigo Flowerpiercer,21580.wav,0.736576,Northern Waterthrush,21777.wav,0.736576,Northern Waterthrush
5263,22464.wav,0.0,Red Crossbill,21731.wav,0.0,Ferruginous Pygmy Owl,32543.wav,0.0,Ferruginous Pygmy Owl,32651.wav,0.386311,Song Thrush,32591.wav,0.391226,Eurasian Wren,32663.wav,0.410917,Eurasian Wren
2870,53707.wav,0.0,Eurasian Blue Tit,43710.wav,0.0,Eurasian Blue Tit,53705.wav,0.0,Eurasian Blue Tit,53722.wav,0.0,Eurasian Blue Tit,53723.wav,0.322314,Eurasian Blue Tit,53715.wav,0.37991,Eurasian Blue Tit
1156,32467.wav,0.0,Green Kingfisher,53887.wav,0.0,Green Kingfisher,54063.wav,0.431066,Anna's Hummingbird,54452.wav,0.442747,Great Blue Heron,53814.wav,0.450253,Golden-fronted Woodpecker,303.wav,0.455134,Singing Cisticola
2880,43111.wav,0.0,Black-billed Thrush,43070.wav,0.0,Black-billed Thrush,33708.wav,0.469926,Black-billed Thrush,12102.wav,0.470438,Brown Creeper,11346.wav,0.523729,Lemon-chested Greenlet,75170.wav,0.529082,Orange Ground Thrush
2883,75487.wav,0.0,Northern Pintail,54053.wav,0.0,Northern Pintail,53816.wav,0.361532,Long-billed Thrasher,54063.wav,0.375148,Anna's Hummingbird,76003.wav,0.418508,Black Phoebe,54064.wav,0.425455,Anna's Hummingbird
2885,65226.wav,0.0,Sinaloa Crow,65223.wav,0.0,Sinaloa Crow,65012.wav,0.241048,Inca Dove,64775.wav,0.259269,White-winged Dove,64409.wav,0.265394,Brown-crested Flycatcher,64776.wav,0.270405,White-winged Dove
2889,33525.wav,0.0,Line-fronted Canastero,33526.wav,0.0,Line-fronted Canastero,33582.wav,0.275029,Glossy-black Thrush,33524.wav,0.4191,Junin Canastero,33542.wav,0.464979,Thorn-tailed Rayadito,33421.wav,0.50142,Ash-colored Tapaculo
2895,33215.wav,0.0,Iberian Chiffchaff,43587.wav,0.0,Iberian Chiffchaff,42949.wav,0.0,Iberian Chiffchaff,33216.wav,0.0,Iberian Chiffchaff,32929.wav,0.543766,Iberian Chiffchaff,33217.wav,0.554361,Iberian Chiffchaff
5788,76038.wav,0.0,Red-winged Blackbird,10925.wav,0.0,Red-winged Blackbird,75091.wav,0.568322,Black-winged Stilt,43401.wav,0.568322,Black-winged Stilt,32963.wav,0.59197,Verdin,75089.wav,0.606371,Fulvous Whistling Duck


In [133]:
orig_and_first = df_neighbors[[
    'Original_Common_Name', 
    '1st_Neighbor_Common_Name'
]].values

num_matches = np.where(
    (orig_and_first[:, 0] == orig_and_first[:, 1]) 
)
top_1_accuracy = num_matches[0].shape[0] / orig_and_first.shape[0]
print('Top 1 Accuracy:', top_1_accuracy)

Top 1 Accuracy: 0.5108125819134993


In [134]:
orig_and_first = df_neighbors[[
    'Original_Common_Name', 
    '1st_Neighbor_Common_Name', 
    '2nd_Neighbor_Common_Name',
    '3rd_Neighbor_Common_Name',
    '4th_Neighbor_Common_Name',
    '5th_Neighbor_Common_Name'
]].values

num_matches = np.where(
    (orig_and_first[:, 0] == orig_and_first[:, 1]) 
    | (orig_and_first[:, 0] == orig_and_first[:, 2])
    | (orig_and_first[:, 0] == orig_and_first[:, 3])
    | (orig_and_first[:, 0] == orig_and_first[:, 4])
    | (orig_and_first[:, 0] == orig_and_first[:, 5])
)
top_5_accuracy = num_matches[0].shape[0] / orig_and_first.shape[0]
print('Top 5 Accuracy:', top_5_accuracy)

Top 5 Accuracy: 0.6762778505897772


In [135]:
#tsne_embs = TSNE(n_components=2, perplexity=50).fit_transform(avg_embs)
x_train, x_test, y_train, y_test = train_test_split(avg_embs, labels, test_size=0.2)

## Nearest Centroid Classification (nearest species)

In [136]:
#tsne_embs = TSNE(n_components=2).fit_transform(avg_embs)
nc = NC()
nc.fit(x_train, y_train)
print('NC Accuracy:', nc.score(x_test, y_test))


NC Accuracy: 0.4561834561834562


## K Neighbors Classifier

In [137]:
knc = KNC(n_neighbors = 1, n_jobs=-1)
knc.fit(x_train, y_train)
print('KNC Accuracy:', knc.score(x_test, y_test))

KNC Accuracy: 0.4774774774774775


## XGB Classifier

In [138]:
#xgbc = XGBC(n_estimators=10)
#xgbc.fit(x_train, y_train)
#predicts = xgbc.predict(x_test)
#print('XGBC Accuracy:', accuracy_score(y_test, predicts))