In [1]:
from collections import Counter
import sys
sys.path.append('..')

from chorus.data import load_saved_xeno_canto_meta

In [2]:
# Load the saved Xeno-canto metadata table via the project helper.
# NOTE(review): the cells below use .shape/.columns/.head() on the result, so it is
# presumably a pandas DataFrame; where the helper reads it from is not visible here.
df = load_saved_xeno_canto_meta()

In [3]:
# (number of rows, number of columns) of the metadata table.
df.shape

(48676, 28)

In [4]:
# Column labels as a plain Python list, so they are all displayed without truncation.
list(df.columns)

['id',
 'gen',
 'sp',
 'ssp',
 'en',
 'rec',
 'cnt',
 'loc',
 'lat',
 'lng',
 'alt',
 'type',
 'url',
 'file',
 'file-name',
 'sono',
 'lic',
 'q',
 'length',
 'time',
 'date',
 'uploaded',
 'also',
 'rmk',
 'bird-seen',
 'playback-used',
 'length-seconds',
 'scientific-name']

In [5]:
# Preview the first rows (pandas' default of five) to see what each column holds.
df.head()

Unnamed: 0,id,gen,sp,ssp,en,rec,cnt,loc,lat,lng,...,length,time,date,uploaded,also,rmk,bird-seen,playback-used,length-seconds,scientific-name
0,31267,Melospiza,georgiana,,Swamp Sparrow,Allen T. Chartier,United States,"Michigan, Nyangquing Point Wildlife Area",43.7745,-83.9378,...,0:04,?,2008-06-27,2009-03-19,"[Agelaius phoeniceus, Melospiza melodia]",,unknown,unknown,4,Melospiza georgiana
1,215463,Gymnorhinus,cyanocephalus,,Pinyon Jay,Kristie Nelson,United States,"Mono Mills, Inyo National Forest, Mono County,...",37.8853,-118.9601,...,1:15,14:00,2015-01-29,2015-03-03,"[Sitta pygmaea, Nucifraga columbiana, Poecile ...",Flock of about 300 foraging in Jeffery Pine fo...,yes,no,75,Gymnorhinus cyanocephalus
2,196885,Polioptila,caerulea,,Blue-grey Gnatcatcher,Dan Lane,United States,"Saguaro National Park (near Tucson), Pima Cou...",32.209,-110.7103,...,0:54,06:30,2014-08-25,2014-09-28,"[Passerina caerulea, Toxostoma curvirostre, Ca...",Natural calls (or response to pishing?) from a...,no,no,54,Polioptila caerulea
3,187768,Sitta,canadensis,,Red-breasted Nuthatch,Paul Driver,United States,"Seawall, ME",44.228,-68.312,...,1:10,07:49,2014-07-11,2014-07-18,[],"possible juvenile, calls with higher pitched n...",yes,no,70,Sitta canadensis
4,457223,Corvus,ossifragus,,Fish Crow,Bruce Lagerquist,United States,"Joe Overstreet Rd, Lake Kissimmee, Osceola Cou...",27.9368,-81.2252,...,0:12,11:30,2017-03-06,2019-02-19,[],Bird sitting on light post in parking lot,yes,no,12,Corvus ossifragus


In [6]:
# Number of distinct values in the `en` (English name) column.
# `unique()` keeps a missing value as its own entry, so this is not the same as `nunique()`.
len(df['en'].unique())

803

In [7]:
# Ten most frequent English names over all rows; `value_counts()` sorts descending,
# so the first ten entries are the top ten.
en_counts = df['en'].value_counts()
en_counts.iloc[:10]

Identity unknown        847
Red Crossbill           556
Song Sparrow            552
Northern Cardinal       458
Carolina Wren           424
Spotted Towhee          400
American Robin          385
Bewick's Wren           378
House Wren              367
Red-winged Blackbird    357
Name: en, dtype: int64

In [8]:
# Tally every species mention: the primary species of each row first, followed by
# each non-empty name from that row's `also` list (empty strings are skipped).
primary_species = df['scientific-name'].tolist()
background_species = [
    name
    for other_species in df['also']
    for name in other_species
    if len(name)
]
true_bird_instances = primary_species + background_species
Counter(true_bird_instances).most_common(10)

[('Cardinalis cardinalis', 2518),
 ('Agelaius phoeniceus', 2202),
 ('Turdus migratorius', 1883),
 ('Melospiza melodia', 1750),
 ('Zenaida macroura', 1545),
 ('Haemorhous mexicanus', 1327),
 ('Geothlypis trichas', 1321),
 ('Thryothorus ludovicianus', 1300),
 ('Corvus brachyrhynchos', 1267),
 ('Cyanocitta cristata', 1146)]

In [9]:
# Lookup from scientific name to English name, taken from the first row seen
# for each scientific name.
first_row_per_species = df.drop_duplicates(subset=['scientific-name'])
scientific_to_en = dict(
    zip(first_row_per_species['scientific-name'], first_row_per_species['en'])
)

In [10]:
# Same tally as above but keyed by English name. Scientific names that never appear
# as a row's primary species have no translation and are pooled under 'incidental'.
english_instances = (
    scientific_to_en.get(scientific_name, 'incidental')
    for scientific_name in true_bird_instances
)
Counter(english_instances).most_common(20)

[('Northern Cardinal', 2518),
 ('Red-winged Blackbird', 2202),
 ('American Robin', 1883),
 ('Song Sparrow', 1750),
 ('Mourning Dove', 1545),
 ('House Finch', 1327),
 ('Common Yellowthroat', 1321),
 ('Carolina Wren', 1300),
 ('American Crow', 1267),
 ('Blue Jay', 1146),
 ('American Yellow Warbler', 1146),
 ('Spotted Towhee', 1114),
 ('House Wren', 1066),
 ('White-winged Dove', 1021),
 ("Bewick's Wren", 969),
 ('Red-eyed Vireo', 876),
 ('Northern Mockingbird', 876),
 ('Tufted Titmouse', 870),
 ('Identity unknown', 847),
 ('Western Meadowlark', 844)]

In [11]:
# Rows kept for the per-species counts below must satisfy all three conditions:
#   * `q` is 'A' or 'B' (presumably the two best quality ratings -- TODO confirm),
#   * `also` names no other species,
#   * the English name is not the 'Identity unknown' placeholder.
# An `also` entry counts as empty when it holds no non-empty string, so `[]`,
# `['']` and `['', '']` are all accepted. This matches the `filter(len, also)`
# convention used for the species tally earlier in this notebook; comparing
# against `['']` alone would silently drop rows whose list is empty in any
# other form.
has_no_other_species = (
    df['also'].apply(lambda other_species: not any(other_species)).astype(bool)
)
df_filtered = df.loc[
    df['q'].isin(['A', 'B'])
    & has_no_other_species
    & (df['en'] != 'Identity unknown')
]

In [12]:
# Ten most frequent English names among the filtered rows.
filtered_en_counts = df_filtered['en'].value_counts()
filtered_en_counts.iloc[:10]

Song Sparrow            212
Carolina Wren           185
Northern Cardinal       182
American Robin          175
Red Crossbill           162
Red-winged Blackbird    161
Bewick's Wren           153
House Wren              153
Dark-eyed Junco         153
Blue Jay                151
Name: en, dtype: int64

In [13]:
# Total number of filtered rows covered by the ten most frequent English names.
top10_counts = df_filtered['en'].value_counts().iloc[:10]
top10_counts.sum()

1687

In [14]:
# Twenty most frequent English names among the filtered rows.
filtered_en_counts = df_filtered['en'].value_counts()
filtered_en_counts.iloc[:20]

Song Sparrow             212
Carolina Wren            185
Northern Cardinal        182
American Robin           175
Red Crossbill            162
Red-winged Blackbird     161
Bewick's Wren            153
House Wren               153
Dark-eyed Junco          153
Blue Jay                 151
Spotted Towhee           140
Tufted Titmouse          130
Great Horned Owl         128
Northern Saw-whet Owl    118
Grey Catbird             116
Northern Mockingbird     108
Marsh Wren               108
American Crow            107
Common Yellowthroat      106
Northern Raven           104
Name: en, dtype: int64

In [15]:
# Total number of filtered rows covered by the twenty most frequent English names.
top20_counts = df_filtered['en'].value_counts().iloc[:20]
top20_counts.sum()

2852