In [1]:
import numpy as np
import os
import glob
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots


## 2 Task Outline
Instead of blindly throwing that data at machine learning algorithms, a good data scientist will first do some exploratory data analysis. In your team, we would like you to analyze the data with respect to the following aspects:

### 1. Annotator agreement: 
### How consistent are the annotations? 
### Do different annotators agree in their labels for the same fragment?

In [2]:
# Load the Dataset

label_files = glob.glob("Dataset/**/**labels.npy", recursive=True)
label_files.sort()

"""
'Dataset/comcuc/133067.labels.npy',
'Dataset/comcuc/182336.labels.npy',
'Dataset/comcuc/189031.labels.npy',
'Dataset/comcuc/240515.labels.npy',
'Dataset/comcuc/25627.labels.npy',
"""

birds = ['comcuc', 'cowpig1', 'eucdov', 'eueowl1', 'grswoo', 'tawowl1']

In [7]:
# 1.
# Annotator agreement: 
# How consistent are the annotations? Do different annotators agree in their labels for the same fragment?

def agreementrate_by_section (section: np.array):
    # Get only the labels which are from annotators
    section = section[1:]

    # Compute the number of cases where all annotators agree
    num_agree = np.sum(section == section[0])

    # Compute the percentage agreement
    percent_agree = num_agree / len(section)

    return percent_agree

def agreementrate_by_file (file: str):

    # load the current file
    file = np.load(file)

    return list(map(agreementrate_by_section, file))

def agreementrate_by_bird (label_files: list, bird: str):

    # filter all label files paths with the mathing bird
    files = [file for file in label_files if bird in file]

    agreement_rates = np.asarray(list(map(agreementrate_by_file, files)))

    return agreement_rates





# What is the mean agreementrate ?

mean_agreement_rate = np.mean(np.asarray(list(map(agreementrate_by_file, label_files))))
print(f"The mean agreement rate is: {mean_agreement_rate}")

################################################

# What is the agreement rate for the different bids
figs = []

for bird in birds:
    agreement_rates = agreementrate_by_bird(label_files, bird)
    heatmap = go.Heatmap(z=agreement_rates, showscale=True, name= bird)
    title = f'Heatmap of Agreementrates for {bird}'
    layout = go.Layout(title=title, xaxis_title='Agreementrates', yaxis_title='Birds')
    fig = go.Figure(data=[heatmap], layout=layout)
    figs.append(fig)

fig = make_subplots(rows=2, cols=3, subplot_titles=birds)
for i, f in enumerate(figs):
    fig.add_trace(f.data[0], row=(i//3)+1, col=(i%3)+1)

fig.update_layout(height=1200, title_text='Agreementrates Heatmaps')
fig.show()

The mean agreement rate is: 0.8973675396825397
