In [1]:
import numpy as np
import os
import glob
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots


In [2]:
# Load the Dataset

label_files = glob.glob("Dataset/**/**labels.npy", recursive=True)
label_files.sort()
label_files

['Dataset/comcuc/133067.labels.npy',
 'Dataset/comcuc/182336.labels.npy',
 'Dataset/comcuc/189031.labels.npy',
 'Dataset/comcuc/240515.labels.npy',
 'Dataset/comcuc/25627.labels.npy',
 'Dataset/comcuc/25629.labels.npy',
 'Dataset/comcuc/263382.labels.npy',
 'Dataset/comcuc/26379.labels.npy',
 'Dataset/comcuc/277236.labels.npy',
 'Dataset/comcuc/310525.labels.npy',
 'Dataset/comcuc/318234.labels.npy',
 'Dataset/comcuc/318483.labels.npy',
 'Dataset/comcuc/319117.labels.npy',
 'Dataset/comcuc/369543.labels.npy',
 'Dataset/comcuc/369573.labels.npy',
 'Dataset/comcuc/369578.labels.npy',
 'Dataset/comcuc/371677.labels.npy',
 'Dataset/comcuc/371679.labels.npy',
 'Dataset/comcuc/374165.labels.npy',
 'Dataset/comcuc/375811.labels.npy',
 'Dataset/comcuc/412687.labels.npy',
 'Dataset/comcuc/412688.labels.npy',
 'Dataset/comcuc/412689.labels.npy',
 'Dataset/comcuc/413167.labels.npy',
 'Dataset/comcuc/477381.labels.npy',
 'Dataset/comcuc/477384.labels.npy',
 'Dataset/comcuc/477588.labels.npy',
 'Da

## 2 Task Outline
Instead of blindly throwing that data at machine learning algorithms, a good data scientist will first do some exploratory data analysis. In your team, we would like you to analyze the data with respect to the following aspects:

1. Annotator agreement: How consistent are the annotations? Do different annota- tors agree in their labels for the same fragment?

2. Label characteristics: How are the class labels distributed? Are the classes unbalanced, and how much? What is the average duration of a species’ calls (or drumming)? Are there large inter-/intra-class variations?

3. Feature characteristics: How are the features distributed? Are there any pairs or subsets of features that seem highly correlated or redundant?

4. Feature/Label agreement: Which features seem useful for classification? Which ones are correlated with the labels?

5. Consequences: Any conclusions you can draw from your analysis for doing clas- sification?


For this, use any kind of statistical computation or visualization that you find enlight- ening. Compile your results into a report, in the form of a slide deck, with at most 7 slides (plus a title slide that includes your team name and member names). Make sure to address all five aspects in your report.
Submit your slide deck as a PDF on Moodle by April 14th. Only one team member needs to submit on behalf of the team.


In [3]:
# 1.
# Annotator agreement: 
# How consistent are the annotations? Do different annotators agree in their labels for the same fragment?

def agreementrate_by_section (section: np.array):
    # Get only the labels which are from annotators
    section = section[1:]

    # Compute the number of cases where all annotators agree
    num_agree = np.sum(section == section[0])

    # Compute the percentage agreement
    percent_agree = num_agree / len(section)

    return percent_agree

def agreementrate_by_file (file: str):

    # load the current file
    file = np.load(file)

    return list(map(agreementrate_by_section, file))

agreement_rates = np.asarray(list(map(agreementrate_by_file, label_files)))

"""
agreement_rates = []

# iterate through the files
for file in files:
    
    # load the current file
    file = np.load(file)

    # create an empty list to store the current agreementrate
    agreement_rate = []

    # iterate through all 100 sections of the file
    for section in file:

        # Get only the labels which are from annotators
        section = section[1:]

        # Compute the number of cases where all annotators agree
        num_agree = np.sum(section == section[0])

        # Compute the percentage agreement
        percent_agree = num_agree / len(section)

        agreement_rate.append(percent_agree)

    agreement_rates.append(agreement_rate)

agreement_rates = np.asarray(agreement_rates)

"""

# What is the mean agreementrate ?

mean_agreement_rate = np.mean(agreement_rates)
print(f"The mean agreement rate is: {mean_agreement_rate}")


# Plot the agreement rates in a heatmap
heatmap = go.Heatmap(z=agreement_rates)

# Define the layout
layout = go.Layout(
    title='Heatmap of Annotations',
    xaxis=dict(title='Annotations'),
    yaxis=dict(title='Files'),
    height= 1000
)

# Define the figure
fig = go.Figure(data=[heatmap], layout=layout)

# Display the figure
fig.show()

The mean agreement rate is: 0.8973675396825397


In [4]:
# 2. 
# Label characteristics: 
# How are the class labels distributed? 
# Are the classes unbalanced, and how much? 
# What is the average duration of a species’ calls (or drumming)? Are there large inter-/intra-class variations?



# How are the class labels distributed?


In [5]:
def maj_label_by_section (section: np.array):
    return section[0]

def maj_label_by_file (file: str):

    # load the current file
    file = np.load(file)

    return list(map(maj_label_by_section, file))

maj_label = np.asarray(list(map(maj_label_by_file, label_files)))


# Plot the maj_label in a heatmap
heatmap = go.Heatmap(z=agreement_rates)

# Define the layout
layout = go.Layout(
    title='Heatmap of Annotations',
    xaxis=dict(title='Annotations'),
    yaxis=dict(title='Files'),
    height= 1000
)

# Define the figure
fig = go.Figure(data=[heatmap], layout=layout)

# Display the figure
fig.show()

In [6]:
def split_into_bins(lst, n):
    bin_size = len(lst) // n
    bins = [(i * bin_size, (i + 1) * bin_size) for i in range(n)]
    bins[-1] = (bins[-1][0], len(lst)) # make sure last bin includes any remaining elements
    return bins

def label_distributions(files, bins):

    label_distributions = []

    # iterate through the files
    for file in files:
        
        # load the current file
        file = np.load(file)

        # create an empty list to store the current agreementrate
        label_freq = []

        # majority label 
        majority_labels = file[:, 0]

        # ditributions based on bins
        bins_ = split_into_bins(majority_labels, bins)

        # iterate through the bins and compute the label
        for bin in bins_:

            # compute the amount of the occuring label in the bin
            label_freq.append(np.count_nonzero(majority_labels[bin[0]:bin[1]]))
            
        label_distributions.append(label_freq)
    
    return label_distributions

label_distributions_ = np.asarray(label_distributions(label_files, 20))

# Plot the maj_label in a heatmap
heatmap = go.Heatmap(z=label_distributions_)

# Define the layout
layout = go.Layout(
    title='Heatmap of Annotations',
    xaxis=dict(title='Annotations'),
    yaxis=dict(title='Files'),
    height= 1000
)

# Define the figure
fig = go.Figure(data=[heatmap], layout=layout)

# Display the figure
fig.show()

In [7]:
def label_distributions(directory, bins):
    label_distributions = []
    files = glob.glob(os.path.join('Dataset', directory, '*.labels.npy'))

    for file in files:
        # load the current file
        file_data = np.load(file)

        # create an empty list to store the current label distribution
        label_dist = []

        # get the majority labels
        majority_labels = file_data[:, 0]

        # split the majority labels into bins
        bin_size = len(majority_labels) // bins
        bins_ = [(i * bin_size, (i + 1) * bin_size) for i in range(bins)]
        bins_[-1] = (bins_[-1][0], len(majority_labels))

        # compute the label distribution for each bin
        for bin in bins_:
            bin_labels = majority_labels[bin[0]:bin[1]]
            label_dist.append(np.count_nonzero(bin_labels == bin_labels[0]))

        label_distributions.append(label_dist)

    return label_distributions

bins = 10
directories = ['comcuc', 'cowpig1', 'eucdov', 'eueowl1', 'grswoo', 'tawowl1']
figs = []

for directory in directories:
    label_distributions_ = np.asarray(label_distributions(directory, bins))
    heatmap = go.Heatmap(z=label_distributions_, showscale=True)
    title = f'Heatmap of Annotations for {directory}'
    layout = go.Layout(title=title, xaxis_title='Annotations', yaxis_title='Files')
    fig = go.Figure(data=[heatmap], layout=layout)
    figs.append(fig)

fig = make_subplots(rows=2, cols=3, subplot_titles=directories)
for i, f in enumerate(figs):
    fig.add_trace(f.data[0], row=(i//3)+1, col=(i%3)+1)

fig.update_layout(height=1200, title_text='Annotations Heatmaps')
fig.show()


In [8]:
data_files = glob.glob("Dataset/**/**.npy", recursive=True)
data_files = [file for file in data_files if not ".label" in file]
data_files.sort()

data_files

['Dataset/comcuc/133067.npy',
 'Dataset/comcuc/182336.npy',
 'Dataset/comcuc/189031.npy',
 'Dataset/comcuc/240515.npy',
 'Dataset/comcuc/25627.npy',
 'Dataset/comcuc/25629.npy',
 'Dataset/comcuc/263382.npy',
 'Dataset/comcuc/26379.npy',
 'Dataset/comcuc/277236.npy',
 'Dataset/comcuc/310525.npy',
 'Dataset/comcuc/318234.npy',
 'Dataset/comcuc/318483.npy',
 'Dataset/comcuc/319117.npy',
 'Dataset/comcuc/369543.npy',
 'Dataset/comcuc/369573.npy',
 'Dataset/comcuc/369578.npy',
 'Dataset/comcuc/371677.npy',
 'Dataset/comcuc/371679.npy',
 'Dataset/comcuc/374165.npy',
 'Dataset/comcuc/375811.npy',
 'Dataset/comcuc/412687.npy',
 'Dataset/comcuc/412688.npy',
 'Dataset/comcuc/412689.npy',
 'Dataset/comcuc/413167.npy',
 'Dataset/comcuc/477381.npy',
 'Dataset/comcuc/477384.npy',
 'Dataset/comcuc/477588.npy',
 'Dataset/comcuc/478127.npy',
 'Dataset/comcuc/501461.npy',
 'Dataset/comcuc/519974.npy',
 'Dataset/comcuc/53835.npy',
 'Dataset/comcuc/546326.npy',
 'Dataset/comcuc/555661.npy',
 'Dataset/comc

In [11]:
exemplary_data_file = np.load('Dataset/comcuc/133067.npy')
exemplary_data_file

array([[1.6678293e-01, 7.9440288e-02, 4.9095706e+02, ..., 3.2615812e+00,
        3.1755190e+00, 1.3717393e+00],
       [2.4142019e-01, 2.6629858e-02, 4.6741357e+02, ..., 2.3783600e+00,
        1.9217482e+00, 2.0721755e+00],
       [2.6450893e-01, 2.3550166e-02, 5.1184506e+02, ..., 3.0581174e+00,
        1.7401990e+00, 1.9776495e+00],
       ...,
       [3.2463726e-01, 4.7174446e-02, 4.7790024e+03, ..., 4.0433292e+00,
        2.3245735e+00, 2.4929183e+00],
       [1.4118303e-01, 7.1851298e-02, 5.7130255e+02, ..., 3.1507854e+00,
        1.8259456e+00, 8.8204002e-01],
       [1.2723215e-01, 9.2595160e-02, 5.8492908e+02, ..., 4.5093136e+00,
        4.0177093e+00, 1.4633849e+00]], dtype=float32)

In [9]:
# 3. 
# Feature characteristics: 
# How are the features distributed? 
# Are there any pairs or subsets of features that seem highly correlated or redundant?