In [30]:
import numpy as np
import os
import glob
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import math


## 2 Task Outline
Instead of blindly throwing that data at machine learning algorithms, a good data scientist will first do some exploratory data analysis. In your team, we would like you to analyze the data with respect to the following aspects:

### 2. Label characteristics: 
### How are the class labels distributed? 
### Are the classes unbalanced, and how much? 
### What is the average duration of a species’ calls (or drumming)? 
### Are there large inter-/intra-class variations?

In [5]:
# Load the Dataset

# Label files, collected recursively below Dataset/, e.g.:
#   'Dataset/comcuc/133067.labels.npy',
#   'Dataset/comcuc/182336.labels.npy',
#   'Dataset/comcuc/189031.labels.npy',
#   'Dataset/comcuc/240515.labels.npy',
#   'Dataset/comcuc/25627.labels.npy',
label_files = sorted(glob.glob("Dataset/**/**labels.npy", recursive=True))

# Every other .npy file below Dataset/ (anything whose path does not contain
# ".label"), e.g.:
#   'Dataset/comcuc/133067.npy',
#   'Dataset/comcuc/182336.npy',
#   'Dataset/comcuc/189031.npy',
#   'Dataset/comcuc/240515.npy',
data_files = sorted(
    path
    for path in glob.glob("Dataset/**/**.npy", recursive=True)
    if ".label" not in path
)

# Species codes; each one is also the name of a sub-directory of Dataset/.
birds = ['comcuc', 'cowpig1', 'eucdov', 'eueowl1', 'grswoo', 'tawowl1']



In [26]:
def get_drumming_precent (file, bird):
    """Return the percentage of rows in a label file that are labelled as ``bird``.

    Parameters
    ----------
    file : str or path-like
        Path to a ``.npy`` label file. Only its first column (the majority
        voted labels) is read.
    bird : str
        Species code, e.g. ``'comcuc'``. It is translated to the integer label
        id used inside the label files. An unknown code has no label id, so no
        row matches and the result is 0.

    Returns
    -------
    float
        Share of rows whose majority label equals the species' label id, in
        percent (0-100). An empty label array yields ``0.0``.
    """
    species_label_ids = {'comcuc': 1,
        'cowpig1': 2,
        'eucdov': 3,
        'eueowl1': 4,
        'grswoo': 5,
        'tawowl1': 6}

    label = species_label_ids.get(bird)

    # load the numpy file (kept in its own name so the path is not overwritten)
    annotations = np.load(file)

    # select the first column with the majority voted labels
    maj_labels = annotations[:, 0]

    # an empty recording would otherwise raise ZeroDivisionError below
    if len(maj_labels) == 0:
        return 0.0

    # compute the percentage where a drumming is labeled
    return np.count_nonzero(maj_labels == label) / len(maj_labels) * 100

# Mean percentage of labelled rows per species, averaged over that species'
# label files. Built from scratch so that no placeholder value can ever end
# up in the plot.
bird_drumming_distributions = {}

for bird in birds:
    percentages = [
        get_drumming_precent(file, bird)
        for file in label_files
        if bird in file
    ]

    # np.mean([]) emits a RuntimeWarning; make the "no files" case explicit.
    bird_drumming_distributions[bird] = (
        np.mean(np.asarray(percentages)) if percentages else float("nan")
    )

# Create the figure with subplots
fig = make_subplots(rows=len(bird_drumming_distributions), cols=1, subplot_titles=list(bird_drumming_distributions.keys()))

# Loop through each key-value pair and add a bar chart subplot
for i, (key, value) in enumerate(bird_drumming_distributions.items()):
    fig.add_trace(
        go.Bar(x=['drumming', 'not drumming'], y=[value, 100-value], name=key),
        row=i+1, col=1
    )

# Set the y-axis range of each subplot to 0-100
fig.update_yaxes(range=[0, 100])


# Set the layout of the figure
fig.update_layout(height=1200, width=800, title_text="Class Label Distribution by Bird")

# Show the figure
fig.show()

In [27]:
# Display the per-species mean percentages computed in the previous cell.
bird_drumming_distributions

{'comcuc': 29.165,
 'cowpig1': 44.77,
 'eucdov': 40.99,
 'eueowl1': 11.62,
 'grswoo': 25.555,
 'tawowl1': 21.11}

In [28]:
def label_distributions(directory, bins):
    """Summarise the majority labels of every label file of one species in bins.

    Each ``Dataset/<directory>/*.labels.npy`` file is split along its first
    axis into ``bins`` consecutive chunks; the last chunk also takes the
    remainder. For each chunk the number of rows whose majority label equals
    the chunk's first label is recorded.

    Parameters
    ----------
    directory : str
        Sub-directory of ``Dataset`` to scan.
    bins : int
        Number of chunks per file; must be at least 1.

    Returns
    -------
    list[list[int]]
        One row per file (sorted by path), each holding ``bins`` counts.
        Chunks that contain no rows (file shorter than ``bins``) count as 0.

    Raises
    ------
    ValueError
        If ``bins`` is smaller than 1.
    """
    if bins < 1:
        raise ValueError("bins must be a positive integer")

    # glob returns files in OS-dependent order; sort so rows are reproducible.
    files = sorted(glob.glob(os.path.join('Dataset', directory, '*.labels.npy')))

    distributions = []
    for path in files:
        # first column holds the majority labels
        majority_labels = np.load(path)[:, 0]
        n_labels = len(majority_labels)

        # equally sized chunks; the last one is stretched to the end of the file
        bin_size = n_labels // bins
        edges = [(i * bin_size, (i + 1) * bin_size) for i in range(bins)]
        edges[-1] = (edges[-1][0], n_labels)

        label_dist = []
        for start, stop in edges:
            bin_labels = majority_labels[start:stop]
            if len(bin_labels) == 0:
                # file has fewer rows than bins: indexing [0] would raise
                label_dist.append(0)
                continue
            # NOTE(review): this counts rows equal to the FIRST label of the
            # chunk, not rows that carry an annotation (label != 0) - confirm
            # this is the intended statistic for the heatmap.
            label_dist.append(np.count_nonzero(bin_labels == bin_labels[0]))

        distributions.append(label_dist)

    return distributions

bins = 20
directories = ['comcuc', 'cowpig1', 'eucdov', 'eueowl1', 'grswoo', 'tawowl1']
figs = []

# One stand-alone heatmap figure per species (rows = files, columns = bins).
for directory in directories:
    label_distributions_ = np.asarray(label_distributions(directory, bins))
    # Referencing a shared colour axis gives the combined figure ONE colorbar
    # and one common scale instead of six colorbars drawn on top of each other.
    heatmap = go.Heatmap(z=label_distributions_, coloraxis='coloraxis')
    title = f'Heatmap of Annotations for {directory}'
    layout = go.Layout(title=title, xaxis_title='Annotations', yaxis_title='Files')
    figs.append(go.Figure(data=[heatmap], layout=layout))

# Grid size follows the number of species instead of a hard-coded 2 x 3.
n_cols = 3
n_rows = math.ceil(len(figs) / n_cols)

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=directories)
for i, f in enumerate(figs):
    fig.add_trace(f.data[0], row=(i // n_cols) + 1, col=(i % n_cols) + 1)

# Only the traces are copied into the grid, so the axis titles of the
# individual figures have to be applied to the subplots again.
fig.update_xaxes(title_text='Annotations')
fig.update_yaxes(title_text='Files')

fig.update_layout(height=1200, title_text='Annotations Heatmaps')
fig.show()


In [39]:
def drumming_lengths (file):
    """Return the mean length of the annotated runs in one label file.

    A run is a maximal stretch of consecutive rows whose majority label (first
    column) is non-zero. Runs that touch the start or the end of the file are
    counted as well.

    Parameters
    ----------
    file : str or path-like
        Path to a ``.npy`` label file.

    Returns
    -------
    float or int
        Mean run length, measured in rows of the label array; ``0`` when the
        file contains no non-zero label at all.
    """
    # load the numpy file
    annotations = np.load(file)

    # select the first column with the majority voted labels
    maj_labels = annotations[:, 0]

    # Pad with a "silent" row on both sides so that every run has an explicit
    # start (0 -> 1) and end (1 -> 0), including a run that lasts until the
    # final row, which a plain scan for the closing zero would miss.
    is_annotated = np.concatenate(([0], (maj_labels != 0).astype(np.int8), [0]))
    changes = np.diff(is_annotated)
    run_starts = np.flatnonzero(changes == 1)
    run_ends = np.flatnonzero(changes == -1)
    consecutive_annotations = run_ends - run_starts

    if consecutive_annotations.size == 0:
        return 0
    return np.mean(consecutive_annotations)

# Mean annotated-run length per species, averaged over that species' files.
bird_drumming_lengths = {}

for bird in birds:
    per_file_lengths = (
        drumming_lengths(file) for file in label_files if bird in file
    )
    # drumming_lengths() returns 0 for a file without any annotated run.
    # Such files contain no call to measure, so they are left out instead of
    # being averaged in as zero-length calls.
    drummings = [length for length in per_file_lengths if length > 0]

    # No usable file for this species: report 0.0 rather than nan + warning.
    bird_drumming_lengths[bird] = (
        np.mean(np.asarray(drummings)) if drummings else 0.0
    )

# Order the species from shortest to longest mean run for the bar chart.
bird_drumming_lengths = dict(sorted(bird_drumming_lengths.items(), key=lambda x: x[1]))


# Create a list of the keys and values in the dictionary
x_values = list(bird_drumming_lengths.keys())
y_values = list(bird_drumming_lengths.values())

# Create a bar plot using Plotly
fig = go.Figure()
fig.add_trace(go.Bar(x=x_values, y=y_values))

# Set the title and axis labels (the values are counts of consecutive
# annotated rows, not seconds)
fig.update_layout(title='Average Drumming Length',
                  xaxis_title='Species',
                  yaxis_title='Mean length (consecutive annotated frames)')

# Show the plot
fig.show()