# Consolidate & clean SALAMI annotations

## Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Project root: two directory levels above the current working directory.
# NOTE(review): this assumes the kernel is started from the notebook's own
# folder — confirm before running from elsewhere.
ROOT = os.path.join(os.getcwd(), os.pardir, os.pardir)

## Load local modules

In [2]:
from nnssa.constants import *
from nnssa.labels import load_annotations

## Load dataset

In [3]:
# Read the SALAMI metadata table from the intermediate-data directory.
# NOTE(review): the recorded output shows an `Unnamed: 0` column, which
# suggests the CSV was written with its index; consider `index_col=0` if
# downstream consumers do not rely on that column.
salami_csv = os.path.join(ROOT, INTER_DIR, 'salami.csv')
salami = pd.read_csv(salami_csv)
salami.head()

Unnamed: 0,File,Song_duration,Song_title,Artist,Format
0,956,320.0,Revolution,Burden_Hand,mp3
1,958,344.0,Rap_Medley,5point,mp3
2,960,293.0,Woman_Across_The_River,Absolute_Gruv,mp3
3,962,579.0,_,Afrissippi,mp3
4,964,161.0,Encore_Break_Crowd_Noise,Against_Me_,mp3


## Get label paths

In [4]:
def get_label_path(filename, first = True):
    """Build the path to one parsed function-label file of a song.

    The path has the form
    ``ROOT/SALAMI_LABELS/<filename>/parsed/textfile<N>_functions.txt``.

    Args:
        filename: Song identifier; converted with ``str`` and used as the
            per-song directory name.
        first: When truthy, point at ``textfile1_functions.txt``; otherwise
            at ``textfile2_functions.txt``.

    Returns:
        The joined path as a string. The file is not checked for existence.
    """
    file_number = 1 if first else 2
    relative = os.path.join('parsed', f'textfile{file_number}_functions.txt')
    return os.path.join(ROOT, SALAMI_LABELS, str(filename), relative)

In [5]:
# One path column per annotation file; `first` defaults to True, so the
# function can be handed to `map` directly for the first annotation.
salami['First_annotation'] = salami['File'].map(get_label_path)
salami['Second_annotation'] = salami['File'].map(
    lambda song_id: get_label_path(song_id, False)
)

## Drop rows with no annotations

In [6]:
# Record, per song, whether each annotation file is actually present on disk.
salami['First_exists'] = salami['First_annotation'].map(os.path.exists)
salami['Second_exists'] = salami['Second_annotation'].map(os.path.exists)
# A song is kept when AT LEAST ONE annotation file exists (logical OR); only
# songs with neither file are removed. The mask is computed column-wise and
# kept in a local variable, which avoids a row-by-row `apply` and a
# throwaway helper column.
has_annotation = (salami['First_exists'] | salami['Second_exists']).astype(bool)
print(f"Dataset size before removal: {salami.shape[0]}")
# `.copy()` hands later cells an independent frame, so adding columns to it
# does not operate on a slice of the unfiltered table.
salami = salami[has_annotation].copy()
print(f"Dataset size after removal: {salami.shape[0]} (no labels available)")

Dataset size before removal: 468
Dataset size after removal: 439 (no labels available)


## Check label values

In [7]:
# Work on a copy so the raw-label inspection does not add columns to `salami`.
raw = salami.copy()
# load_annotations runs once per row. Its second positional argument is False
# here, whereas the later cleaned pass omits it.
# NOTE(review): presumably False disables label normalisation — confirm
# against nnssa.labels.
annotations = raw.apply(lambda row: load_annotations(row, False, salami=True), axis=1)
# Transpose the per-row results into parallel lists, one per target column
raw["Labels"], raw["Sections"] = [list(column) for column in zip(*annotations)]

  return read_csv(**locals())


In [8]:
# Pool every song's section labels into one flat array and list the
# distinct values that occur in the raw annotations.
pooled_raw_sections = np.concatenate(raw['Sections'].values)
unique = np.unique(pooled_raw_sections.ravel())
unique

array(['&pause', 'Bridge', 'Chorus', 'Coda', 'End', 'Fade-out', 'Head',
       'Instrumental', 'Interlude', 'Intro', 'Main_Theme', 'Outro',
       'Pre-Chorus', 'Pre-Verse', 'Silence', 'Solo', 'Theme',
       'Transition', 'Verse', 'applause', 'banjo', 'break', 'build',
       'crowd_sounds', 'no_function', 'post-chorus', 'silence',
       'spoken_voice', 'stage_sounds', 'stage_speaking', 'variation',
       'voice'], dtype=object)

In [9]:
# Second pass over the annotations, this time without the explicit `False`
# argument used for the raw inspection above.
annotations = raw.apply(lambda row: load_annotations(row, salami=True), axis=1)
# Transpose the per-row results into parallel lists, one per target column
labels, sections = (list(column) for column in zip(*annotations))
# Rebuild `salami` from `raw` instead of mutating it in place. `raw` holds the
# same rows and is never filtered, so re-running this cell does not try to
# assign len(raw) values to an already-shortened frame.
salami = raw.copy()
salami['Labels'], salami['Sections'] = labels, sections
# Filter out songs with very few song sections (keep those with more than 2)
has_enough_sections = salami['Sections'].map(lambda section: section.shape[0] > 2)
salami = salami[has_enough_sections.astype(bool)].copy()
salami.head(1)

Unnamed: 0,File,Song_duration,Song_title,Artist,Format,First_annotation,Second_annotation,First_exists,Second_exists,Labels,Sections
0,956,320.0,Revolution,Burden_Hand,mp3,/Users/admin/Downloads/fypdataset/notebooks/02...,/Users/admin/Downloads/fypdataset/notebooks/02...,True,True,"[28.746303854, 49.357959183, 91.03056689299999...","[intro, verse, chorus, transition, verse, chor..."


In [10]:
# Same check as for the raw labels, now on the cleaned annotations: pool all
# section labels and list the distinct values that remain.
pooled_sections = np.concatenate(salami['Sections'].values)
unique = np.unique(pooled_sections.ravel())
unique

array(['break', 'bridge', 'chorus', 'intro', 'live', 'outro',
       'solo/instrumental', 'transition', 'verse'], dtype=object)

## Save annotations

In [11]:
# The path and existence columns were only bookkeeping for this notebook;
# remove them before persisting the table.
bookkeeping_columns = ['First_annotation', 'Second_annotation', 'First_exists', 'Second_exists']
salami = salami.drop(columns=bookkeeping_columns)
# Pickle the cleaned annotations into the annotations directory.
annotations_pickle = os.path.join(ROOT, ANNOTATIONS_DIR, 'salami.p')
salami.to_pickle(annotations_pickle)