In [1]:
import pandas as pd
import altair as alt
import numpy as np
from utils import print_units_and_annotators, calculate_krippendorff_alpha
# Select Altair's 'default' data transformer with max_rows=None, i.e. without
# a cap on the number of rows a chart's data may contain.
alt.data_transformers.enable('default', max_rows=None)

DataTransformerRegistry.enable('default')

# Exploration of human-made annotations 
## Experiment description
This notebook provides an overview of the annotations collected in our crowdsourcing experiment. The raw dataset is located at `data/raw/annotation_results.csv`, and the list of lyrics pairs that were presented to the participants is included in the same folder at `data/raw/lyrics_pairs.csv`. The cells below read the processed versions of the annotations from `data/processed/`.

In the experiment, we collected 8325 annotations for 2775 pairs of song lyrics. Each pair was assigned to three participants who were randomly chosen from a pool of 63. The participants were asked to rate the similarity of the two lyrics on a 6-point Likert scale defined as follows:

- 0: "Completamente diferente" (Completely different)
- 1: "Apenas existe similitud" (Barely any similarity) 
- 2: "Poca similitud" (Little similarity)
- 3: "Similitud básica" (Basic similarity)
- 4: "Similitud notable/Faltan detalles" (Notable similarity / Missing details)
- 5: "Similitud sobresaliente" (Outstanding similarity)

In [102]:
# Load the filtered annotation results into the working dataframe
annotations_csv = '../data/processed/filtered_detailed_results.csv'
df = pd.read_csv(annotations_csv)

In [103]:
# Display the loaded annotations dataframe
df

Unnamed: 0,annotator_id,value,sim_rating,id1,id2,pair_id
0,3747424,Apenas existe similitud,2,14251,76075,14251_76075
1,3747424,Completamente diferente,1,29094,33088,29094_33088
2,3747427,Poca similitud,3,37416,59899,37416_59899
3,3747424,Poca similitud,3,31841,3071,31841_3071
4,3747427,Apenas existe similitud,2,37741,35331,37741_35331
...,...,...,...,...,...,...
2023,3748376,Apenas existe similitud,2,48054,19354,48054_19354
2024,3748378,Poca similitud,3,69790,38000,69790_38000
2025,3748394,Similitud Sobresaliente,6,25639,16872,25639_16872
2026,3747469,Poca similitud,3,33088,24450,33088_24450


In [104]:
# Shift the ratings from the stored 1-6 coding to the 0-5 Likert scale used in
# this notebook. The unshifted values are kept in `sim_rating_raw` and the
# shifted column is always derived from them, so re-running this cell does not
# subtract 1 a second time.
if 'sim_rating_raw' not in df.columns:
    df['sim_rating_raw'] = df['sim_rating'].astype(int)
df['sim_rating'] = df['sim_rating_raw'] - 1

In [105]:
# Report the number of units and annotators found in df (helper imported from utils)
print_units_and_annotators(df)

Number of units: 676
Number of annotators: 58


## Plotting

First, plot the number of annotations made by each annotator


In [106]:
# Bar chart of the number of annotations per annotator, sorted by descending count
annotator_axis = alt.X('annotator_id:N', sort='-y')
annotation_count = alt.Y('count()')
alt.Chart(df.reset_index()).mark_bar().encode(
    x=annotator_axis,
    y=annotation_count,
    tooltip=['annotator_id', 'count()'],
)

In [107]:
# Load the clean data from the JSON file into a dataframe
df_orig = pd.read_json('../data/processed/clean_detailed_results.json')

# Set a unique string as the pair id, built from df_orig's OWN id columns.
# Building it from `df` (a different frame) would align on the row index and
# attach ids from unrelated rows, or NaN where the index has no match.
# NOTE(review): assumes df_orig has `id1`/`id2` columns like df does - confirm.
df_orig['pair_id'] = df_orig['id1'].astype('str') + '_' + df_orig['id2'].astype('str')

Then, plot the similarity scores


In [115]:
# Side-by-side histograms of the similarity ratings: the clean data (df_orig,
# blue) next to the filtered annotations (df, red), on a shared count axis.
# Bars are the conventional mark for count histograms, and each panel is
# titled so the two distributions can be told apart.
# NOTE(review): df['sim_rating'] was shifted to 0-5 above; confirm that
# df_orig['sim_rating'] is on the same scale before comparing the panels.
rating_axis = alt.X('sim_rating:O', title='Similarity rating')
annotation_count = alt.Y('count()', title='Number of annotations')

alt.hconcat(
    alt.Chart(df_orig).mark_bar(
        color="steelblue"
    ).encode(
        x=rating_axis,
        y=annotation_count,
    ).properties(title='Clean annotations (df_orig)'),
    alt.Chart(df).mark_bar(
        color="firebrick"
    ).encode(
        x=rating_axis,
        y=annotation_count,
    ).properties(title='Filtered annotations (df)'),
).resolve_scale(y='shared')


In [134]:
# Tally how many annotations received each rating
rating_counts = df['sim_rating'].value_counts()
rating_counts

0    837
1    705
2    360
3     88
4     34
5      4
Name: sim_rating, dtype: int64

In [16]:
# Load the lyrics pairs that were presented to the participants.
# Bound to its own name: `lyrics_df` is used further down for the full lyrics
# table, and sharing the name meant that running the cells out of order
# replaced that table (and its `length` column) with this one.
lyrics_pairs_df = pd.read_csv('../data/raw/lyrics_pairs.csv')

In [8]:
# Average the ratings given to each pair; pair_id is returned as a regular column
avg_ratings = (
    df.groupby('pair_id')['sim_rating']
    .agg('mean')
    .reset_index()
)

In [9]:
# Break pair_id apart at "_" into two integer columns, song_1 and song_2
avg_ratings[['song_1', 'song_2']] = (
    avg_ratings['pair_id']
    .str.split('_', expand=True)
    .astype(int)
)

In [10]:
# Display the per-pair average ratings together with the split song ids
avg_ratings

Unnamed: 0,pair_id,sim_rating,song_1,song_2
0,10052_10842,1.333333,10052,10842
1,10052_11142,1.000000,10052,11142
2,10052_21856,1.000000,10052,21856
3,10052_24450,1.666667,10052,24450
4,10052_25047,0.000000,10052,25047
...,...,...,...,...
671,8863_55236,1.333333,8863,55236
672,9975_22822,2.333333,9975,22822
673,9975_24450,2.666667,9975,24450
674,9975_34125,0.000000,9975,34125


Now depict average similarity scores by pair.

In [11]:
# Heatmap of the average rating for each (song 1, song 2) pair
song_1_axis = alt.X('song_1:O', axis=alt.Axis(title='Song 1'))
song_2_axis = alt.Y('song_2:O', axis=alt.Axis(title='Song 2'))
avg_rating_color = alt.Color(
    'sim_rating:Q',
    scale=alt.Scale(scheme='viridis'),
    title='avg. rating',
)

pair_heatmap = alt.Chart(avg_ratings).mark_rect().encode(
    x=song_1_axis,
    y=song_2_axis,
    color=avg_rating_color,
)

# Tiny axis labels because every song id gets its own tick
pair_heatmap.properties(
    height=400,
    width=400,
    title='Average similarity ratings',
).configure_axis(labelFontSize=5)

Show the association between the lengths of the songs that make up the pairs.


In [12]:
# Read the full lyrics table (tab-separated; read_table uses '\t' by default)
lyrics_df = pd.read_table('../data/raw/full_lyrics.tsv')


In [13]:
# Lyric length = number of whitespace-separated tokens in the text
tokens_per_song = lyrics_df['text'].str.split()
lyrics_df['length'] = tokens_per_song.str.len()

In [14]:
# Index the lyrics by song_id so that lengths can be looked up by song id.
# Guarded and rebound rather than done in place: once song_id has been moved
# into the index, re-running this cell is a no-op instead of a KeyError.
if 'song_id' in lyrics_df.columns:
    lyrics_df = lyrics_df.set_index('song_id')

In [15]:
# Histogram of lyric lengths in words, with at most 20 bins
length_axis = alt.X(
    'length:Q',
    axis=alt.Axis(title='length (in words)'),
    bin=alt.Bin(maxbins=20),
)
item_count = alt.Y('count()', axis=alt.Axis(title='Number of items'))

alt.Chart(lyrics_df).mark_bar().encode(x=length_axis, y=item_count)

In [19]:
# Look up the length of both songs of every pair (lyrics_df is indexed by
# song id) and store them as length_1 and length_2.
length_by_song = lyrics_df['length']
for side in ('1', '2'):
    avg_ratings[f'length_{side}'] = avg_ratings[f'song_{side}'].map(length_by_song)

In [20]:
# Scatter the two song lengths of each pair against each other,
# coloured by the pair's average rating
length_1_axis = alt.X('length_1:Q', axis=alt.Axis(title='Length of song 1 (in words)'))
length_2_axis = alt.Y('length_2:Q', axis=alt.Axis(title='Length of song 2 (in words)'))
avg_rating_color = alt.Color(
    'sim_rating:Q',
    scale=alt.Scale(scheme='viridis'),
    title='avg. rating',
)

length_scatter = alt.Chart(avg_ratings).mark_circle().encode(
    x=length_1_axis,
    y=length_2_axis,
    color=avg_rating_color,
)

length_scatter.properties(
    height=400,
    width=400,
    title='Average similarity ratings',
).configure_axis(labelFontSize=5)
    

In [21]:
# Display avg_ratings again, now including the length_1 / length_2 columns
avg_ratings

Unnamed: 0,pair_id,sim_rating,song_1,song_2,length_1,length_2
0,10052_10842,1.333333,10052,10842,115,31
1,10052_11142,1.000000,10052,11142,115,105
2,10052_21856,1.000000,10052,21856,115,78
3,10052_24450,1.666667,10052,24450,115,108
4,10052_25047,0.000000,10052,25047,115,49
...,...,...,...,...,...,...
671,8863_55236,1.333333,8863,55236,64,61
672,9975_22822,2.333333,9975,22822,139,78
673,9975_24450,2.666667,9975,24450,139,108
674,9975_34125,0.000000,9975,34125,139,17


In [22]:
# Write the per-pair averages to the processed data folder, without the row index
avg_ratings_csv = '../data/processed/avg_ratings.csv'
avg_ratings.to_csv(avg_ratings_csv, index=False)

In [39]:
 # Create a heatmap of avg ratings per song length

# Both layers share the same binned axes: each song length is cut into at most 8 bins
length_1_binned = alt.X(
    'length_1:Q',
    axis=alt.Axis(title='Length of song 1 (in words)'),
    bin=alt.Bin(maxbins=8),
)
length_2_binned = alt.Y(
    'length_2:Q',
    axis=alt.Axis(title='Length of song 2 (in words)'),
    bin=alt.Bin(maxbins=8),
)
base = alt.Chart(avg_ratings).encode(x=length_1_binned, y=length_2_binned)

# One circle per length cell: colour is the mean of the pair averages,
# size is the number of pairs falling in the cell
mean_rating_color = alt.Color(
    'mean(sim_rating)',
    scale=alt.Scale(scheme='viridis'),
    title='mean avg. rating',
)
heatmap = base.mark_circle().encode(color=mean_rating_color, size='count()')

# Print the pair count in white on top of each circle
text = base.mark_text(baseline='middle', size=10).encode(
    text=alt.Text('count()'),
    color=alt.value('white'),
)

layered = heatmap + text
layered.properties(
    height=400,
    width=400,
    title='Average similarity ratings',
).configure_axis(labelFontSize=5)

In [24]:
# Combined length of the two songs in each pair
avg_ratings['length_sum'] = avg_ratings['length_1'].add(avg_ratings['length_2'])

In [25]:
# Scatter the combined length of each pair against its average rating
combined_length_axis = alt.X(
    'length_sum:Q',
    axis=alt.Axis(title='Length of song 1 + length of song 2 (in words)'),
)
avg_rating_axis = alt.Y('sim_rating:Q', axis=alt.Axis(title='Average similarity rating'))

length_vs_rating = alt.Chart(avg_ratings).mark_circle().encode(
    x=combined_length_axis,
    y=avg_rating_axis,
)

length_vs_rating.properties(
    height=400,
    width=400,
    title='Average similarity ratings',
).configure_axis(labelFontSize=5)


In [48]:
# Cut length_sum into 8 equal-width bins and store each bin's interval as a
# plain string label
length_bins = pd.cut(avg_ratings['length_sum'], bins=8)
avg_ratings['length_sum_bin'] = length_bins.astype(str)


In [49]:
# Number of pairs per length bin, ordered by bin label (string order)
pairs_per_bin = avg_ratings['length_sum_bin'].value_counts()
pairs_per_bin.sort_index()

(112.0, 145.0]    126
(12.736, 46.0]     17
(145.0, 178.0]    171
(178.0, 211.0]    128
(211.0, 244.0]     50
(244.0, 277.0]     11
(46.0, 79.0]       47
(79.0, 112.0]     126
Name: length_sum_bin, dtype: int64

In [56]:
# Summary statistics of the average rating within each length bin
ratings_by_length_bin = avg_ratings.groupby('length_sum_bin')['sim_rating']
ratings_by_length_bin.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
length_sum_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"(112.0, 145.0]",126.0,0.753968,0.750915,0.0,0.0,1.0,1.333333,3.333333
"(12.736, 46.0]",17.0,0.156863,0.646762,0.0,0.0,0.0,0.0,2.666667
"(145.0, 178.0]",171.0,1.1423,0.884077,0.0,0.5,1.333333,1.333333,4.333333
"(178.0, 211.0]",128.0,1.46875,0.885815,0.0,1.0,1.333333,1.666667,4.666667
"(211.0, 244.0]",50.0,1.513333,0.83084,0.0,1.0,1.333333,1.666667,4.333333
"(244.0, 277.0]",11.0,1.575758,0.559581,1.0,1.166667,1.333333,1.833333,2.666667
"(46.0, 79.0]",47.0,0.049645,0.240558,0.0,0.0,0.0,0.0,1.333333
"(79.0, 112.0]",126.0,0.306878,0.699972,0.0,0.0,0.0,0.0,4.0


In [95]:

# Plot the mean average rating per combined-length bin, with
# confidence-interval ("ci") error bars, mean points and a connecting line.
#
# `length_sum_bin` holds the bin intervals as strings, so without an explicit
# sort the x axis is ordered lexicographically ("(112.0, 145.0]" comes before
# "(12.736, 46.0]") and the line connects the bins out of numeric order.
# Order the labels by the smallest length_sum each bin contains instead.
length_bin_order = (
    avg_ratings.groupby('length_sum_bin')['length_sum']
    .min()
    .sort_values()
    .index.tolist()
)

# The same sort is passed to every layer so the shared x scale agrees on one order.
error_bars = alt.Chart(avg_ratings).mark_errorbar(ticks=True, extent="ci").encode(
    x=alt.X('length_sum_bin:N',
            sort=length_bin_order,
            axis=alt.Axis(title='Length of song 1 + length of song 2 (in words)')),
    y=alt.Y('sim_rating:Q',
            axis=alt.Axis(title='Average similarity rating')),
    )


points = alt.Chart(avg_ratings).mark_circle().encode(
        x=alt.X('length_sum_bin:N', sort=length_bin_order),
        y=alt.Y('sim_rating:Q', aggregate="mean"),
        color=alt.value('black'),
)

line = alt.Chart(avg_ratings).mark_line().encode(
        x=alt.X('length_sum_bin:N', sort=length_bin_order),
        y=alt.Y('sim_rating:Q', aggregate="mean"),
        color=alt.value('black'),
        opacity=alt.value(1),
        size=alt.value(1)
)

(error_bars + points + line).properties(
        height=400,
        width=400,
        title='Average similarity ratings'
    ).configure_axis(
            labelFontSize=5,
    )


In [83]:
# Plot the mean average rating per combined-length bin as a line, with a
# confidence-interval ("ci") error band around it.
#
# `length_sum_bin` holds the bin intervals as strings, so without an explicit
# sort the x axis is ordered lexicographically ("(112.0, 145.0]" comes before
# "(12.736, 46.0]") and both the band and the line run through the bins out of
# numeric order. Order the labels by the smallest length_sum each bin contains.
length_bin_order = (
    avg_ratings.groupby('length_sum_bin')['length_sum']
    .min()
    .sort_values()
    .index.tolist()
)

# The same sort is passed to both layers so the shared x scale agrees on one order.
error_band = alt.Chart(avg_ratings).mark_errorband(
        extent="ci", borders=True).encode(
    x=alt.X('length_sum_bin:N',
            sort=length_bin_order,
            axis=alt.Axis(
                title='Length of song 1 + length of song 2 (in words)')),
    y=alt.Y('sim_rating:Q',
            axis=alt.Axis(
                title='Average similarity rating'))
)


line = alt.Chart(avg_ratings).mark_line().encode(
        x=alt.X('length_sum_bin:N', sort=length_bin_order),
        y=alt.Y('sim_rating:Q', aggregate="mean"),
        color=alt.value('black'),
        opacity=alt.value(0.5)
)


(error_band + line).properties(
        height=400,
        width=400,
        title='Average similarity ratings',
    ).configure_axis(
            labelFontSize=5,
    )
