In [91]:
import pandas as pd
import numpy as np
import altair as alt
import json
import krippendorff
alt.data_transformers.enable('default', max_rows=10000)

DataTransformerRegistry.enable('default')

# Exploration of human-made annotations 
## Experiment description
This notebook provides an overview of the annotations collected in our crowdsourcing experiment. The dataset is located at `data/raw/annotation_results.csv`. A list of the lyrics pairs that were presented to the participants is also included in the same folder at `data/raw/lyrics_pairs.csv`.

In the experiment, we collected 8325 annotations for 2775 pairs of song lyrics. Each pair was assigned to three participants that were randomly chosen from a pool of 63. The participants were asked to estimate the similarity of the two lyrics using a 6-point Likert scale defined as: 

- 0: "Completamente diferente" (Completely different)
- 1: "Apenas existe similitud" (Barely any similarity) 
- 2: "Poca similitud" (Little similarity)
- 3: "Similitud básica" (Basic similarity)
- 4: "Similitud notable/Faltan detalles" (Notable Similarity / Missing Details)
- 5: "Similitud sobresaliente" (Outstanding similarity)


## Dataset loading and preprocessing

First, we load the dataset and the lyrics pairs from the Excel file and convert them to a JSON file. 

In [92]:
df = pd.read_excel('../data/raw/DetailedResults.xlsx', sheet_name='Sheet1')

In [93]:
# convert dataframe to json object
data = json.loads(df.to_json(orient='records'))

In [94]:
# 
for row in data:
    result_str = row['Result']
    result_dict = json.loads(result_str)
    row['Result'] = result_dict

    song_pair_info_str = row['SongPairInfo']
    song_pair_info_dict = json.loads(song_pair_info_str)
    row['SongPairInfo'] = song_pair_info_dict

In [95]:
# Now save data to json file
with open('../data/raw/detail_results.json', 'w') as f:
    # save formatted
    json.dump(data, f, indent=4)

In [96]:
# Keep only the fields we need
clean_data = []
for item in data:
    new_item = {
        'annotator_id': item['JobMemberId'],
        'value': item['Result']['LikertRating']['value'],
        'sim_rating': int(item['Result']['LikertRating']['key']),
        'id1': item['SongPairInfo']['id_1'],
        'id2': item['SongPairInfo']['id_2']
    }
    clean_data.append(new_item) 

# Save to json file
with open('../data/processed/clean_detail_results.json', 'w') as f:
    # save formatted
    json.dump(clean_data, f, indent=4)

## Data Processing
In this part, we will process the data to obtain high-quality annotations. We define a high-quality annotation as an annotation in which at least 2/3 of the annotators agree on the same score for a given pair of lyrics. In order to increase the quality of our data, if only 2/3 of the annotators agree on the same score, the third one has to be within 1 point of the other two (this is, within the `[-1, 1]` interval from the mode).

In [97]:
# First we load the clean data from the json file into a dataframe
df = pd.read_json('../data/processed/clean_detail_results.json')

# Set a unique string as the pair id 
df['pair_id'] = df['id1'].astype('str') + '_' + df['id2'].astype('str')

In [98]:
def print_units_and_annotators(df):
    unit_n = len(df['pair_id'].unique())
    annotator_n = len(df['annotator_id'].unique())
    print(f'Number of units: {unit_n}')
    print(f'Number of annotators: {annotator_n}')

print_units_and_annotators(df)

Number of units: 2775
Number of annotators: 63


In [99]:
def strict_criterium(row):
    if (len(row['sim_rating'].unique()) > 2):
        # Complete disagreement, discard
        return False
    elif (len(row['sim_rating'].unique()) == 1):
        # Complete agreement, keep
        return True
    else:
        # Partial agreement. Check if the rating from the disagreeing annotator falls within
        # the +/- 1 range of the other two
        # Get the mode
        mode = row['sim_rating'].mode()[0]
        if mode == 0:
            # In order to label a pair as completely dissimilar, all annotators must agree
            return False
        if mode > 0 and 0 in row['sim_rating'].unique():
            # And likewise, to label a pair as similar, all annotators must have given a positive rating
            return False
        # Get the value that is not the mode
        other_val = row['sim_rating'].unique()[row['sim_rating'].unique() != mode][0]
        # Check if the other value is within the +/- 1 range of the mode
        if (other_val in [mode-1, mode+1]):
            return True
        else:
            return False

# Keep only those pairs in which two or more annotators gave the same rating
filt_df = df.groupby('pair_id').filter(strict_criterium)

In [100]:
# Your DataFrame and groupby operation (assuming filt_df is your DataFrame)
grouped_sim_ratings = filt_df.groupby('pair_id')['sim_rating'].value_counts()

# Get unique pair_ids
unique_pair_ids = filt_df['pair_id'].unique()

# Randomly sample 5 unique pair_ids
sampled_pair_ids = pd.Series(unique_pair_ids).sample(4, random_state=42).tolist()

# Filter the original DataFrame to include only the sampled pair_ids
sampled_df = filt_df[filt_df['pair_id'].isin(unique_pair_ids)]

# Group the sampled_df by pair_id and display the 'sim_rating' values for each row in the group
sampled_grouped_sim_ratings = sampled_df.groupby('pair_id')['sim_rating']

# Display the sim_rating values for each group
for pair_id, group in sampled_grouped_sim_ratings:
    print(f"Pair ID: {pair_id}")
    print("Sim Ratings:")
    print(group)
    print()

Pair ID: 10052_10842
Sim Ratings:
1232    2
4190    1
7934    1
Name: sim_rating, dtype: int64

Pair ID: 10052_11142
Sim Ratings:
3062    1
6069    1
6145    1
Name: sim_rating, dtype: int64

Pair ID: 10052_21856
Sim Ratings:
4989    1
7615    1
8010    1
Name: sim_rating, dtype: int64

Pair ID: 10052_24450
Sim Ratings:
583     1
725     2
3432    2
Name: sim_rating, dtype: int64

Pair ID: 10052_25047
Sim Ratings:
1857    0
2397    0
5938    0
Name: sim_rating, dtype: int64

Pair ID: 10052_29590
Sim Ratings:
4758    0
6077    0
7574    0
Name: sim_rating, dtype: int64

Pair ID: 10842_11068
Sim Ratings:
2529    2
5750    1
7247    1
Name: sim_rating, dtype: int64

Pair ID: 10842_12476
Sim Ratings:
6285    0
7141    0
7337    0
Name: sim_rating, dtype: int64

Pair ID: 10842_29590
Sim Ratings:
4803    0
5079    0
6195    0
Name: sim_rating, dtype: int64

Pair ID: 10842_34125
Sim Ratings:
4049    0
5008    0
7353    0
Name: sim_rating, dtype: int64

Pair ID: 10842_38000
Sim Ratings:
6493  

In [101]:
grouped_sim_ratings

pair_id      sim_rating
10052_10842  1             2
             2             1
10052_11142  1             3
10052_21856  1             3
10052_24450  2             2
                          ..
9975_22822   3             1
9975_24450   3             2
             2             1
9975_34125   0             3
9975_6632    0             3
Name: sim_rating, Length: 937, dtype: int64

In [102]:
random_sample = grouped_sim_ratings.sample(n=10, random_state=42, replace=True, axis=0)
random_sample.shape

(10,)

In [103]:
# Sample 10 indices and show all rows for those indices
random_indices = grouped_sim_ratings.index.get_level_values(0).unique().sample(n=10, random_state=42)
for index in random_indices:
    rows = filt_df[(filt_df['pair_id'] == index)]
    print(rows)

AttributeError: 'Index' object has no attribute 'sample'

In [None]:
# This dataframe only contains high-quality annotations. We can test this using Krippendorff's alpha


## Replicating 0.66

In [133]:
def filter_fun_66(x):
    """
    This function implements the 66% filter. It returns True if the pair has at least 2 coincident ratings
    when rating is larger than 0, and 3 coincident ratings when rating is 0.
    """
    if (len(x['sim_rating'].unique()) > 2):
        # Complete disagreement, discard
        return False
    elif (len(x['sim_rating'].unique()) == 1):
        # Complete agreement, keep
        return True
    else:
        # Partial agreement. Check if the mode is greater than 0
        mode = x['sim_rating'].mode()[0]
        if mode == 0:
            return False
        # Finally, check if the third value is within the [-1, 1] range of the mode
        other_val = x['sim_rating'].unique()[x['sim_rating'].unique() != mode][0]
        if (other_val in [mode-1, mode+1]):
            return True
        else:
            return False
        
# Keep only those pairs in which two or more annotators gave the same rating
filt_df = df.groupby('pair_id').filter(filter_fun_66)

In [134]:
annotations_df, ratings = calculate_krippendorff_alpha(filt_df)

0.5501199925347022


## Krippendorff's alpha
Krippendorff's alpha is a commonly used measure of inter-rater reliability for nominal data. It provides a way to assess the degree of correspondence between the values assigned to a set of items by different raters. Unlike other measures, Krippendorff's alpha is a generalization of Fleiss' kappa that can be applied to more than two raters.

The alpha score ranges from 0 to 1, with 0 indicating no agreement and 1 indicating complete agreement. The closer the score is to 1, the better the agreement between raters. As such, it's often used to evaluate the consistency and reliability of data obtained from multiple sources.

It's important to note that Krippendorff's alpha is a measure of the reliability of the data, not of the raters themselves. This means it primarily assesses the consistency of the values assigned to items by the raters, rather than the quality of the raters or their individual abilities.

In [104]:
def calculate_krippendorff_alpha(df):
    """Calculate Krippendorff's alpha for the given annotations dataframe"""
    def convert_to_ratings(df):
        unit_n = len(df['pair_id'].unique())
        annotator_n = len(df['annotator_id'].unique())
        ratings = np.full((annotator_n, unit_n), np.nan)
        for i, row in df.iterrows():
            ratings[row['annotator_idx'], row['unit_idx']] = row['sim_rating']
        return ratings

    annotations_df = df.copy()

    annotator2idx = {annotator: idx for idx, annotator in enumerate(annotations_df['annotator_id'].unique())}
    annotations_df['annotator_idx'] = annotations_df['annotator_id'].map(annotator2idx)

    unit2idx = {unit: idx for idx, unit in enumerate(annotations_df['pair_id'].unique())}
    annotations_df['unit_idx'] = annotations_df['pair_id'].map(unit2idx)
        
    ratings = convert_to_ratings(annotations_df)
    print(krippendorff.alpha(ratings, level_of_measurement='ordinal'))
    return annotations_df, ratings

In [106]:
# Increase sim_rating by 1
filt_df['sim_rating'] = filt_df['sim_rating'] + 1

In [107]:
annotations_df, ratings = calculate_krippendorff_alpha(filt_df)

0.9010355393690141


In [132]:
filt_df.to_csv('../data/processed/filtered_detail_results.csv', index=False)

In [110]:
filt_df.shape

(2028, 6)

## Results

In [112]:
# Read annnotations from '../datasets/annotations_no_offenders.csv'
nof_df = pd.read_csv('../data/raw/annotations_no_offenders.csv')


In [114]:
_, _ = calculate_krippendorff_alpha(nof_df)

0.6626830941095202


In [120]:
no_df_set = set(nof_df['pair_id'].to_list())

In [121]:
filt_df_set = set(filt_df['pair_id'].to_list())

In [125]:
# Generate intersection and difference sets
intersection = no_df_set.intersection(filt_df_set)
difference = no_df_set.difference(filt_df_set)
print(len(filt_df_set), len(no_df_set), len(intersection), len(difference))

676 814 306 508


In [128]:
nof_df.groupby("pair_id").describe()

Unnamed: 0_level_0,id1,id1,id1,id1,id1,id1,id1,id1,id2,id2,...,annotator_id,annotator_id,sim_rating,sim_rating,sim_rating,sim_rating,sim_rating,sim_rating,sim_rating,sim_rating
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
pair_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
10052_12476,2.0,10052.0,0.0,10052.0,10052.0,10052.0,10052.0,10052.0,2.0,12476.0,...,3748279.50,3748351.0,2.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
10052_14975,3.0,10052.0,0.0,10052.0,10052.0,10052.0,10052.0,10052.0,3.0,14975.0,...,3748025.00,3748065.0,3.0,2.666667,1.154701,2.0,2.0,2.0,3.0,4.0
10052_15357,3.0,10052.0,0.0,10052.0,10052.0,10052.0,10052.0,10052.0,3.0,15357.0,...,3748168.50,3748352.0,3.0,0.666667,0.577350,0.0,0.5,1.0,1.0,1.0
10052_21856,2.0,10052.0,0.0,10052.0,10052.0,10052.0,10052.0,10052.0,2.0,21856.0,...,3748349.50,3748352.0,2.0,1.000000,0.000000,1.0,1.0,1.0,1.0,1.0
10052_24450,2.0,10052.0,0.0,10052.0,10052.0,10052.0,10052.0,10052.0,2.0,24450.0,...,3747849.00,3747985.0,2.0,2.000000,0.000000,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9975_24450,2.0,9975.0,0.0,9975.0,9975.0,9975.0,9975.0,9975.0,2.0,24450.0,...,3748043.75,3748065.0,2.0,3.000000,0.000000,3.0,3.0,3.0,3.0,3.0
9975_34125,2.0,9975.0,0.0,9975.0,9975.0,9975.0,9975.0,9975.0,2.0,34125.0,...,3748370.50,3748380.0,2.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
9975_48594,2.0,9975.0,0.0,9975.0,9975.0,9975.0,9975.0,9975.0,2.0,48594.0,...,3748373.00,3748380.0,2.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
9975_6632,3.0,9975.0,0.0,9975.0,9975.0,9975.0,9975.0,9975.0,3.0,6632.0,...,3747707.00,3747985.0,3.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
