In [1]:
import pandas as pd

We have created a 20% overlap for the samples to be labeled to assess the inter-rated agreement. Since annotator vary for each label duplicate pair, we can't use the sklearn's solution as it assumes that raters are the same in each case. Averaging the sklearn's  cohen_kappa_score along the pairs is not a viable solution since if you only have one data point per rater pair, the Cohen's kappa score calculation will result in a warning because it expects more variation in the data (or will return 0 if values are different).

We have calculated the Cohen's kappa for the dataset with the (wrong) assumption that there are only two annotators.
Further we improved the calculation with averaging the pairs, for that we have used this [solution](https://towardsdatascience.com/inter-annotator-agreement-2f46c6d37bf3) instead.

In [2]:
# https://towardsdatascience.com/inter-annotator-agreement-2f46c6d37bf3
def cohen_kappa(ann1, ann2):
    """Computes Cohen kappa for pair-wise annotators.
    :param ann1: annotations provided by first annotator
    :type ann1: list
    :param ann2: annotations provided by second annotator
    :type ann2: list
    :rtype: float
    :return: Cohen kappa statistic
    """
    count = 0
    for an1, an2 in zip(ann1, ann2):
        if an1 == an2:
            count += 1
    A = count / len(ann1)  # observed agreement A (Po)

    uniq = set(ann1 + ann2)
    E = 0  # expected agreement E (Pe)
    for item in uniq:
        cnt1 = ann1.count(item)
        cnt2 = ann2.count(item)
        count = ((cnt1 / len(ann1)) * (cnt2 / len(ann2)))
        E += count

    return round((A - E) / (1 - E), 4)

# Loading and preprocessing the data

In [191]:
df_labels = pd.read_json('/Users/katerynaburovova/PycharmProjects/dehumanization/annotation/labels_ready.json')

In [202]:
def extract_emotion(info):
    for classification in info['classifications']:
        if classification['title'] == '–ß–∏ –ø—Ä–∏—Å—É—Ç–Ω—è –≤ —Ç–µ–∫—Å—Ç—ñ –µ–º–æ—Ü—ñ–π–Ω–∞ –æ—Ü—ñ–Ω–∫–∞ —É–∫—Ä–∞—ó–Ω—Ü—ñ–≤?':
            return classification['answer']['title']
    return '–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è'

In [None]:
def extract_emotion(info):
    for classification in info['classifications']:
        if classification['title'] == '–ß–∏ –ø—Ä–∏—Å—É—Ç–Ω—è –≤ —Ç–µ–∫—Å—Ç—ñ –µ–º–æ—Ü—ñ–π–Ω–∞ –æ—Ü—ñ–Ω–∫–∞ —É–∫—Ä–∞—ó–Ω—Ü—ñ–≤?':
            return classification['answer']['title']
    return '–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è'

In [207]:
def extract_dehumanization(info):
    for classification in info['classifications']:
        if classification['title'] == '–ß–∏ –ø—Ä–∏—Ä—ñ–≤–Ω—é—é—Ç—å—Å—è —É–∫—Ä–∞—ó–Ω—Ü—ñ –¥–æ –Ω–µ—ñ—Å—Ç–æ—Ç, —Ç–≤–∞—Ä–∏–Ω —á–∏ –ª—é–¥–µ–π, –ø–æ–∑–±–∞–≤–ª–µ–Ω–∏—Ö –ª—é–¥—Å—å–∫–∏—Ö —Ä–∏—Å (—á–∞—Å—Ç–∫–æ–≤–æ –∞–±–æ –ø–æ–≤–Ω—ñ—Å—Ç—é)?':
            return classification['answer']['title']
    return '–Ω—ñ'

In [210]:
def extract_mention(info):
    for classification in info['classifications']:
        if classification['title'] == '–ß–∏ –∑–≥–∞–¥—É—é—Ç—å—Å—è –≤ —Ç–µ–∫—Å—Ç—ñ —É–∫—Ä–∞—ó–Ω—Ü—ñ –∞–±–æ —â–æ—Å—å —É–∫—Ä–∞—ó–Ω—Å—å–∫–µ?':
            return classification['answer']['title']
    return None

In [208]:
df_labels["Dehumanization"] = df_labels.Label.apply(lambda x: extract_dehumanization(x))

In [205]:
df_labels["Emotion"] = df_labels.Label.apply(lambda x: extract_emotion(x))

In [212]:
df_labels["Mention"] = df_labels.Label.apply(lambda x: extract_mention(x))

In [219]:
#some Labels are empty, dropping those
df_labels[df_labels["Mention"].isna()]['Label'][369]

{'objects': [], 'classifications': [], 'relationships': []}

In [220]:
df_labels = df_labels[~df_labels["Mention"].isna()]

In [222]:
len(df_labels)

4250

In [192]:
# import json
# def extract_class_pairs(json_lst):
#     pairs = []
#     for json_str in json_lst:
#         question_title = json_str['title']
#         answer_title = json_str['answer']['title']
#         pairs.append([question_title,answer_title])
#     return pairs

In [72]:
# df_labels['pairs'] = df_labels['Label'].apply(lambda x: extract_class_pairs(x['classifications']))

In [227]:
df = df_labels[['Created By', 'Mention', 'Dehumanization', 'Emotion', 'External ID']]

In [228]:
value_counts = df['External ID'].value_counts()
unique_rows = df[df['External ID'].isin(value_counts[value_counts == 1].index)]
df.drop(unique_rows.index, inplace=True)
df.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(unique_rows.index, inplace=True)


In [226]:
# for i in range(3):
#     col_name = 'pair{}'.format(i+1)
#     df[col_name] = df['pairs'].apply(lambda x: x[i] if len(x) > i else None)

In [52]:
# df = df[~df['pair1'].isna()]
# df.reset_index(inplace=True, drop=True)

In [109]:
# # text2 = df['pair2'].iloc[9]
# text2 = "['–ß–∏ –ø—Ä–∏—Ä—ñ–≤–Ω—é—é—Ç—å—Å—è —É–∫—Ä–∞—ó–Ω—Ü—ñ –¥–æ –Ω–µ—ñ—Å—Ç–æ—Ç, —Ç–≤–∞—Ä–∏–Ω —á–∏ –ª—é–¥–µ–π, –ø–æ–∑–±–∞–≤–ª–µ–Ω–∏—Ö –ª—é–¥—Å—å–∫–∏—Ö —Ä–∏—Å (—á–∞—Å—Ç–∫–æ–≤–æ –∞–±–æ –ø–æ–≤–Ω—ñ—Å—Ç—é)?', '–Ω—ñ']"

In [162]:
# # text3 = df['pair3'].iloc[24]
# text3 = "['–ß–∏ –ø—Ä–∏—Å—É—Ç–Ω—è –≤ —Ç–µ–∫—Å—Ç—ñ –µ–º–æ—Ü—ñ–π–Ω–∞ –æ—Ü—ñ–Ω–∫–∞ —É–∫—Ä–∞—ó–Ω—Ü—ñ–≤?', '–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è']"

In [164]:
# # text5 = df['pair3'].iloc[23]
# text5 = "['–ß–∏ –ø—Ä–∏—Å—É—Ç–Ω—è –≤ —Ç–µ–∫—Å—Ç—ñ –µ–º–æ—Ü—ñ–π–Ω–∞ –æ—Ü—ñ–Ω–∫–∞ —É–∫—Ä–∞—ó–Ω—Ü—ñ–≤?', '–Ω–µ –º–æ–∂—É –≤–∏–∑–Ω–∞—á–∏—Ç–∏—Å—å –∑ –ø—Ä–∞–≤–∏–ª—å–Ω–æ—é –≤—ñ–¥–ø–æ–≤—ñ–¥–¥—é']"

In [229]:
df

Unnamed: 0,Created By,Mention,Dehumanization,Emotion,External ID
0,snizannabotvin@gmail.com,—Ç–∞–∫,—Ç–∞–∫,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",row_3637.txt
1,snizannabotvin@gmail.com,—Ç–∞–∫,—Ç–∞–∫,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",row_3626.txt
2,snizannabotvin@gmail.com,–Ω—ñ,–Ω—ñ,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",row_3625.txt
3,snizannabotvin@gmail.com,–Ω—ñ,–Ω—ñ,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",row_3468.txt
4,snizannabotvin@gmail.com,–Ω—ñ,–Ω—ñ,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",row_3467.txt
...,...,...,...,...,...
1403,eugene.1martynyuk@gmail.com,—Ç–∞–∫,—Ç–∞–∫,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",row_9.txt
1404,eugene.1martynyuk@gmail.com,–Ω—ñ,–Ω—ñ,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",row_666.txt
1405,eugene.1martynyuk@gmail.com,—Ç–∞–∫,—Ç–∞–∫,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",row_21.txt
1406,eugene.1martynyuk@gmail.com,—Ç–∞–∫,—Ç–∞–∫,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",row_660.txt


In [184]:
# def replace_with(cell, replace_list):
#     if cell is None:
#         return replace_list
#     else:
#         return cell

In [59]:
# df["Dehumanization"] = df.pair2.apply(lambda x: replace_with(x, text2)[1])

In [60]:
# df["Emotion"] = df.pair3.apply(lambda x: replace_with(x, text3)[1])

In [61]:
# df["Mention"] = df['pair1'].apply(lambda x: x[1])

In [230]:
df = df.sort_values(by='External ID')
df.reset_index(inplace=True, drop=True)

# First attempt to calculate with the assumption that annotators are same

In [231]:
df['Rater'] = ['Rater1' if i % 2 == 0 else 'Rater2' for i in range(len(df))]

In [232]:
from sklearn.metrics import cohen_kappa_score

# define a function to calculate Cohen's kappa for a given column
def calculate_kappa(column_name):
    rater1_labels = df[column_name].loc[df['Rater'] == 'Rater1'].tolist()
    rater2_labels = df[column_name].loc[df['Rater'] == 'Rater2'].tolist()
    kappa = cohen_kappa_score(rater1_labels, rater2_labels)
    return kappa

In [233]:
# calculate Cohen's kappa for the 'Dehumanization' column
dehumanization_kappa = calculate_kappa('Dehumanization')
print("Cohen's kappa for Dehumanization: {:.2f}".format(dehumanization_kappa))

# repeat the above for the 'Emotion' and 'Mention' columns
emotion_kappa = calculate_kappa('Emotion')
print("Cohen's kappa for Emotion: {:.2f}".format(emotion_kappa))

mention_kappa = calculate_kappa('Mention')
print("Cohen's kappa for Mention: {:.2f}".format(mention_kappa))

Cohen's kappa for Dehumanization: 0.69
Cohen's kappa for Emotion: 0.70
Cohen's kappa for Mention: 0.92


# Correct calculation under the real conditions of different annotators

In [234]:
def calculate_cohens_kappa(df, col_name):
    # creating the data for question
    df_column = df[['External ID', 'Created By', col_name]]

    # identifying the overlapping samples and raters who labeled them
    overlapping_samples = df_column.groupby('External ID').filter(lambda x: len(x) > 1)
    unique_sample_ids = overlapping_samples['External ID'].unique()
    rater_pairs = []
    for sample_id in unique_sample_ids:
        raters = overlapping_samples[overlapping_samples['External ID'] == sample_id]['Created By'].tolist()
        rater_pairs.append(raters)

    # averaging the kappa scores for pairs
    kappa_scores = []
    for sample_id, rater_pair in zip(unique_sample_ids, rater_pairs):
        sample_data = overlapping_samples[overlapping_samples['External ID'] == sample_id]
        rater1_labels = sample_data[sample_data['Created By'] == rater_pair[0]][col_name].tolist()[0]
        rater2_labels = sample_data[sample_data['Created By'] == rater_pair[1]][col_name].tolist()[0]
        kappa = cohen_kappa(rater1_labels, rater2_labels)
        kappa_scores.append(kappa)
    mean_kappa_score = sum(kappa_scores) / len(kappa_scores)

    return mean_kappa_score

In [235]:
calculate_cohens_kappa(df, 'Dehumanization')

0.8462920454545458

In [236]:
calculate_cohens_kappa(df, 'Mention')

0.968306818181818

In [237]:
calculate_cohens_kappa(df, 'Emotion')

0.8560982954545455

# Labels export

In [238]:
import os
import pandas as pd

directory_path = "/Users/katerynaburovova/PycharmProjects/dehumanization/annotation/dataset_shuffled"
file_list = []
for filename in os.listdir(directory_path):
    if filename.endswith(".txt"):
        with open(os.path.join(directory_path, filename), "r") as file:
            file_content = file.read()
        file_list.append({"text": file_content, "file_name": filename})

df_datarows = pd.DataFrame(file_list)

In [239]:
df_datarows.rename(columns={'file_name':'External ID'}, inplace=True)
df_datarows

Unnamed: 0,text,External ID
0,"–ö–∞–∫ –∏–∑–≤–µ—Å—Ç–Ω–æ, —Å–≤–∏–Ω—å—è –≤–µ–∑–¥–µ –≥—Ä—è–∑—å –Ω–∞–π–¥–µ—Ç, –∏ —É–∫—Ä...",row_1281.txt
1,"–¢–æ–≥–¥–∞ –∫–∞–∫ –º–Ω–æ–≥–∏–µ —É–∫—Ä–∞–∏–Ω—Ü—ã, –Ω–∞–æ–±–æ—Ä–æ—Ç, —Å–ª–µ–ø–æ –≤–µ—Ä...",row_2950.txt
2,"–û —Ç–∞–∫–æ–º —â–µ–¥—Ä–æ–º –∏ –º–∏—Ä–Ω–æ–º —Å–æ—Å–µ–¥–µ, –∫–∞–∫ –†–æ—Å—Å–∏—è, –º–æ...",row_3496.txt
3,üá∫üá¶‚ùå–î–µ–Ω–∞—Ü–∏—Ñ–∏–∫–∞—Ü–∏—è –ø–æ-–≤–∏–Ω–Ω–∏—Ü–∫–∏ –¢–∏–≤—Ä–∏–≤—Å–∫–∏–π —Å–µ–ª—å—Å...,row_2788.txt
4,–£–∫—Ä–æ–Ω–∞—Ü–∏—Å—Ç—ã –æ–±—ä—è–≤–∏–ª–∏ –≤–æ–π–Ω—É –ü—É—à–∫–∏–Ω—É.,row_2944.txt
...,...,...
3638,"¬´–Ø –≤–∞—Å, –±–ª@–¥–µ–π, –Ω–∞ —ç—Ç–æ—Ç –∫–æ—Ä–∞–±–ª—å —Ç—Ä–∏ –≥–æ–¥–∞ —Å–æ–±–∏—Ä...",row_2791.txt
3639,–ù–∞–≥—Ä–∞–∂–¥–µ–Ω –æ—Ä–¥–µ–Ω–æ–º –ú—É–∂–µ—Å—Ç–≤–∞ –∑–∞ –æ—Ç–≤–∞–≥—É –∏ —Å–∞–º–æ–æ—Ç–≤...,row_474.txt
3640,üá∑üá∫–í —Ä–∞–π–æ–Ω–µ –Ω–∞—Å–µ–ª–µ–Ω–Ω–æ–≥–æ –ø—É–Ω–∫—Ç–∞ –ß–∞—Å–æ–≤ –Ø—Ä —É–Ω–∏—á—Ç–æ–∂...,row_2949.txt
3641,–£—á–∞—Å—Ç–Ω–∏–∫–∏ –∫–æ–Ω–∫—É—Ä—Å–∞ –∞–º–µ—Ä–∏–∫–∞–Ω—Å–∫–æ–π –∞—Ä–º–∏–∏ –Ω–∞ –∑–∞–º–µ–Ω...,row_1298.txt


In [240]:
df_datarows[df_datarows['External ID'] == 'row_3210.txt']

Unnamed: 0,text,External ID
2016,"""–ë–∞–Ω–¥–µ—Ä–æ–≤—Ü—ã –Ω–∞ –£–∫—Ä–∞–∏–Ω–µ —Å–µ–≥–æ–¥–Ω—è –≤–∑—è–ª–∏ –≤—Å–µ —Ö—É–¥—à–µ...",row_3210.txt


In [185]:
# df_labels = pd.read_json('/Users/katerynaburovova/PycharmProjects/dehumanization/annotation/labels_ready.json')
# df_labels['pairs'] = df_labels['Label'].apply(lambda x: extract_class_pairs(x['classifications']))
# df_final = df_labels.copy()

In [243]:
df_final=df_labels[['Emotion','Dehumanization', 'Mention', 'External ID', 'Created By']]

In [244]:
df_final

Unnamed: 0,Emotion,Dehumanization,Mention,External ID,Created By
0,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_3642.txt,kateryna.burovova@ucu.edu.ua
1,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_3641.txt,kateryna.burovova@ucu.edu.ua
2,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_3640.txt,kateryna.burovova@ucu.edu.ua
3,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_3639.txt,kateryna.burovova@ucu.edu.ua
4,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_3637.txt,snizannabotvin@gmail.com
...,...,...,...,...,...
4279,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_9.txt,eugene.1martynyuk@gmail.com
4280,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_666.txt,eugene.1martynyuk@gmail.com
4281,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_21.txt,eugene.1martynyuk@gmail.com
4282,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_660.txt,eugene.1martynyuk@gmail.com


In [245]:
df_final["Emotion"].unique()

array(['–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è', '—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞',
       '–Ω–µ –º–æ–∂—É –≤–∏–∑–Ω–∞—á–∏—Ç–∏—Å—å –∑ –ø—Ä–∞–≤–∏–ª—å–Ω–æ—é –≤—ñ–¥–ø–æ–≤—ñ–¥–¥—é',
       '—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –ø–æ–∑–∏—Ç–∏–≤–Ω–∞'], dtype=object)

In [247]:
df_final["Mention"].unique()


array(['–Ω—ñ', '—Ç–∞–∫', '–Ω–µ –º–æ–∂—É –≤–∏–∑–Ω–∞—á–∏—Ç–∏—Å—å —ñ–∑ –≤—ñ–¥–ø–æ–≤—ñ–¥–¥—é'], dtype=object)

In [248]:
df_merged = pd.merge(df_final, df_datarows, on='External ID', how='left')

In [249]:
#bs check
df_merged[df_merged['Mention']=='–Ω—ñ']['Dehumanization'].unique()


array(['–Ω—ñ'], dtype=object)

In [250]:
#bs check
df_merged[df_merged['Mention']=='–Ω—ñ']['Emotion'].unique()

array(['–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è'], dtype=object)

In [251]:
df_merged

Unnamed: 0,Emotion,Dehumanization,Mention,External ID,Created By,text
0,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_3642.txt,kateryna.burovova@ucu.edu.ua,"‚ùóÔ∏è–û–¥–∏–Ω –∏–∑ –ø—Ä–æ—Ç–µ—Å—Ç—É—é—â–∏—Ö –ø–æ–≥–∏–±, –∑–∞—è–≤–∏–ª–∏ –≤ –ú–í–î –ë–µ..."
1,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_3641.txt,kateryna.burovova@ucu.edu.ua,–ü—Ä–µ–¥–ª–∞–≥–∞–µ–º –Ω–µ–º–Ω–æ–≥–æ –æ—Ç–≤–ª–µ—á—å—Å—è –∏ –ø–æ–∏–≥—Ä–∞—Ç—å –≤ –∏–≥—Ä—É...
2,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_3640.txt,kateryna.burovova@ucu.edu.ua,‚ö°Ô∏èüá∫üá¶üá∑üá∫–í –ó–∞–ø–æ—Ä–æ–∂—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏ –í–°–£ —Å–∫–∞–ø–ª–∏–≤–∞—é—Ç –≤–æ–π...
3,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_3639.txt,kateryna.burovova@ucu.edu.ua,"–£–∫—Ä–∞–∏–Ω–∞ –æ—Ç–∫—Ä—ã–ª–∞ –≥–ª–∞–∑–∞ –î–∞–ª–º–µ—Ä—Å—É, –∏ –æ–Ω –Ω–∞—á–∞–ª —Ä–∞—Å..."
4,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_3637.txt,snizannabotvin@gmail.com,–ö–æ—Ä—á–∏–Ω—Å–∫–∏–π - –¥–µ—à—ë–≤–∞—è –ø—Ä–æ—Å—Ç–∏—Ç—É—Ç–∫–∞ –Ω–µ–æ–∫–æ–Ω–æ–≤—Å–∫–æ–≥–æ...
...,...,...,...,...,...,...
4245,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_9.txt,eugene.1martynyuk@gmail.com,üá∫üá¶12 000 —É–∫—Ä–æ–Ω–∞—Ü–∏—Å—Ç–æ–≤ –≥–æ—Ç–æ–≤—è—Ç –∞—Ç–∞–∫—É –Ω–∞ –ú–µ–ª–∏—Ç–æ–ø...
4246,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_666.txt,eugene.1martynyuk@gmail.com,–ù–∞—à —Ñ–∏–ª—å–º –Ω–µ–æ–∂–∏–¥–∞–Ω–Ω–æ –¥–ª—è –Ω–∞—Å —Å–∞–º–∏—Ö –ø–æ–ø–∞–ª –≤ —Ç–æ–ø...
4247,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_21.txt,eugene.1martynyuk@gmail.com,P.S. –£–∫—Ä–æ—Ä–µ–π—Ö –Ω–µ —Å–ø–∏–º. @rosich_rus
4248,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_660.txt,eugene.1martynyuk@gmail.com,"–ù–∞—à–∞ —Å –≤–∞–º–∏ –∑–∞–¥–∞—á–∞ –Ω–µ —Å–∫–∞—Ç—ã–≤–∞—Ç—å—Å—è –≤ —Ä–µ–∂–∏–º ""–∑—Ä–∞..."


At this stage we should choose whose labels should remain in the final dataset for those samples labeled by 2 labelers.

We have investigated a random portion of disagreements manually as only manual investigation can provide insights into potential biases and/or misunderstanding of guidelines.
We have also considered the annotator expertise and background that we were informed of.

In [252]:
df_labels['Created By'].unique()

array(['kateryna.burovova@ucu.edu.ua', 'snizannabotvin@gmail.com',
       'nazariy.melnychuk9@gmail.com', 'chennnakal@gmail.com',
       's.sterpul@icloud.com', 'mariana.scorp@gmail.com',
       'tutovadesign@gmail.com', 'yevhen.marchenko91@gmail.com',
       'eugene.1martynyuk@gmail.com'], dtype=object)

In [253]:
authors = df_labels['Created By'].unique().tolist()
rating = [1, 4, 7, 6, 8, 2, 5, 3, 9]

In [254]:
data = {'Created By': authors, 'rating': rating}
df_rating = pd.DataFrame(data)

In [255]:
df_rating

Unnamed: 0,Created By,rating
0,kateryna.burovova@ucu.edu.ua,1
1,snizannabotvin@gmail.com,4
2,nazariy.melnychuk9@gmail.com,7
3,chennnakal@gmail.com,6
4,s.sterpul@icloud.com,8
5,mariana.scorp@gmail.com,2
6,tutovadesign@gmail.com,5
7,yevhen.marchenko91@gmail.com,3
8,eugene.1martynyuk@gmail.com,9


In [256]:
df_combined = df_merged.merge(df_rating, on='Created By')

In [257]:
df_combined = df_combined.sort_values(['External ID', 'rating'], ascending=True)

In [258]:
df_combined

Unnamed: 0,Emotion,Dehumanization,Mention,External ID,Created By,text,rating
1219,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_0.txt,snizannabotvin@gmail.com,"–í—Å–≤—è–∑–∏ —Å —ç—Ç–∏–º –Ω–µ–º–Ω–æ–≥–æ –ø–æ–ø—Ä–∞–≤–ª—é –∫–æ–ª–ª–µ–≥ ‚§µÔ∏è ""–û–Ω–∏...",4
1218,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_1.txt,snizannabotvin@gmail.com,–õ–∏—Ç–µ—Ä–∞—Ç—É—Ä–Ω—ã–π –∫—Ä–∏—Ç–∏–∫ –ì–∞–ª–∏–Ω–∞ –Æ–∑–µ—Ñ–æ–≤–∏—á –æ –Ω–æ–≤–æ–º —Ä–æ...,4
1591,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_10.txt,snizannabotvin@gmail.com,–ü–æ—á–µ–º—É –Ω–∞ –±–∞–∑–∞—Ö –Ω–µ–æ–Ω–∞—Ü–∏—Å—Ç–æ–≤ —Å—Ç–æ—è—Ç —è–∑—ã—á–µ—Å–∫–∏–µ –∏—Å...,4
1198,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_100.txt,snizannabotvin@gmail.com,–ì—Ä—É–ø–ø–∞ –¥–æ–±—Ä–æ–≤–æ–ª—å—Ü–µ–≤-–º–µ–¥–∏–∫–æ–≤ –∏–∑ –ß–µ—á–µ–Ω—Å–∫–æ–π –†–µ—Å–ø—É...,4
3247,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,—Ç–∞–∫,row_1000.txt,tutovadesign@gmail.com,"–í–°–£—à–Ω–∏–∫–∏, –ø–µ—Ä–µ—Ö–æ–¥–∏—Ç–µ –Ω–∞ —Å—Ç–æ—Ä–æ–Ω—É –¥–æ–±—Ä–∞, —É –Ω–∞—Å —Ç...",5
...,...,...,...,...,...,...,...
3612,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",–Ω—ñ,—Ç–∞–∫,row_996.txt,tutovadesign@gmail.com,–ò –ø–æ–Ω–µ—Å–ª–∞—Å—å –º–∞–∑–µ–ø–∏–Ω—â–∏–Ω–æ-–ø–µ—Ç–ª—é—Ä–æ–≤—â–∏–Ω–æ-–±–∞–Ω–¥–µ—Ä–æ–≤—â...,5
4121,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",–Ω—ñ,—Ç–∞–∫,row_997.txt,yevhen.marchenko91@gmail.com,–ù–∞—à —Å–æ—Ä–∞—Ç–Ω–∏–∫ –ø–æ —Ä—É—Å—Å–∫–æ–º—É –¥–≤–∏–∂–µ–Ω–∏—é –ê–ª–µ–∫—Å–µ–π –°–µ–ª–∏...,3
4120,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",—Ç–∞–∫,—Ç–∞–∫,row_998.txt,yevhen.marchenko91@gmail.com,–•–æ—Ä–æ—à–µ–µ –≤–∏–¥–µ–æ –æ—Ç 4 –±—Ä–∏–≥–∞–¥—ã –ù–ú –õ–ù–† https://t.me...,3
3249,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_998.txt,tutovadesign@gmail.com,–•–æ—Ä–æ—à–µ–µ –≤–∏–¥–µ–æ –æ—Ç 4 –±—Ä–∏–≥–∞–¥—ã –ù–ú –õ–ù–† https://t.me...,5


In [259]:
df_cleaned = df_combined.drop_duplicates(subset='External ID', keep='first')

In [260]:
df_cleaned

Unnamed: 0,Emotion,Dehumanization,Mention,External ID,Created By,text,rating
1219,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_0.txt,snizannabotvin@gmail.com,"–í—Å–≤—è–∑–∏ —Å —ç—Ç–∏–º –Ω–µ–º–Ω–æ–≥–æ –ø–æ–ø—Ä–∞–≤–ª—é –∫–æ–ª–ª–µ–≥ ‚§µÔ∏è ""–û–Ω–∏...",4
1218,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_1.txt,snizannabotvin@gmail.com,–õ–∏—Ç–µ—Ä–∞—Ç—É—Ä–Ω—ã–π –∫—Ä–∏—Ç–∏–∫ –ì–∞–ª–∏–Ω–∞ –Æ–∑–µ—Ñ–æ–≤–∏—á –æ –Ω–æ–≤–æ–º —Ä–æ...,4
1591,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_10.txt,snizannabotvin@gmail.com,–ü–æ—á–µ–º—É –Ω–∞ –±–∞–∑–∞—Ö –Ω–µ–æ–Ω–∞—Ü–∏—Å—Ç–æ–≤ —Å—Ç–æ—è—Ç —è–∑—ã—á–µ—Å–∫–∏–µ –∏—Å...,4
1198,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_100.txt,snizannabotvin@gmail.com,–ì—Ä—É–ø–ø–∞ –¥–æ–±—Ä–æ–≤–æ–ª—å—Ü–µ–≤-–º–µ–¥–∏–∫–æ–≤ –∏–∑ –ß–µ—á–µ–Ω—Å–∫–æ–π –†–µ—Å–ø—É...,4
3247,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,—Ç–∞–∫,row_1000.txt,tutovadesign@gmail.com,"–í–°–£—à–Ω–∏–∫–∏, –ø–µ—Ä–µ—Ö–æ–¥–∏—Ç–µ –Ω–∞ —Å—Ç–æ—Ä–æ–Ω—É –¥–æ–±—Ä–∞, —É –Ω–∞—Å —Ç...",5
...,...,...,...,...,...,...,...
3613,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,—Ç–∞–∫,row_995.txt,tutovadesign@gmail.com,–£—Ç—Ä–µ–Ω–Ω–∏–π –±—Ä–∏—Ñ–∏–Ω–≥ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –†–æ—Å—Å–∏–∏: ‚ñ™Ô∏è —Ä–æ—Å—Å–∏–π...,5
3612,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",–Ω—ñ,—Ç–∞–∫,row_996.txt,tutovadesign@gmail.com,–ò –ø–æ–Ω–µ—Å–ª–∞—Å—å –º–∞–∑–µ–ø–∏–Ω—â–∏–Ω–æ-–ø–µ—Ç–ª—é—Ä–æ–≤—â–∏–Ω–æ-–±–∞–Ω–¥–µ—Ä–æ–≤—â...,5
4121,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",–Ω—ñ,—Ç–∞–∫,row_997.txt,yevhen.marchenko91@gmail.com,–ù–∞—à —Å–æ—Ä–∞—Ç–Ω–∏–∫ –ø–æ —Ä—É—Å—Å–∫–æ–º—É –¥–≤–∏–∂–µ–Ω–∏—é –ê–ª–µ–∫—Å–µ–π –°–µ–ª–∏...,3
4120,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",—Ç–∞–∫,—Ç–∞–∫,row_998.txt,yevhen.marchenko91@gmail.com,–•–æ—Ä–æ—à–µ–µ –≤–∏–¥–µ–æ –æ—Ç 4 –±—Ä–∏–≥–∞–¥—ã –ù–ú –õ–ù–† https://t.me...,3


Double-checking since it's crucial for our task

In [261]:
value_counts = df_combined['External ID'].value_counts()
unique_rows = df_combined[df_combined['External ID'].isin(value_counts[value_counts == 1].index)]
df_bs_check = df_combined.drop(unique_rows.index)

In [262]:
df_bs_check.sort_values(['External ID'])

Unnamed: 0,Emotion,Dehumanization,Mention,External ID,Created By,text,rating
4119,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_1005.txt,yevhen.marchenko91@gmail.com,"–û–Ω–æ –ø—Ä–æ–≤–∞–ª–∏–ª–æ—Å—å, –Ω–æ —É–∫—Ä–æ–≤–æ—è–∫–∏ —Ö–≤–∞–ª–∏–ª–∏—Å—å —Ç–µ–º, —á...",3
3242,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",–Ω—ñ,—Ç–∞–∫,row_1005.txt,tutovadesign@gmail.com,"–û–Ω–æ –ø—Ä–æ–≤–∞–ª–∏–ª–æ—Å—å, –Ω–æ —É–∫—Ä–æ–≤–æ—è–∫–∏ —Ö–≤–∞–ª–∏–ª–∏—Å—å —Ç–µ–º, —á...",5
4118,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_1006.txt,yevhen.marchenko91@gmail.com,–ó–∞–ø–∞–¥–Ω—ã–π –º–µ–º –æ —Å—Ç–µ–ø–µ–Ω–∏ –ø—Ä–∞–≤–¥–∏–≤–æ—Å—Ç–∏ –ø—Ä–æ–ø–∞–≥–∞–Ω–¥—ã ...,3
3241,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_1006.txt,tutovadesign@gmail.com,–ó–∞–ø–∞–¥–Ω—ã–π –º–µ–º –æ —Å—Ç–µ–ø–µ–Ω–∏ –ø—Ä–∞–≤–¥–∏–≤–æ—Å—Ç–∏ –ø—Ä–æ–ø–∞–≥–∞–Ω–¥—ã ...,5
4117,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_1008.txt,yevhen.marchenko91@gmail.com,–í–æ–π–Ω–∞ –¥–æ –ø–æ—Å–ª–µ–¥–Ω–µ–≥–æ —É–∫—Ä–∞–∏–Ω—Ü–∞ ‚Äì —ç—Ç–æ –≤–æ–≤—Å–µ –Ω–µ –≤—ã...,3
...,...,...,...,...,...,...,...
2384,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_98.txt,nazariy.melnychuk9@gmail.com,–ë–ª–∞–≥–æ–¥–∞—Ä—è —Ç–∞–∫–æ–º—É –µ–¥–∏–Ω—Å—Ç–≤—É –º–æ–∂–Ω–æ —É–≤–µ—Ä–µ–Ω–Ω–æ –≥–∞—Ä–∞–Ω...,7
126,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_984.txt,kateryna.burovova@ucu.edu.ua,–≠—Ç–∞–ø—ã –ö–∏—Ç–∞–π—Å–∫–æ–π –∫–æ–º–ø–∞—Ä—Ç–∏–∏ –ø–æ –ø—É—Ç–∏ –∫ –º–∏—Ä–æ–≤–æ–π –≥–µ...,1
3673,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",–Ω—ñ,–Ω—ñ,row_984.txt,tutovadesign@gmail.com,–≠—Ç–∞–ø—ã –ö–∏—Ç–∞–π—Å–∫–æ–π –∫–æ–º–ø–∞—Ä—Ç–∏–∏ –ø–æ –ø—É—Ç–∏ –∫ –º–∏—Ä–æ–≤–æ–π –≥–µ...,5
4120,"–Ω—ñ, –æ—Ü—ñ–Ω–∫–∞ –Ω–µ –ø—Ä–∏—Å—É—Ç–Ω—è",—Ç–∞–∫,—Ç–∞–∫,row_998.txt,yevhen.marchenko91@gmail.com,–•–æ—Ä–æ—à–µ–µ –≤–∏–¥–µ–æ –æ—Ç 4 –±—Ä–∏–≥–∞–¥—ã –ù–ú –õ–ù–† https://t.me...,3


In [263]:
df_cleaned[df_cleaned['External ID']=='row_1128.txt']

Unnamed: 0,Emotion,Dehumanization,Mention,External ID,Created By,text,rating
507,"—Ç–∞–∫, –ø—Ä–∏—Å—É—Ç–Ω—è –Ω–µ–≥–∞—Ç–∏–≤–Ω–∞",—Ç–∞–∫,—Ç–∞–∫,row_1128.txt,snizannabotvin@gmail.com,–ü–æ –≤–∏—Ä—Ç—É–∞–ª—å–Ω—ã–º –ø–ª–∞–Ω–∞–º —É–∫—Ä–æ—Ä–µ–π—Ö–∞ –æ–Ω–∏ —É–∂–µ –≤–∑—è–ª–∏ ...,4


In [264]:
df_cleaned.to_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/annotation/final_labels.csv')