<a href="https://colab.research.google.com/github/jnqeras/ARC/blob/master/niiAnnotationProjectPostProcessingOfAnnotations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.5.1


In [None]:
pip install retrying

Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying
Successfully installed retrying-1.3.4


In [None]:
from google.colab import drive
from sklearn.metrics import cohen_kappa_score
from retrying import retry
import os
import csv
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install openpyxl



In [None]:
# Install the required libraries
!pip install gspread google-auth

# Import libraries
import gspread
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.auth import default



In [None]:
# Authenticate and create a client
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [None]:
# Pairs of annotator names. The names match the per-annotator keys used when the
# annotations are loaded (e.g. "Juan-san").
# NOTE(review): presumably these are the pairs whose agreement is evaluated later
# in the notebook — confirm against the cells that consume this constant.
PAIRS_OF_ANNOTATORS = [
    ("Juan-san", "Ken-san"), ("Florian-sensei", "Leane-san"), ("Jonas-san", "Leane-san"),
    ("Florian-sensei", "Jonas-san"), ("Tom-san", "Jiahao-san"), ("Julian-san", "Xanh-san")]

In [None]:
PAIRS_OF_ANNOTATORS[0][0]

'Juan-san'

In [None]:
# Root directory of the annotation project inside the mounted Google Drive.
root_dir = '/content/drive/My Drive/niiAnnotationProjectAnnotatedExamples'
# Folder holding the CSV versions of the annotations;
# load_annotations_of_subfolders_in_root reads every subfolder found in it.
csv_dir = root_dir + '/annotationsTransformedIntoCsv'

## Check if there are any non-annotated examples:

In [None]:
def check_files_in_subfolders(root_dir):
    """
    Reports, per subfolder, files that lack a counterpart with the same base name.

    Every directory below root_dir is visited recursively (root_dir itself is skipped).
    Inside each one, files ending in '.csv' are matched by base name (file name without
    extension) against every other file, which is treated as a Google Sheets file.

    Args:
    root_dir (str): Directory whose subfolders are inspected.

    Returns:
    dict: Maps the path of each subfolder that has at least one unmatched file to a dict
          with two lists of file names:
          'missing_csv'    -> non-CSV files that have no CSV with the same base name,
          'missing_gsheet' -> CSV files that have no non-CSV file with the same base name.
    """
    report = {}

    for current_dir, _subdirs, filenames in os.walk(root_dir):
        # The root itself is never reported, only the folders below it.
        if current_dir == root_dir:
            continue

        # Index the files of this folder by base name, split by kind.
        csv_by_stem = {}
        sheet_by_stem = {}
        for filename in filenames:
            stem = os.path.splitext(filename)[0]
            bucket = csv_by_stem if filename.endswith('.csv') else sheet_by_stem
            bucket[stem] = filename

        unmatched_sheets = [name for stem, name in sheet_by_stem.items() if stem not in csv_by_stem]
        unmatched_csvs = [name for stem, name in csv_by_stem.items() if stem not in sheet_by_stem]

        if unmatched_sheets or unmatched_csvs:
            report[current_dir] = {
                'missing_csv': unmatched_sheets,
                'missing_gsheet': unmatched_csvs,
            }

    return report

# Scan every subfolder below root_dir (the scan is recursive) for files that have
# no counterpart with the same base name.
missing_files = check_files_in_subfolders(root_dir)

# Print one section per subfolder; only subfolders with unmatched files are present.
for subdir, report in missing_files.items():
    print(f"Subfolder: {subdir}")
    if report['missing_csv']:
        # Non-CSV files for which no CSV with the same base name exists.
        print("  Missing CSV files:")
        for file in report['missing_csv']:
            print(f"    {file}")
    if report['missing_gsheet']:
        # CSV files for which no non-CSV file with the same base name exists.
        print("  Missing Google Sheets files:")
        for file in report['missing_gsheet']:
            print(f"    {file}")


Subfolder: /content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/annotationsTransformedIntoCsv/Leane-san
  Missing Google Sheets files:
    counternarratives_annotator_7_hate_speech_13.csv
    counternarratives_annotator_7_hate_speech_10.csv
    counternarratives_annotator_7_hate_speech_12.csv
    counternarratives_annotator_7_hate_speech_1.csv
    counternarratives_annotator_7_hate_speech_11.csv
    counternarratives_annotator_7_hate_speech_8.csv
    counternarratives_annotator_7_hate_speech_36.csv
    counternarratives_annotator_7_hate_speech_9.csv
    counternarratives_annotator_7_hate_speech_7.csv
    counternarratives_annotator_7_hate_speech_24.csv
    counternarratives_annotator_7_hate_speech_23.csv
    counternarratives_annotator_7_hate_speech_20.csv
    counternarratives_annotator_7_hate_speech_6.csv
    counternarratives_annotator_7_hate_speech_15.csv
    counternarratives_annotator_7_hate_speech_5.csv
    counternarratives_annotator_7_hate_speech_38.csv
    counternar

## Check which annotators share counternarratives (and how many)

In [None]:
def get_first_csv_file(subdir):
    """
    Returns the path of the first .csv file in a folder, in alphabetical order.

    os.listdir returns entries in arbitrary order, so the listing is sorted to make the
    choice reproducible across runs and file systems.

    Args:
    subdir (str): Folder to look into (not recursive).

    Returns:
    str or None: Path of the alphabetically first file ending in '.csv',
                 or None when the folder contains no such file.
    """
    for file in sorted(os.listdir(subdir)):
        if file.endswith('.csv'):
            return os.path.join(subdir, file)
    return None

def extract_elements_from_csv(file_path):
    """
    Collects the second-column values of a CSV file, skipping its first two lines.

    Args:
    file_path (str): Path of the CSV file (read as UTF-8, a leading BOM is ignored).

    Returns:
    list: Value of the second column for every row from the third row onward
          that has at least two columns, in file order.
    """
    # newline='' is what the csv module requires: it lets the reader handle line endings
    # itself, so line breaks embedded in quoted fields are preserved unchanged.
    with open(file_path, mode='r', encoding='utf-8-sig', newline='') as file:
        return [
            row[1]
            for i, row in enumerate(csv.reader(file))
            if i >= 2 and len(row) > 1  # rows 0 and 1 are skipped; short rows are ignored
        ]

def compare_csv_files(root_dir):
    """
    Compares the first CSV file of every pair of subfolders of root_dir.

    For each direct subfolder, the file returned by get_first_csv_file is parsed once with
    extract_elements_from_csv and reduced to a set of values. Every unordered pair of
    subfolders is then compared once.

    Args:
    root_dir (str): Directory whose direct subfolders are compared.

    Returns:
    list: One dict per pair of subfolders that share at least one element, with keys
          'subfolder1', 'subfolder2', 'percentage' (shared elements as a percentage of the
          distinct elements of subfolder1), 'total_repeated' and 'repeated_elements'.
    """
    subfolders = [
        os.path.join(root_dir, d)
        for d in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, d))
    ]

    # Parse each folder's file a single time instead of once per pair.
    elements_per_subfolder = []
    for subdir in subfolders:
        first_file = get_first_csv_file(subdir)
        if first_file:  # folders without any CSV file take no part in the comparison
            elements_per_subfolder.append((subdir, set(extract_elements_from_csv(first_file))))

    comparisons = []
    for position, (subdir1, elements1) in enumerate(elements_per_subfolder):
        # Only look at folders that come later in the list, so each pair is compared once.
        for subdir2, elements2 in elements_per_subfolder[position + 1:]:
            common_elements = elements1 & elements2
            if not common_elements:
                continue
            # elements1 cannot be empty here because it shares elements with elements2.
            comparisons.append({
                'subfolder1': subdir1,
                'subfolder2': subdir2,
                'percentage': (len(common_elements) / len(elements1)) * 100,
                'total_repeated': len(common_elements),
                'repeated_elements': list(common_elements)
            })

    return comparisons

# Run the comparison over the direct subfolders of root_dir
comparison_results = compare_csv_files(root_dir)

# Print the results
for result in comparison_results:
    # compare_csv_files only returns pairs that share at least one element,
    # so this check is a defensive guard.
    if result['total_repeated'] > 0:
        print(f"Comparison between {result['subfolder1']} and {result['subfolder2']}:")
        # The percentage is relative to the number of distinct elements of subfolder1.
        print(f"  Percentage of repeated elements: {result['percentage']:.2f}%")
        print(f"  Total number of repeated elements: {result['total_repeated']}")
        print(f"  Repeated elements: {result['repeated_elements']}")
        print()

Comparison between /content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Juan-san and /content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Ken-san:
  Percentage of repeated elements: 33.33%
  Total number of repeated elements: 6
  Repeated elements: ['Replace islam with any other religion and see just how deeply offensive what you just said is.', "What happened to the good old British values of stuff like 'tolerance' and 'respect for the rule of law' - what you are suggesting sounds like mob rule, vigilante action and people running around with burning torches?", 'This person may be the only one able to supply you some bread so, appreciate it!', "I do not shared your opinion, but I do acknowledge that opinions, mine or your, do not have to be based on fact or knowledge. My 'experience' of Muslims, as neighbours, colleagues, friends hasn't led me to view them as a problem.", 'Rapists are, in most cases, friends, family, or partners of the victim. The proportion of Pakis


Analysis of annotators that share examples:

* Florian-sensei, Léane-san and Jonas-san share 6 counter-narratives (that is 33.33% of Florian-sensei's and Léane-san's counter-narratives and 35.29% of Jonas-san's counter-narratives; Jonas-san has one fewer counter-narrative than the other annotators).

* Tom-san and Jiahao-san share 6 counter-narratives (33.33% of their counter-narratives).

* Julian-san and Xanh-san share 6 counter-narratives (33.33% of their counter-narratives).

* Juan-san and Ken-san share 6 counter-narratives (33.33% of their counter-narratives).


In [None]:
comparison_results[0]

{'subfolder1': '/content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Juan-san',
 'subfolder2': '/content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Ken-san',
 'percentage': 33.33333333333333,
 'total_repeated': 6,
 'repeated_elements': ['Replace islam with any other religion and see just how deeply offensive what you just said is.',
  "What happened to the good old British values of stuff like 'tolerance' and 'respect for the rule of law' - what you are suggesting sounds like mob rule, vigilante action and people running around with burning torches?",
  'This person may be the only one able to supply you some bread so, appreciate it!',
  "I do not shared your opinion, but I do acknowledge that opinions, mine or your, do not have to be based on fact or knowledge. My 'experience' of Muslims, as neighbours, colleagues, friends hasn't led me to view them as a problem.",
  'Rapists are, in most cases, friends, family, or partners of the victim. The proportion of Pakistani

## Compare annotations of each annotator

### Load each annotator's annotations.


In [None]:
def process_csv_files_in_folder(folder_path):
    """
    Processes all .csv files in a specified Google Drive folder.

    Each file is read with pandas and normalized:
    - first column, from the second row onward: converted to integers. Values that are
      missing or not numeric become 0 (they are NOT reported, so a forgotten annotation is
      indistinguishable from an explicit 0);
    - second column, first row: set to the empty string;
    - third column: set to the empty string in every row.

    The cleaned columns are rebuilt and assigned as whole columns rather than written in
    place, so the result does not depend on pandas silently upcasting a column's dtype.

    Args:
    folder_path (str): The path to the Google Drive folder containing .csv files.

    Returns:
    dict: A dictionary where keys are file names (without the .csv extension) and values
          are pandas dataframes with the normalized content of each csv file.

    Raises:
    ValueError: If a file has no data row or fewer than three columns.
    """
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    dataframes = {}

    print(f"Found {len(csv_files)} CSV files in folder '{folder_path}'")

    for file in csv_files:
        print(f"Processing file: {file}")
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)

        if df.shape[0] < 1 or df.shape[1] < 3:
            raise ValueError(
                f"'{file_path}' must contain at least one data row and three columns, "
                f"but has shape {df.shape}."
            )

        first_col, second_col, third_col = df.columns[:3]

        # Fix the first column (from the second row onward): ensure they are integers.
        # The first row keeps its original value; the column is stored as object dtype
        # because it mixes that value with the integer marks.
        marks = pd.to_numeric(df.iloc[1:, 0], errors='coerce').fillna(0).astype(int)
        df[first_col] = pd.Series([df.iloc[0, 0]] + marks.tolist(), index=df.index, dtype=object)

        # Fix the second column in the first row: it should be empty
        second_values = df.iloc[:, 1].astype(object)
        second_values.iloc[0] = ''
        df[second_col] = second_values

        # Fix the third column: it should be empty
        df[third_col] = ''

        file_key = os.path.splitext(file)[0]  # Remove the .csv extension
        print("file_key", file_key)
        dataframes[file_key] = df
        print(f"Finished processing file: {file}")

    print("All files processed successfully.")
    return dataframes

In [None]:
# Example usage:
# example_dir = csv_dir + '/Juan-san'
# csv_data = process_csv_files_in_folder(example_dir)
# Now csv_data is a dictionary where keys are file names and values are pandas dataframes.

In [None]:
# csv_data['counternarratives_annotator_1_hate_speech_26']

In [None]:
def load_annotations_of_subfolders_in_root(dir):
    """
    Loads the annotations stored in every direct subfolder of a directory.

    Entries of the directory that are not folders are ignored. Each subfolder is handed
    to process_csv_files_in_folder.

    Args:
    dir (str): The root directory containing subfolders, each with .csv files.

    Returns:
    dict: Maps each subfolder name to the dictionary returned by
          process_csv_files_in_folder for that subfolder.
    """
    annotations_per_subfolder = {}

    for entry in os.listdir(dir):
        candidate_path = os.path.join(dir, entry)
        if not os.path.isdir(candidate_path):
            continue  # plain files sitting next to the subfolders are not annotations
        annotations_per_subfolder[entry] = process_csv_files_in_folder(candidate_path)

    return annotations_per_subfolder

In [None]:
annotations_dic_of_dic_of_df = load_annotations_of_subfolders_in_root(csv_dir)

Found 45 CSV files in folder '/content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/annotationsTransformedIntoCsv/Leane-san'
Processing file: counternarratives_annotator_7_hate_speech_13.csv
file_key counternarratives_annotator_7_hate_speech_13
Finished processing file: counternarratives_annotator_7_hate_speech_13.csv
Processing file: counternarratives_annotator_7_hate_speech_10.csv
file_key counternarratives_annotator_7_hate_speech_10
Finished processing file: counternarratives_annotator_7_hate_speech_10.csv
Processing file: counternarratives_annotator_7_hate_speech_12.csv
file_key counternarratives_annotator_7_hate_speech_12
Finished processing file: counternarratives_annotator_7_hate_speech_12.csv
Processing file: counternarratives_annotator_7_hate_speech_1.csv
file_key counternarratives_annotator_7_hate_speech_1
Finished processing file: counternarratives_annotator_7_hate_speech_1.csv
Processing file: counternarratives_annotator_7_hate_speech_11.csv
file_key counternarrativ

In [None]:
annotations_dic_of_dic_of_df['Juan-san']['counternarratives_annotator_1_hate_speech_26']

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,Their Prophet was a paedophile.,,
1,0,Do you have the statistics for domestic violen...,
2,2,The real disgrace is that children are being f...,
3,0,And when CVs submitted under a non Muslim name...,
4,0,List of positive news stories.,
5,0,What happened to the good old British values o...,
6,2,Muslims contribute a lot to our society. What ...,
7,0,Replace islam with any other religion and see ...,
8,0,"As you said, it is extremism which does not re...",
9,0,Such dehumanising language against many people.,


In [None]:
def filter_dataframes(df1, df2):
    """
    Keeps, in each of two DataFrames, only the rows whose second-column value also occurs
    in the second column of the other DataFrame, then sorts both results by that column.

    Missing values (NaN) in the second column never count as a match. The sort is stable,
    so rows with equal second-column values keep their original relative order in both
    results. Neither input is modified.

    Args:
    df1 (pd.DataFrame): First DataFrame.
    df2 (pd.DataFrame): Second DataFrame.

    Returns:
    tuple: A tuple containing two pandas DataFrames (filtered df1, filtered df2), each
           sorted by its second column and re-indexed from 0.
    """
    keys1 = df1.iloc[:, 1]
    keys2 = df2.iloc[:, 1]

    # dropna() keeps NaN from matching NaN, which a plain isin() would allow.
    # The masks are converted to arrays so the selection is positional and does not
    # depend on the index labels of the inputs.
    keep1 = keys1.isin(keys2.dropna()).to_numpy()
    keep2 = keys2.isin(keys1.dropna()).to_numpy()

    return _sort_by_second_column(df1[keep1]), _sort_by_second_column(df2[keep2])


def _sort_by_second_column(df):
    """Returns df ordered by its second column (stable sort) with a fresh 0..n-1 index."""
    order = df.iloc[:, 1].argsort(kind='stable').to_numpy()
    return df.iloc[order].reset_index(drop=True)

In [None]:
filtered_common_rows_ordered = filter_dataframes(annotations_dic_of_dic_of_df['Juan-san']['counternarratives_annotator_1_hate_speech_1'], annotations_dic_of_dic_of_df['Ken-san']['counternarratives_annotator_2_hate_speech_1'])

In [None]:
filtered_common_rows_ordered[1]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,"I do not shared your opinion, but I do acknowl...",
2,0,List of positive news stories.,
3,1,"Rapists are, in most cases, friends, family, o...",
4,2,Replace islam with any other religion and see ...,
5,0,This person may be the only one able to supply...,
6,1,What happened to the good old British values o...,


In [None]:
def extract_id_of_annotator_from_key(dict_of_dict_of_df, external_key):
    """
    Extracts the integer 'i' from the first key of one internal dictionary of a nested dictionary.

    The internal keys are expected to look like "counternarratives_annotator_i_hate_speech_j";
    only the first key of the internal dictionary is inspected.

    Args:
    dict_of_dict_of_df (dict): The nested dictionary containing internal dictionaries.
    external_key (hashable): Key to access the internal dictionary within dict_of_dict_of_df.

    Returns:
    int: The integer 'i' that follows the 'annotator' part of the internal key.

    Raises:
    KeyError: If external_key is not present in dict_of_dict_of_df.
    ValueError: If the internal dictionary is empty, if its first key has no 'annotator'
                part, or if the part following 'annotator' is not an integer.
    """
    internal_dict = dict_of_dict_of_df[external_key]
    if not internal_dict:
        # Without this check, next() below would leak a bare StopIteration.
        raise ValueError(f"No entries found for '{external_key}'; cannot determine the annotator id.")

    # Extract the internal key
    internal_dict_key = next(iter(internal_dict))

    # Example key format: "counternarratives_annotator_i_hate_speech_j"
    internal_dict_key_parts = internal_dict_key.split('_')

    # The id is the part located right after 'annotator' (the i in the example above).
    annotator_index = internal_dict_key_parts.index('annotator')
    return int(internal_dict_key_parts[annotator_index + 1])

In [None]:
annotations_dic_of_dic_of_df["Juan-san"].keys()

dict_keys(['counternarratives_annotator_1_hate_speech_25', 'counternarratives_annotator_1_hate_speech_17', 'counternarratives_annotator_1_hate_speech_15', 'counternarratives_annotator_1_hate_speech_16', 'counternarratives_annotator_1_hate_speech_38', 'counternarratives_annotator_1_hate_speech_14', 'counternarratives_annotator_1_hate_speech_34', 'counternarratives_annotator_1_hate_speech_43', 'counternarratives_annotator_1_hate_speech_42', 'counternarratives_annotator_1_hate_speech_26', 'counternarratives_annotator_1_hate_speech_23', 'counternarratives_annotator_1_hate_speech_36', 'counternarratives_annotator_1_hate_speech_1', 'counternarratives_annotator_1_hate_speech_39', 'counternarratives_annotator_1_hate_speech_13', 'counternarratives_annotator_1_hate_speech_24', 'counternarratives_annotator_1_hate_speech_27', 'counternarratives_annotator_1_hate_speech_28', 'counternarratives_annotator_1_hate_speech_35', 'counternarratives_annotator_1_hate_speech_44', 'counternarratives_annotator_1

In [None]:
# Example usage of extract_id_of_annotator_from_key on a small hand-made nested dictionary.
# The inner values are plain integers here; only the inner key names matter to the function.
external_dict = {
    'key1': {
        'counternarratives_annotator_3_hate_speech_1': 1,
        'counternarratives_annotator_3_hate_speech_2': 2
    },
    'key2': {
        'counternarratives_annotator_6_hate_speech_1': 3,
        'counternarratives_annotator_6_hate_speech_2': 4
    }
}

# Specify the external keys whose annotator id should be extracted
external_key_1 = 'key1'
external_key_2 = 'key2'

# Extract the annotator id from the first internal key of each entry
i_value = extract_id_of_annotator_from_key(external_dict, external_key_1)
k_value = extract_id_of_annotator_from_key(external_dict, external_key_2)
print(f"The value of 'i' extracted from '{external_key_1}' is: {i_value}")
print(f"The value of 'k' extracted from '{external_key_2}' is: {k_value}")

The value of 'i' extracted from 'key1' is: 3
The value of 'k' extracted from 'key2' is: 6


In [None]:
def compare_hate_speech(dfs_dict1, dfs_dict2, df_key1, df_key2):
    """
    Tells whether two DataFrames, one taken from each dictionary, hold the same hate speech.

    The hate speech of a DataFrame is the value of its 'hate speech' column at index label 0.
    Lookup failures are printed and reported as a mismatch instead of being raised.

    Parameters:
    dfs_dict1 (dict): The first dictionary containing DataFrames.
    dfs_dict2 (dict): The second dictionary containing DataFrames.
    df_key1 (str): The key to access the DataFrame in the first dictionary.
    df_key2 (str): The key to access the DataFrame in the second dictionary.

    Returns:
    bool: True if both values are equal; False if they differ or if a KeyError or
          IndexError occurred while looking them up.
    """
    try:
        # Both frames are fetched before any cell is read.
        frame_a, frame_b = dfs_dict1[df_key1], dfs_dict2[df_key2]
        text_a, text_b = (frame.at[0, 'hate speech'] for frame in (frame_a, frame_b))
    except KeyError as e:
        print(f"KeyError: {e}")
        return False
    except IndexError as e:
        print(f"IndexError: {e}")
        return False

    return text_a == text_b



In [None]:
# Example usage of compare_hate_speech.
# Only the value at index 0 of the 'hate speech' column is compared, so the differing
# second rows ('value2' vs 'value3') do not affect the result.
data1 = {'df1': pd.DataFrame({'hate speech': ['value1', 'value2']})}
data2 = {'df2': pd.DataFrame({'hate speech': ['value1', 'value3']})}

result = compare_hate_speech(data1, data2, 'df1', 'df2')
print(result)  # This should print: True

True


In [None]:
compare_hate_speech(annotations_dic_of_dic_of_df["Juan-san"],annotations_dic_of_dic_of_df["Ken-san"], 'counternarratives_annotator_1_hate_speech_30', 'counternarratives_annotator_2_hate_speech_30')

True

In [None]:
# FIXME: I think I will have to either complete this or remove it.
# Define a custom exception
class KeyCountMismatchException(Exception):
    """Raised by compare_every_hatespeech when its two dictionaries differ in number of keys."""
    pass

In [None]:
def compare_every_hatespeech(dict1, dict2, id_annotator_a, id_annotator_b):
    """
    Checks whether two annotators have annotated the same hate speeches under the same numbers.

    For every n from 1 to the number of entries, the DataFrames stored under
    'counternarratives_annotator_<id>_hate_speech_<n>' in each dictionary are compared with
    compare_hate_speech. Every position is checked even after a mismatch has been found,
    so compare_hate_speech can print a message for each failed lookup.

    Parameters:
    dict1 (dict): Annotations of the first annotator.
    dict2 (dict): Annotations of the second annotator.
    id_annotator_a: The id of the first annotator, as used in the keys of dict1.
    id_annotator_b: The id of the second annotator, as used in the keys of dict2.

    Returns:
    bool: True if every comparison succeeded, False otherwise.

    Raises:
    KeyCountMismatchException: If the dictionaries do not have the same number of keys
                               (both sizes are printed first).
    """
    size_a, size_b = len(dict1), len(dict2)
    if size_a != size_b:
        print("id_annotator_a", size_a)
        print("id_annotator_b", size_b)
        raise KeyCountMismatchException("The two dictionaries do not have the same number of keys.")

    # A list (not a generator) so that all positions are evaluated before all() runs.
    outcomes = [
        compare_hate_speech(
            dict1,
            dict2,
            f'counternarratives_annotator_{id_annotator_a}_hate_speech_{number}',
            f'counternarratives_annotator_{id_annotator_b}_hate_speech_{number}',
        )
        for number in range(1, size_a + 1)
    ]
    return all(outcomes)

In [None]:
def process_annotator_data(dict_of_dict_of_df, annotator_a, annotator_b):
    """
    Pairs up the annotations of two annotators, hate speech by hate speech.

    After verifying with compare_every_hatespeech that both annotators worked on the same
    hate speeches under the same numbers, each pair of DataFrames is passed through
    filter_dataframes, which keeps only the rows both annotators have in common.

    Parameters:
    dict_of_dict_of_df (dict): A dictionary where keys are annotator identifiers and values are dictionaries of DataFrames.
    annotator_a (str): The key for the first annotator in the dict_of_dict_of_df.
    annotator_b (str): The key for the second annotator in the dict_of_dict_of_df.

    Returns:
    tuple: Two lists of filtered DataFrames, one entry per hate speech in numeric order:
           (annotations_by_annotator_a_shared_with_b, annotations_by_annotator_b_shared_with_a).

    Raises:
    ValueError: If the hate speech comparison fails.
    KeyCountMismatchException: If the annotators do not have the same number of keys. This is
        raised by compare_every_hatespeech, which runs before this function's own count check.
    """
    # Step 1: ids used inside the DataFrame keys of each annotator
    id_a = extract_id_of_annotator_from_key(dict_of_dict_of_df, annotator_a)
    id_b = extract_id_of_annotator_from_key(dict_of_dict_of_df, annotator_b)

    frames_a = dict_of_dict_of_df[annotator_a]
    frames_b = dict_of_dict_of_df[annotator_b]

    # Step 2: both annotators must have seen the same hate speeches
    same_hate_speeches = compare_every_hatespeech(frames_a, frames_b, id_a, id_b)
    if not same_hate_speeches:
        raise ValueError(f"The annotators {annotator_a} and {annotator_b} have annotated different hate speech (or in different ordrer).")

    # Step 3: both annotators must have the same amount of entries
    total = len(frames_a)
    if total != len(frames_b):
        raise ValueError(f"{annotator_a} and {annotator_b} do not have the same number hate speeches annotated.")

    # Step 4: filter every pair of DataFrames down to the shared rows
    shared_a = []
    shared_b = []
    for number in range(1, total + 1):
        pair = filter_dataframes(
            frames_a[f'counternarratives_annotator_{id_a}_hate_speech_{number}'],
            frames_b[f'counternarratives_annotator_{id_b}_hate_speech_{number}'],
        )
        shared_a.append(pair[0])
        shared_b.append(pair[1])

    return shared_a, shared_b


In [None]:
 annotations_by_annotator_a_shared_with_b, annotations_by_annotator_b_shared_with_a = process_annotator_data(annotations_dic_of_dic_of_df, "Juan-san", "Ken-san")

In [None]:
annotations_by_annotator_a_shared_with_b[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,"I do not shared your opinion, but I do acknowl...",
2,0,List of positive news stories.,
3,1,"Rapists are, in most cases, friends, family, o...",
4,1,Replace islam with any other religion and see ...,
5,0,This person may be the only one able to supply...,
6,0,What happened to the good old British values o...,


In [None]:
annotations_by_annotator_b_shared_with_a[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,"I do not shared your opinion, but I do acknowl...",
2,0,List of positive news stories.,
3,1,"Rapists are, in most cases, friends, family, o...",
4,2,Replace islam with any other religion and see ...,
5,0,This person may be the only one able to supply...,
6,1,What happened to the good old British values o...,


### Evaluate how many hate speeches are shared by the 3 annotators that share hate speeches among them

In [None]:
 annotations_by_annotator_f_shared_with_l, annotations_by_annotator_l_shared_with_f = process_annotator_data(annotations_dic_of_dic_of_df, "Florian-sensei", "Leane-san")

In [None]:
len(annotations_by_annotator_f_shared_with_l)

45

In [None]:
len(annotations_by_annotator_l_shared_with_f)

45

In [None]:
annotations_by_annotator_f_shared_with_l[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,Apart from all the scientists that are also mu...,
2,0,Had lunch with some Muslim friends recently. S...,
3,0,I thought 'Totalitarianism' was the requiremen...,
4,2,We have no right to blame other people. Analys...,
5,0,Why are you attacking muslims? Many people com...,
6,2,Why do you link rapist crimes and Muslim peopl...,


In [None]:
annotations_by_annotator_l_shared_with_f[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,Apart from all the scientists that are also mu...,
2,0,Had lunch with some Muslim friends recently. S...,
3,0,I thought 'Totalitarianism' was the requiremen...,
4,2,We have no right to blame other people. Analys...,
5,0,Why are you attacking muslims? Many people com...,
6,1,Why do you link rapist crimes and Muslim peopl...,


In [None]:
 annotations_by_annotator_j_shared_with_l, annotations_by_annotator_l_shared_with_j = process_annotator_data(annotations_dic_of_dic_of_df, "Jonas-san", "Leane-san")

In [None]:
len(annotations_by_annotator_j_shared_with_l)

45

In [None]:
len(annotations_by_annotator_l_shared_with_j)

45

In [None]:
annotations_by_annotator_j_shared_with_l[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,Apart from all the scientists that are also mu...,
2,0,Had lunch with some Muslim friends recently. S...,
3,0,I thought 'Totalitarianism' was the requiremen...,
4,2,We have no right to blame other people. Analys...,
5,0,Why are you attacking muslims? Many people com...,
6,2,Why do you link rapist crimes and Muslim peopl...,


In [None]:
annotations_by_annotator_l_shared_with_j[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,Apart from all the scientists that are also mu...,
2,0,Had lunch with some Muslim friends recently. S...,
3,0,I thought 'Totalitarianism' was the requiremen...,
4,2,We have no right to blame other people. Analys...,
5,0,Why are you attacking muslims? Many people com...,
6,1,Why do you link rapist crimes and Muslim peopl...,


In [None]:
 annotations_by_annotator_j_shared_with_f, annotations_by_annotator_f_shared_with_j = process_annotator_data(annotations_dic_of_dic_of_df, "Jonas-san", "Florian-sensei")

In [None]:
len(annotations_by_annotator_j_shared_with_f)

45

In [None]:
len(annotations_by_annotator_f_shared_with_j)

45

In [None]:
annotations_by_annotator_j_shared_with_f[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,It is time for us to take Islam for what it is...,,
1,1,Apart from all the scientists that are also mu...,
2,0,Had lunch with some Muslim friends recently. S...,
3,0,I thought 'Totalitarianism' was the requiremen...,
4,0,We have no right to blame other people. Analys...,
5,1,Why are you attacking muslims? Many people com...,
6,0,Why do you link rapist crimes and Muslim peopl...,


In [None]:
annotations_by_annotator_f_shared_with_j[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,It is time for us to take Islam for what it is...,,
1,0,Apart from all the scientists that are also mu...,
2,0,Had lunch with some Muslim friends recently. S...,
3,0,I thought 'Totalitarianism' was the requiremen...,
4,0,We have no right to blame other people. Analys...,
5,0,Why are you attacking muslims? Many people com...,
6,0,Why do you link rapist crimes and Muslim peopl...,


## Continue processing the annotations


In [None]:
def concatenate_list_of_dataframes(list_of_dfs):
    """
    Concatenates a list of pandas DataFrames into one DataFrame after transforming each one.

    For each DataFrame in the list, working on a copy:
    - the first row is removed;
    - if the DataFrame has at least three columns and at least one row remains, the value
      found in the first row, first column of the original (the hate speech) is written
      into the third column of the first remaining row.

    The DataFrames passed in are NOT modified, so calling this function again on the same
    list (for example when re-running a notebook cell) gives the same result.

    Args:
    - list_of_dfs (list of pandas.DataFrame): List containing pandas DataFrames to be processed.

    Returns:
    - pandas.DataFrame: Concatenated DataFrame containing the transformed rows of all input
      DataFrames, re-indexed from 0. An empty DataFrame is returned for an empty list.
    """
    transformed = []
    for df in list_of_dfs:
        # Everything but the first row, as an independent copy.
        body = df.iloc[1:].copy()
        # A DataFrame holding only its first row has no remaining row to receive the hate speech.
        if len(body.index) > 0 and len(df.columns) > 2:
            body.iloc[0, 2] = df.iloc[0, 0]
        transformed.append(body)

    if not transformed:
        return pd.DataFrame()

    # Concatenate all dataframes in the list into one big dataframe
    return pd.concat(transformed, ignore_index=True)


In [None]:
 annotations_by_annotator_a_shared_with_b_df = concatenate_list_of_dataframes(annotations_by_annotator_a_shared_with_b)

In [None]:
annotations_by_annotator_a_shared_with_b_df

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,0,"I do not shared your opinion, but I do acknowl...",If Muslims counteracted child rapists in their...
1,0,List of positive news stories.,
2,1,"Rapists are, in most cases, friends, family, o...",
3,1,Replace islam with any other religion and see ...,
4,0,This person may be the only one able to supply...,
...,...,...,...
265,0,List of positive news stories.,
266,0,"Rapists are, in most cases, friends, family, o...",
267,2,Replace islam with any other religion and see ...,
268,0,This person may be the only one able to supply...,


In [None]:
def extract_annotations_from_df_into_array(df):
    """
    Returns the first column (position 0) of a DataFrame as a NumPy array.

    Args:
    - df (pandas.DataFrame): The input DataFrame. It must have at least two columns.

    Returns:
    - numpy.ndarray: The values of the column at position 0, in row order.

    Raises:
    - ValueError: If the DataFrame has fewer than two columns.
    """
    column_count = df.shape[1]
    if column_count < 2:
        raise ValueError("DataFrame must have at least two columns")

    # The annotation labels live in the column at position 0.
    first_column = df.iloc[:, 0]
    return first_column.to_numpy()

In [None]:
annotations_by_annotator_a_shared_with_b_array = extract_annotations_from_df_into_array(annotations_by_annotator_a_shared_with_b_df)

In [None]:
len(annotations_by_annotator_a_shared_with_b_array)

270

In [None]:
def get_arrays_of_annotations_of_annotators_a_and_b(dict_of_dict_of_df, annotator_a, annotator_b):
    """
    Collects the annotations that annotators A and B share, as arrays and as DataFrames.

    The shared DataFrames are obtained from process_annotator_data, concatenated
    per annotator with concatenate_list_of_dataframes, and their first column is
    extracted with extract_annotations_from_df_into_array.

    Args:
    - dict_of_dict_of_df (dict): Dictionary of dictionaries of DataFrames.
    - annotator_a (str): Annotator A.
    - annotator_b (str): Annotator B.

    Returns:
    - tuple of four elements, in this order:
      1. array of annotations by annotator A shared with B,
      2. array of annotations by annotator B shared with A,
      3. concatenated DataFrame of annotator A's shared annotations,
      4. concatenated DataFrame of annotator B's shared annotations.
    """
    # Shallow copy: only the top-level dictionary is duplicated, not the DataFrames.
    working_dict = dict_of_dict_of_df.copy()

    shared_by_a, shared_by_b = process_annotator_data(working_dict, annotator_a, annotator_b)

    shared_by_a_df = concatenate_list_of_dataframes(shared_by_a)
    shared_by_b_df = concatenate_list_of_dataframes(shared_by_b)

    shared_by_a_array = extract_annotations_from_df_into_array(shared_by_a_df)
    shared_by_b_array = extract_annotations_from_df_into_array(shared_by_b_df)

    return (
        shared_by_a_array,
        shared_by_b_array,
        shared_by_a_df,
        shared_by_b_df,
    )

In [None]:
annotations_by_annotator_a_shared_with_b_array, annotations_by_annotator_b_shared_with_a_array, annotations_by_annotator_a_shared_with_b_df, annotations_by_annotator_b_shared_with_a_df = get_arrays_of_annotations_of_annotators_a_and_b(annotations_dic_of_dic_of_df, "Juan-san", "Ken-san")

In [None]:
def preprocess_annotations(annotations, valid_labels={0, 1, 2}):
    """
    Reports annotations outside the valid label set and converts all annotations to numbers.

    Values that are not members of valid_labels are printed (they are not
    removed). Every value is then passed through pandas.to_numeric, so entries
    that cannot be parsed as numbers become NaN.

    Args:
    annotations (array-like): The annotations to preprocess; it must support
        boolean-mask indexing and expose `.size` (e.g. numpy.ndarray or pd.Series).
    valid_labels (set, optional): A set of valid annotation labels. Defaults to {0, 1, 2}.

    Returns:
    numpy.ndarray: The annotations converted to numeric values, with non-numeric values set to NaN.
    """
    # Select every entry that is not one of the accepted labels.
    is_valid = np.isin(annotations, list(valid_labels))
    invalid_annotations = annotations[np.logical_not(is_valid)]

    if invalid_annotations.size:
        print(f"Invalid annotations found: {invalid_annotations}")

    # Coerce to numbers; unparseable entries turn into NaN.
    numeric_values = pd.to_numeric(annotations, errors='coerce')
    return np.array(numeric_values)

In [None]:
def compute_cohen_kappa_for_all_annotator_pairs(dict_of_dict_of_df, pairs_of_annotators):
    """
    Computes the Cohen's kappa score for each pair of annotators in the given list.

    For every pair, the shared annotations of both annotators are fetched with
    get_arrays_of_annotations_of_annotators_a_and_b, passed through
    preprocess_annotations, and compared with sklearn's cohen_kappa_score.
    Progress and results are printed as they are computed.

    Args:
    - dict_of_dict_of_df (dict): Dictionary of dictionaries of DataFrames.
    - pairs_of_annotators (list of tuples): List of tuples where each tuple contains two annotator names.

    Returns:
    - list of tuples: Each tuple contains annotator_a, annotator_b, and their Cohen's kappa score.

    Raises:
    - ValueError: If the raw and preprocessed annotation arrays of a pair do not
      all have the same length.
    """
    results = []

    for annotator_a, annotator_b in pairs_of_annotators:
        print(f"Computing Cohen's kappa score for annotators {annotator_a} and {annotator_b}...")

        # Only the two arrays are needed here; the DataFrames are discarded.
        annotations_a, annotations_b, _, _ = get_arrays_of_annotations_of_annotators_a_and_b(
            dict_of_dict_of_df, annotator_a, annotator_b
        )

        # Preprocess annotations
        annotations_a_preprocessed = preprocess_annotations(annotations_a)
        annotations_b_preprocessed = preprocess_annotations(annotations_b)

        # Both annotators must contribute the same number of labels, before and after preprocessing.
        if not (len(annotations_a_preprocessed) == len(annotations_a) == len(annotations_b_preprocessed) == len(annotations_b)):
            raise ValueError("The arrays must have the same length after preprocessing.")

        # Calculate the Cohen's kappa score
        cohen_kappa_score_for_annotators = cohen_kappa_score(
            annotations_a_preprocessed,
            annotations_b_preprocessed
        )

        # Print the result
        print(f"Annotator A: {annotator_a}, Annotator B: {annotator_b}, Cohen's kappa score: {cohen_kappa_score_for_annotators}")

        # Append the result to the list
        results.append((annotator_a, annotator_b, cohen_kappa_score_for_annotators))

    return results


In [None]:
annotations_dic_of_dic_of_df.keys()

dict_keys(['Leane-san', 'Juan-san', 'Ken-san', 'Tom-san', 'Jiahao-san', 'Julian-san', 'Xanh-san', 'Florian-sensei', 'Jonas-san'])

In [None]:
compute_cohen_kappa_for_all_annotator_pairs(annotations_dic_of_dic_of_df, PAIRS_OF_ANNOTATORS)

Computing Cohen's kappa score for annotators Juan-san and Ken-san...
Annotator A: Juan-san, Annotator B: Ken-san, Cohen's kappa score: 0.3807339449541285
Computing Cohen's kappa score for annotators Florian-sensei and Leane-san...
Annotator A: Florian-sensei, Annotator B: Leane-san, Cohen's kappa score: 0.5340949660835416
Computing Cohen's kappa score for annotators Jonas-san and Leane-san...
Annotator A: Jonas-san, Annotator B: Leane-san, Cohen's kappa score: 0.5163527100427598
Computing Cohen's kappa score for annotators Florian-sensei and Jonas-san...
Annotator A: Florian-sensei, Annotator B: Jonas-san, Cohen's kappa score: 0.4149683766690091
Computing Cohen's kappa score for annotators Tom-san and Jiahao-san...
Annotator A: Tom-san, Annotator B: Jiahao-san, Cohen's kappa score: 0.05061410459587956
Computing Cohen's kappa score for annotators Julian-san and Xanh-san...
Annotator A: Julian-san, Annotator B: Xanh-san, Cohen's kappa score: 0.15421686746987961


[('Juan-san', 'Ken-san', 0.3807339449541285),
 ('Florian-sensei', 'Leane-san', 0.5340949660835416),
 ('Jonas-san', 'Leane-san', 0.5163527100427598),
 ('Florian-sensei', 'Jonas-san', 0.4149683766690091),
 ('Tom-san', 'Jiahao-san', 0.05061410459587956),
 ('Julian-san', 'Xanh-san', 0.15421686746987961)]

# Unifying Disagreement Between Annotators

In [None]:
def unifiy_annotations_of_the_second_coulmn_of_two_dataframes(df1, df1Name, df2, df2Name):
    """
    Unify the annotations of two DataFrames that annotate the same items.

    The annotation is read from the column at position 0 and the annotated item
    from the column at position 1. The first row (position 0) is skipped in both
    the consistency check and the unification.

    Args:
    - df1 (pd.DataFrame): First DataFrame to compare.
    - df1Name (str): Name of the first DataFrame for notification purposes.
    - df2 (pd.DataFrame): Second DataFrame to compare.
    - df2Name (str): Name of the second DataFrame for notification purposes.

    Returns:
    - unification_of_df1_and_df2 (pd.DataFrame): Copy of df1 whose annotations are the unified values.
    - notifications (list): List of notifications describing disagreements found during annotation unification.

    Raises:
    - AssertionError: If the values in the column at position 1 do not match between df1 and df2 after the first row.

    Unification rules (applied to the integer value of each annotation):
    - Both annotations equal: that value is kept.
    - One is 1 and the other is 2: the unified value is 2 (no notification).
    - One is 0 and the other is 2: the unified value is 1, and a notification is appended.
    - One is 0 and the other is 1: the unified value is 1, and a notification is appended.
    - Any other combination: the row keeps df1's original value and nothing is reported.

    Neither df1 nor df2 is modified.
    """
    # The items annotated (column at position 1) must line up row by row, ignoring the first row.
    assert all(df1.iloc[1:, 1] == df2.iloc[1:, 1]), f"The DataFrames {df1Name} and {df2Name} do not have matching values in the second column for all rows after the first row."

    unification_of_df1_and_df2 = df1.copy()
    notifications = []

    for row_pos in range(1, len(df1)):
        val1 = int(df1.iloc[row_pos, 0])
        val2 = int(df2.iloc[row_pos, 0])
        labels = {val1, val2}
        line_number = row_pos + 1

        if val1 == val2:
            unified_value, message = val1, None
        elif labels == {1, 2}:
            unified_value, message = 2, None
        elif labels == {0, 2}:
            unified_value = 1
            message = f"The DataFrames {df1Name} and {df2Name} have a disagreement on line {line_number}. They have annotated {val1} and {val2} respectively. Both have been annotated as 1."
        elif labels == {0, 1}:
            unified_value = 1
            message = f"Both annotations have been setted to one, but the DataFrames {df1Name} and {df2Name} have a disagreement on line {line_number}. They have annotated {val1} and {val2} respectively."
        else:
            # Combination not covered by the rules: leave the copied df1 value as is.
            continue

        unification_of_df1_and_df2.iloc[row_pos, 0] = unified_value
        if message is not None:
            notifications.append(message)

    return unification_of_df1_and_df2, notifications


In [None]:
# Example usage of unifiy_annotations_of_the_second_coulmn_of_two_dataframes.
# Column 'A' holds the annotations and column 'B' the annotated items; the first
# row (position 0) is skipped by the function, so it is returned unchanged.
# Row 1 pairs 2 with 1, row 2 pairs 1 with 0, and row 3 pairs 2 with 2.
df1 = pd.DataFrame({'A': [0, 2, 1,2], 'B': [1, 2, 3, 4]})
df2 = pd.DataFrame({'A': [2, 1, 0,2], 'B':[1, 2, 3, 4]})
df1_mod, notes = unifiy_annotations_of_the_second_coulmn_of_two_dataframes(df1, 'df1', df2, 'df2')
# df1 and df2 are printed to show that the inputs are not modified.
print(df1)
print(df2)
# df1_mod is the unified copy of df1; notes lists the reported disagreements.
print(df1_mod)
print(notes)

   A  B
0  0  1
1  2  2
2  1  3
3  2  4
   A  B
0  2  1
1  1  2
2  0  3
3  2  4
   A  B
0  0  1
1  2  2
2  0  3
3  2  4
['Both annotations have been setted to zero, but the DataFrames df1 and df2 have a disagreement on line 3. They have annotated 1 and 0 respectively.']


In [None]:
 annotations_by_annotator_a_shared_with_b, annotations_by_annotator_b_shared_with_a = process_annotator_data(annotations_dic_of_dic_of_df, "Juan-san", "Ken-san")

In [None]:
annotations_by_annotator_a_shared_with_b[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,"I do not shared your opinion, but I do acknowl...",
2,0,List of positive news stories.,
3,1,"Rapists are, in most cases, friends, family, o...",
4,1,Replace islam with any other religion and see ...,
5,0,This person may be the only one able to supply...,
6,0,What happened to the good old British values o...,


In [None]:
len(annotations_by_annotator_a_shared_with_b)

45

In [None]:
annotations_by_annotator_b_shared_with_a[0]

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,"I do not shared your opinion, but I do acknowl...",
2,0,List of positive news stories.,
3,1,"Rapists are, in most cases, friends, family, o...",
4,2,Replace islam with any other religion and see ...,
5,0,This person may be the only one able to supply...,
6,1,What happened to the good old British values o...,


In [None]:
len(annotations_by_annotator_b_shared_with_a)

45

In [None]:
# Unify the first shared DataFrame of Juan-san (annotator A) with the matching
# one of Ken-san (annotator B); the names are only used in the notifications.
dfs_unified, notes = unifiy_annotations_of_the_second_coulmn_of_two_dataframes(annotations_by_annotator_a_shared_with_b[0], 'Juan-san', annotations_by_annotator_b_shared_with_a[0], 'Ken-san')


In [None]:
dfs_unified

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,"I do not shared your opinion, but I do acknowl...",
2,0,List of positive news stories.,
3,1,"Rapists are, in most cases, friends, family, o...",
4,2,Replace islam with any other religion and see ...,
5,0,This person may be the only one able to supply...,
6,0,What happened to the good old British values o...,


In [None]:
notes

['Both annotations have been setted to zero, but the DataFrames Juan-san and Juan-san have a disagreement on line 7. They have annotated 0 and 1 respectively.']

# Fixme: add here the following function to unify the annotations of Florian, Leane and Jonas: unifiy_annotations_of_the_second_coulmn_of_three_dataframes_of_annotators_that_share_examples

In [None]:
def update_dataframe(df_original, df_of_updates):
    """
    Update the original DataFrame (`df_original`) based on the updates provided in another DataFrame (`df_of_updates`).

    Every row of `df_of_updates` after the first one is read as
    (replacement value in the column at position 0, lookup value in the column
    at position 1). Each row of `df_original` whose column at position 1 equals
    the lookup value gets its column at position 0 overwritten with the
    replacement value.

    `df_original` is modified in place and also returned; pass a copy if the
    original data must be preserved.

    Parameters:
    - df_original: pandas DataFrame, the original DataFrame to be updated.
    - df_of_updates: pandas DataFrame, contains updates where each row specifies a value to find and its replacement.

    Returns:
    - df_original: pandas DataFrame, the updated original DataFrame after applying all specified updates.

    Raises:
    - ValueError: If the value at cell (0, 0) is not the same in both DataFrames, or if not all elements of the second
                 column (from the second row onwards) of df_of_updates are in the second column (from the second row
                 onwards) of df_original.
    """

    # Check if the value at cell (0, 0) is the same in both DataFrames
    if df_original.iloc[0, 0] != df_of_updates.iloc[0, 0]:
        raise ValueError("The value at cell (0, 0) is not the same in both DataFrames")

    update_rows = df_of_updates.iloc[1:]

    # Check if all elements of the second column (from the second row onwards) of df_of_updates are in the second column (from the second row onwards) of df_original
    if not update_rows.iloc[:, 1].isin(df_original.iloc[1:, 1]).all():
        raise ValueError("Not all elements of the second column (from the second row onwards) of df_of_updates are in the second column (from the second row onwards) of df_original")

    target_column = df_original.columns[0]

    # Read both columns positionally with .iloc: integer keys on a label-indexed
    # row Series (row[0], row[1]) are deprecated in pandas, and iterating the
    # columns directly also avoids the per-row dtype coercion of iterrows().
    for value_to_find, replacement_value in zip(update_rows.iloc[:, 1], update_rows.iloc[:, 0]):
        # Overwrite the first column wherever the second column matches the lookup value
        df_original.loc[df_original.iloc[:, 1] == value_to_find, target_column] = replacement_value

    return df_original


In [None]:
# Example usage of update_dataframe.
# Both frames share the value at cell (0, 0). After the first row, df2 says:
# the row of df1 with B == 'w' gets A = 9, and the row with B == 'y' gets A = 8.
df1 = pd.DataFrame({'A': [1, 2, 3, 4], 'B': ['x', 'y', 'z', 'w']})
df2 = pd.DataFrame({'A': [1, 9, 8], 'B': ['x', 'w', 'y']})
print(df1)
print(df2)
# update_dataframe modifies df1 in place and returns it, so updated_df1 is df1 itself.
updated_df1 = update_dataframe(df1, df2)
print(updated_df1)

   A  B
0  1  x
1  2  y
2  3  z
3  4  w
   A  B
0  1  x
1  9  w
2  8  y
   A  B
0  1  x
1  8  y
2  3  z
3  9  w


In [None]:
dfs_unified_copy = dfs_unified.copy()
dfs_unified_copy

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,"I do not shared your opinion, but I do acknowl...",
2,0,List of positive news stories.,
3,1,"Rapists are, in most cases, friends, family, o...",
4,2,Replace islam with any other religion and see ...,
5,0,This person may be the only one able to supply...,
6,0,What happened to the good old British values o...,


In [None]:
annotations_dic_of_dic_of_df['Juan-san'].keys()

dict_keys(['counternarratives_annotator_1_hate_speech_25', 'counternarratives_annotator_1_hate_speech_17', 'counternarratives_annotator_1_hate_speech_15', 'counternarratives_annotator_1_hate_speech_16', 'counternarratives_annotator_1_hate_speech_38', 'counternarratives_annotator_1_hate_speech_14', 'counternarratives_annotator_1_hate_speech_34', 'counternarratives_annotator_1_hate_speech_43', 'counternarratives_annotator_1_hate_speech_42', 'counternarratives_annotator_1_hate_speech_26', 'counternarratives_annotator_1_hate_speech_23', 'counternarratives_annotator_1_hate_speech_36', 'counternarratives_annotator_1_hate_speech_1', 'counternarratives_annotator_1_hate_speech_39', 'counternarratives_annotator_1_hate_speech_13', 'counternarratives_annotator_1_hate_speech_24', 'counternarratives_annotator_1_hate_speech_27', 'counternarratives_annotator_1_hate_speech_28', 'counternarratives_annotator_1_hate_speech_35', 'counternarratives_annotator_1_hate_speech_44', 'counternarratives_annotator_1

In [None]:
juan_san_annotation_hate_0_copy = annotations_dic_of_dic_of_df['Juan-san']['counternarratives_annotator_1_hate_speech_1'].copy()
juan_san_annotation_hate_0_copy

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,List of positive news stories.,
2,1,"Rapists are, in most cases, friends, family, o...",
3,0,And when CVs submitted under a non Muslim name...,
4,1,I am curious where did you get those thoughts ...,
5,0,What happened to the good old British values o...,
6,0,Do you have the statistics for domestic violen...,
7,0,This person may be the only one able to supply...,
8,0,The real disgrace is that children are being f...,
9,0,"I do not shared your opinion, but I do acknowl...",


In [None]:
update_dataframe(juan_san_annotation_hate_0_copy, dfs_unified_copy)

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,List of positive news stories.,
2,1,"Rapists are, in most cases, friends, family, o...",
3,0,And when CVs submitted under a non Muslim name...,
4,1,I am curious where did you get those thoughts ...,
5,0,What happened to the good old British values o...,
6,0,Do you have the statistics for domestic violen...,
7,0,This person may be the only one able to supply...,
8,0,The real disgrace is that children are being f...,
9,0,"I do not shared your opinion, but I do acknowl...",


In [None]:
list(annotations_dic_of_dic_of_df['Juan-san'].keys())[0]

'counternarratives_annotator_1_hate_speech_25'

In [None]:
annotations_dic_of_dic_of_df.keys()

dict_keys(['Leane-san', 'Juan-san', 'Ken-san', 'Tom-san', 'Jiahao-san', 'Julian-san', 'Xanh-san', 'Florian-sensei', 'Jonas-san'])

In [None]:
len(annotations_dic_of_dic_of_df['Juan-san'])

45

In [None]:
def unify_annotations_for_two_annotators(annotations_dic_of_dic_of_df, annotator_i, annotator_k, return_notes=False):
    """
    Unifies all the annotations for annotator_i and annotator_k based on shared annotations and unified data.

    For every hate speech index j (1-based), the DataFrames the two annotators
    share are unified with unifiy_annotations_of_the_second_coulmn_of_two_dataframes
    and the unified values are written into copies of both annotators'
    DataFrames with update_dataframe. The input DataFrames are not modified by
    this function itself.

    Parameters:
    - annotations_dic_of_dic_of_df: dict of dict of pandas DataFrames, the original annotations data.
    - annotator_i: str, identifier for the first annotator.
    - annotator_k: str, identifier for the second annotator.
    - return_notes: bool, optional. When True, the disagreement notes collected
      during unification are returned as well. Defaults to False.

    Returns:
    - annotations_updated_dic_of_dic_of_df: dict of dict of pandas DataFrames, the updated annotations data. Contains only the annotations for annotator_i and annotator_k (updated).
    - notes_list (only when return_notes is True): list of dicts with the keys
      "annotator_i", "annotator_k", "hate_speech_index" and "notes", one per hate speech index.
    """

    # Create both nested dictionaries up front so they exist even if there is nothing to unify.
    annotations_updated_dic_of_dic_of_df = {annotator_i: {}, annotator_k: {}}
    notes_list = []  # List to store notes with identifiers

    # Process annotator data to get shared annotations
    # NOTE(review): entry j-1 of each list is assumed to correspond to hate speech j — confirm in process_annotator_data.
    annotations_by_annotator_i_shared_with_k, annotations_by_annotator_k_shared_with_i = process_annotator_data(annotations_dic_of_dic_of_df, annotator_i, annotator_k)

    # Get the initial keys for annotator_i and annotator_k; only their trailing index is replaced below.
    annotator_i_key_template = list(annotations_dic_of_dic_of_df[annotator_i].keys())[0]
    annotator_k_key_template = list(annotations_dic_of_dic_of_df[annotator_k].keys())[0]

    # Size the loop from the annotator being processed, not from a global pair list.
    number_of_hate_speeches = len(annotations_dic_of_dic_of_df[annotator_i])

    for j in range(1, number_of_hate_speeches + 1):
        # Update the keys with the current value of j
        annotator_i_keys = '_'.join(annotator_i_key_template.rsplit('_', 1)[:-1] + [str(j)])
        annotator_k_keys = '_'.join(annotator_k_key_template.rsplit('_', 1)[:-1] + [str(j)])

        # Access the DataFrames for the annotators, making copies to avoid modifying the original datasets
        df_of_annotator_i_for_hatespeech_j = annotations_dic_of_dic_of_df[annotator_i][annotator_i_keys].copy()
        df_of_annotator_k_for_hatespeech_j = annotations_dic_of_dic_of_df[annotator_k][annotator_k_keys].copy()

        # Apply the unification process
        dfs_unified, notes = unifiy_annotations_of_the_second_coulmn_of_two_dataframes(
            annotations_by_annotator_i_shared_with_k[j-1], annotator_i,
            annotations_by_annotator_k_shared_with_i[j-1], annotator_k
        )

        # Store the notes with identifiers for later review
        notes_list.append({
            "annotator_i": annotator_i,
            "annotator_k": annotator_k,
            "hate_speech_index": j,
            "notes": notes
        })

        # Write the unified annotations into the copies and store them
        annotations_updated_dic_of_dic_of_df[annotator_i][annotator_i_keys] = update_dataframe(df_of_annotator_i_for_hatespeech_j, dfs_unified)
        annotations_updated_dic_of_dic_of_df[annotator_k][annotator_k_keys] = update_dataframe(df_of_annotator_k_for_hatespeech_j, dfs_unified)

    if return_notes:
        return annotations_updated_dic_of_dic_of_df, notes_list
    return annotations_updated_dic_of_dic_of_df

# Example usage:
# annotations_dic_of_dic_of_df = {...}
# updated_annotations = unify_annotations_for_two_annotators(annotations_dic_of_dic_of_df, 'annotator_1', 'annotator_2')
# print(updated_annotations)


In [None]:
annotations_dic_of_dic_of_df_copy = annotations_dic_of_dic_of_df.copy()

In [None]:
annotations_dic_of_dic_of_df_copy_juan_san_and_ken_san_updated = unify_annotations_for_two_annotators(annotations_dic_of_dic_of_df_copy, 'Juan-san', 'Ken-san')

j: 1
j: 2
j: 3
j: 4
j: 5
j: 6
j: 7
j: 8
j: 9
j: 10
j: 11
j: 12
j: 13
j: 14
j: 15
j: 16
j: 17
j: 18
j: 19
j: 20
j: 21
j: 22
j: 23
j: 24
j: 25
j: 26
j: 27
j: 28
j: 29
j: 30
j: 31
j: 32
j: 33
j: 34
j: 35
j: 36
j: 37
j: 38
j: 39
j: 40
j: 41
j: 42
j: 43
j: 44
j: 45


In [None]:
annotations_dic_of_dic_of_df_copy_juan_san_and_ken_san_updated['Juan-san']['counternarratives_annotator_1_hate_speech_24']

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,There are muslim shops in Birmingham that do n...,,
1,0,"In my opinion, many people want to emigrate to...",
2,1,So much hatred. We must respect and understand...,
3,0,Muslims contribute a lot to our society. What ...,
4,0,This person may be the only one able to supply...,
5,0,Rapists will use any excuse to justify their a...,
6,0,Do you have the statistics for domestic violen...,
7,0,Such dehumanising language against many people.,
8,0,And when CVs submitted under a non Muslim name...,
9,0,The real disgrace is that children are being f...,


In [None]:
annotations_dic_of_dic_of_df_copy_juan_san_and_ken_san_updated.keys()

dict_keys(['Juan-san', 'Ken-san'])

## Unify the annotations for all pairs of annotators

In [None]:
# Collects, per annotator, the dictionary of DataFrames produced by the pairwise unification.
annotations_with_disagreement_unified_dic_of_dic_of_df = {}

# Iterate over each pair of annotators
for annotator_i, annotator_k in PAIRS_OF_ANNOTATORS:
    # Every call is passed the same input dictionary (annotations_dic_of_dic_of_df_copy),
    # not the results accumulated so far.
    annotations_updated_dic_of_dic_of_df = unify_annotations_for_two_annotators(annotations_dic_of_dic_of_df_copy, annotator_i, annotator_k)

    # dict.update replaces the whole entry of each annotator. An annotator that
    # appears in more than one pair therefore keeps only the result of the last
    # pair processed; the result of the earlier pair is overwritten.
    annotations_with_disagreement_unified_dic_of_dic_of_df.update({
        annotator_i: annotations_updated_dic_of_dic_of_df[annotator_i],
        annotator_k: annotations_updated_dic_of_dic_of_df[annotator_k]
    })

# annotations_with_disagreement_unified_dic_of_dic_of_df now holds, for every annotator, the result of the last pair that included them.


In [None]:
annotations_with_disagreement_unified_dic_of_dic_of_df.keys()

dict_keys(['Juan-san', 'Ken-san', 'Florian-sensei', 'Leane-san', 'Jonas-san', 'Tom-san', 'Jiahao-san', 'Julian-san', 'Xanh-san'])

In [None]:
annotations_with_disagreement_unified_dic_of_dic_of_df['Florian-sensei'].keys()

dict_keys(['counternarratives_annotator_9_hate_speech_1', 'counternarratives_annotator_9_hate_speech_2', 'counternarratives_annotator_9_hate_speech_3', 'counternarratives_annotator_9_hate_speech_4', 'counternarratives_annotator_9_hate_speech_5', 'counternarratives_annotator_9_hate_speech_6', 'counternarratives_annotator_9_hate_speech_7', 'counternarratives_annotator_9_hate_speech_8', 'counternarratives_annotator_9_hate_speech_9', 'counternarratives_annotator_9_hate_speech_10', 'counternarratives_annotator_9_hate_speech_11', 'counternarratives_annotator_9_hate_speech_12', 'counternarratives_annotator_9_hate_speech_13', 'counternarratives_annotator_9_hate_speech_14', 'counternarratives_annotator_9_hate_speech_15', 'counternarratives_annotator_9_hate_speech_16', 'counternarratives_annotator_9_hate_speech_17', 'counternarratives_annotator_9_hate_speech_18', 'counternarratives_annotator_9_hate_speech_19', 'counternarratives_annotator_9_hate_speech_20', 'counternarratives_annotator_9_hate_sp

In [None]:
len(annotations_with_disagreement_unified_dic_of_dic_of_df['Xanh-san'])

45

# Fixme: in annotations_with_disagreement_unified_dic_of_dic_of_df I have to make sure that the three annotators who share annotations with each other (Florian, Leane and Jonas) do not share any example among all three of them at once (e.g. that hate speech 23 has not been annotated by all three annotators); if that did happen, I have to work out how to resolve it in unify_annotations_for_two_annotators

# Compute Cohen Kappa after unifying disagreement between annotators:


In [None]:
compute_cohen_kappa_for_all_annotator_pairs(annotations_with_disagreement_unified_dic_of_dic_of_df, PAIRS_OF_ANNOTATORS)

Computing Cohen's kappa score for annotators Juan-san and Ken-san...
Annotator A: Juan-san, Annotator B: Ken-san, Cohen's kappa score: 1.0
Computing Cohen's kappa score for annotators Florian-sensei and Leane-san...
Annotator A: Florian-sensei, Annotator B: Leane-san, Cohen's kappa score: 0.614438714795716
Computing Cohen's kappa score for annotators Jonas-san and Leane-san...
Annotator A: Jonas-san, Annotator B: Leane-san, Cohen's kappa score: 0.614438714795716
Computing Cohen's kappa score for annotators Florian-sensei and Jonas-san...
Annotator A: Florian-sensei, Annotator B: Jonas-san, Cohen's kappa score: 1.0
Computing Cohen's kappa score for annotators Tom-san and Jiahao-san...
Annotator A: Tom-san, Annotator B: Jiahao-san, Cohen's kappa score: 1.0
Computing Cohen's kappa score for annotators Julian-san and Xanh-san...
Annotator A: Julian-san, Annotator B: Xanh-san, Cohen's kappa score: 1.0


[('Juan-san', 'Ken-san', 1.0),
 ('Florian-sensei', 'Leane-san', 0.614438714795716),
 ('Jonas-san', 'Leane-san', 0.614438714795716),
 ('Florian-sensei', 'Jonas-san', 1.0),
 ('Tom-san', 'Jiahao-san', 1.0),
 ('Julian-san', 'Xanh-san', 1.0)]

In [None]:
compute_cohen_kappa_for_all_annotator_pairs(annotations_dic_of_dic_of_df, PAIRS_OF_ANNOTATORS)

Computing Cohen's kappa score for annotators Juan-san and Ken-san...
Annotator A: Juan-san, Annotator B: Ken-san, Cohen's kappa score: 0.3807339449541285
Computing Cohen's kappa score for annotators Florian-sensei and Leane-san...
Annotator A: Florian-sensei, Annotator B: Leane-san, Cohen's kappa score: 0.5340949660835416
Computing Cohen's kappa score for annotators Jonas-san and Leane-san...
Annotator A: Jonas-san, Annotator B: Leane-san, Cohen's kappa score: 0.5163527100427598
Computing Cohen's kappa score for annotators Florian-sensei and Jonas-san...
Annotator A: Florian-sensei, Annotator B: Jonas-san, Cohen's kappa score: 0.4149683766690091
Computing Cohen's kappa score for annotators Tom-san and Jiahao-san...
Annotator A: Tom-san, Annotator B: Jiahao-san, Cohen's kappa score: 0.05061410459587956
Computing Cohen's kappa score for annotators Julian-san and Xanh-san...
Annotator A: Julian-san, Annotator B: Xanh-san, Cohen's kappa score: 0.15421686746987961


[('Juan-san', 'Ken-san', 0.3807339449541285),
 ('Florian-sensei', 'Leane-san', 0.5340949660835416),
 ('Jonas-san', 'Leane-san', 0.5163527100427598),
 ('Florian-sensei', 'Jonas-san', 0.4149683766690091),
 ('Tom-san', 'Jiahao-san', 0.05061410459587956),
 ('Julian-san', 'Xanh-san', 0.15421686746987961)]

# Tests


In [None]:
len(annotations_dic_of_dic_of_df["Juan-san"])

In [None]:
len(annotations_dic_of_dic_of_df["Ken-san"])

In [None]:
updated_annotations["Juan-san"].keys()

In [None]:
len(updated_annotations["Ken-san"])

In [None]:
annotations_dic_of_dic_of_df["Juan-san"]["counternarratives_annotator_1_hate_speech_26"]

In [None]:
# Create a copy of the data for testing:
annotations_dic_of_dic_of_df_copy_for_testing = annotations_dic_of_dic_of_df.copy()

# Update the annotations using the function
updated_annotations = unify_annotations_for_two_annotators(annotations_dic_of_dic_of_df_copy_for_testing, 'Juan-san', 'Ken-san')

# The unification must return exactly one DataFrame per input DataFrame for each annotator.
assert len(annotations_dic_of_dic_of_df_copy_for_testing["Juan-san"]) == len(updated_annotations["Juan-san"]), "Juan-san: number of DataFrames changed after unification"
assert len(annotations_dic_of_dic_of_df_copy_for_testing["Ken-san"]) == len(updated_annotations["Ken-san"]), "Ken-san: number of DataFrames changed after unification"

# Print the updated annotations to verify the results
for annotator, annotations in updated_annotations.items():
    print(f"Annotations for {annotator}:")
    for key, df in annotations.items():
        print(f"  {key}:")
        print(df)


In [None]:
annotations_dic_of_dic_of_df_copy = annotations_dic_of_dic_of_df.copy()