<a href="https://colab.research.google.com/github/jnqeras/ARC/blob/master/niiAnnotationProjectPostProcessingOfAnnotations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
from google.colab import drive
import os
import csv
import pandas as pd

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install openpyxl



In [52]:
# Install the required libraries
!pip install gspread google-auth

# Import libraries
import gspread
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.auth import default



In [53]:
# Authenticate and create a client
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

## Check if there are any non-annotated examples:

In [25]:
# Define the root directory of your Google Drive folder
root_dir = '/content/drive/My Drive/niiAnnotationProjectAnnotatedExamples'

In [24]:
def check_files_in_subfolders(root_dir):
    missing_files_report = {}

    for subdir, dirs, files in os.walk(root_dir):
        if subdir == root_dir:
            continue  # Skip the root directory itself

        csv_files = {os.path.splitext(file)[0]: file for file in files if file.endswith('.csv')}
        google_sheets_files = {os.path.splitext(file)[0]: file for file in files if not file.endswith('.csv')}

        subdir_report = {
            'missing_csv': [],
            'missing_gsheet': []
        }

        # Check for missing CSV files
        for key in google_sheets_files:
            if key not in csv_files:
                subdir_report['missing_csv'].append(google_sheets_files[key])

        # Check for missing Google Sheets files
        for key in csv_files:
            if key not in google_sheets_files:
                subdir_report['missing_gsheet'].append(csv_files[key])

        if subdir_report['missing_csv'] or subdir_report['missing_gsheet']:
            missing_files_report[subdir] = subdir_report

    return missing_files_report

missing_files = check_files_in_subfolders(root_dir)

for subdir, report in missing_files.items():
    print(f"Subfolder: {subdir}")
    if report['missing_csv']:
        print("  Missing CSV files:")
        for file in report['missing_csv']:
            print(f"    {file}")
    if report['missing_gsheet']:
        print("  Missing Google Sheets files:")
        for file in report['missing_gsheet']:
            print(f"    {file}")


## Check which annotators share counternarratives (and how many)

In [6]:
def get_first_csv_file(subdir):
    for file in os.listdir(subdir):
        if file.endswith('.csv'):
            return os.path.join(subdir, file)
    return None

def extract_elements_from_csv(file_path):
    elements = []
    with open(file_path, mode='r', encoding='utf-8-sig') as file:
        reader = csv.reader(file)
        for i, row in enumerate(reader):
            if i >= 2:  # Start from the third line (index 2)
                if len(row) > 1:  # Ensure the row has at least two columns
                    elements.append(row[1])  # Take the second column (index 1)
    return elements

def compare_csv_files(root_dir):
    subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
    first_files = {subdir: get_first_csv_file(subdir) for subdir in subfolders}

    comparisons = []
    compared_pairs = set()

    for i, (subdir1, file1) in enumerate(first_files.items()):
        if file1:
            elements1 = set(extract_elements_from_csv(file1))
            for j, (subdir2, file2) in enumerate(first_files.items()):
                if i < j and file2:  # Ensure each pair is only compared once
                    elements2 = set(extract_elements_from_csv(file2))
                    common_elements = elements1.intersection(elements2)
                    total_elements = len(elements1)
                    if total_elements > 0 and common_elements:  # Only consider if there are common elements
                        percentage = (len(common_elements) / total_elements) * 100
                        comparisons.append({
                            'subfolder1': subdir1,
                            'subfolder2': subdir2,
                            'percentage': percentage,
                            'total_repeated': len(common_elements),
                            'repeated_elements': list(common_elements)
                        })

    return comparisons

# Run the comparison
comparison_results = compare_csv_files(root_dir)

# Print the results
for result in comparison_results:
    if result['total_repeated'] > 0:
        print(f"Comparison between {result['subfolder1']} and {result['subfolder2']}:")
        print(f"  Percentage of repeated elements: {result['percentage']:.2f}%")
        print(f"  Total number of repeated elements: {result['total_repeated']}")
        print(f"  Repeated elements: {result['repeated_elements']}")
        print()

Comparison between /content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Juan-san and /content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Ken-san:
  Percentage of repeated elements: 33.33%
  Total number of repeated elements: 6
  Repeated elements: ["I do not shared your opinion, but I do acknowledge that opinions, mine or your, do not have to be based on fact or knowledge. My 'experience' of Muslims, as neighbours, colleagues, friends hasn't led me to view them as a problem.", 'This person may be the only one able to supply you some bread so, appreciate it!', "What happened to the good old British values of stuff like 'tolerance' and 'respect for the rule of law' - what you are suggesting sounds like mob rule, vigilante action and people running around with burning torches?", 'List of positive news stories.', 'Rapists are, in most cases, friends, family, or partners of the victim. The proportion of Pakistani people who are rapists is no more than that of White Britis


Analysis of annotators that share examples:

* Florian-sensei and Léane-san and Jonas-san share 6 counter-narratives (that is 33.33% of Florian-sensei and Léane-san counter-narratives and 35.29% of Jonas-san counter-narratives -Jonas-san has one less counter-narrative than the other annotators-).

* Tom-san and Jiahao-san share 6 counter-narratives (33.33% of their counter-narratives).

* Julian san and Xhan-san share 6 counter-narratives (33.33% of their counter-narratives)

* Juan-san and Ken-san share 6 counter-narratives (33.33% of their counter-narratives).


In [7]:
comparison_results[0]

{'subfolder1': '/content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Juan-san',
 'subfolder2': '/content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Ken-san',
 'percentage': 33.33333333333333,
 'total_repeated': 6,
 'repeated_elements': ["I do not shared your opinion, but I do acknowledge that opinions, mine or your, do not have to be based on fact or knowledge. My 'experience' of Muslims, as neighbours, colleagues, friends hasn't led me to view them as a problem.",
  'This person may be the only one able to supply you some bread so, appreciate it!',
  "What happened to the good old British values of stuff like 'tolerance' and 'respect for the rule of law' - what you are suggesting sounds like mob rule, vigilante action and people running around with burning torches?",
  'List of positive news stories.',
  'Rapists are, in most cases, friends, family, or partners of the victim. The proportion of Pakistani people who are rapists is no more than that of White British po

## Compare annotations of each annotator

### Load each annotators annotations.


In [69]:
def list_of_lists_to_dataframe(data):
    """
    Converts a list of lists of strings into a pandas DataFrame.

    Args:
    data (list of lists): The input data where each sublist represents a row in the DataFrame.

    Returns:
    pd.DataFrame: A pandas DataFrame created from the input data.
    """
    # Assuming the first sublist contains the column headers
    headers = data[0]
    # Remaining sublists contain the data rows
    rows = data[1:]

    # Create the DataFrame
    df = pd.DataFrame(rows, columns=headers)

    return df


In [79]:
def process_gsheet_files_in_folder(folder_path):
    """
    Processes all .gsheet files in a specified Google Drive folder.

    Args:
    folder_path (str): The path to the Google Drive folder containing .gsheet files.

    Returns:
    dict: A dictionary where keys are file names and values are the contents of the first sheet of each Google Sheets file.
    """
    files_data = {}

    # Iterate through files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.gsheet'):
            # Read the Google Sheets URL from the .gsheet file

            print(file_name)
            try:
                # Open the Google Sheets file
                filename_without_extension = os.path.splitext(file_name)[0]
                spreadsheet = gc.open(filename_without_extension).sheet1
                # spreadsheet is a 'worksheet' object. Fetch the data
                data = spreadsheet.get_all_values()

                # Store the data in the dictionary
                files_data[filename_without_extension] = data

            except Exception as e:
                print(f"Error processing {filename_without_extension}: {e}")

    return files_data

In [89]:
# Example usage:
folder_path = '/content/drive/My Drive/niiAnnotationProjectAnnotatedExamples/Léane-san'
files_data = process_gsheet_files_in_folder(folder_path)

counternarratives_annotator_7_hate_speech_1.gsheet
counternarratives_annotator_7_hate_speech_2.gsheet
counternarratives_annotator_7_hate_speech_3.gsheet
counternarratives_annotator_7_hate_speech_4.gsheet
counternarratives_annotator_7_hate_speech_5.gsheet
counternarratives_annotator_7_hate_speech_6.gsheet
counternarratives_annotator_7_hate_speech_7.gsheet
counternarratives_annotator_7_hate_speech_8.gsheet
counternarratives_annotator_7_hate_speech_9.gsheet
counternarratives_annotator_7_hate_speech_10.gsheet
counternarratives_annotator_7_hate_speech_11.gsheet
counternarratives_annotator_7_hate_speech_12.gsheet
counternarratives_annotator_7_hate_speech_13.gsheet
counternarratives_annotator_7_hate_speech_14.gsheet
counternarratives_annotator_7_hate_speech_15.gsheet
counternarratives_annotator_7_hate_speech_16.gsheet
counternarratives_annotator_7_hate_speech_17.gsheet
counternarratives_annotator_7_hate_speech_18.gsheet
counternarratives_annotator_7_hate_speech_19.gsheet
counternarratives_ann

In [90]:
list_of_lists_to_dataframe(files_data['counternarratives_annotator_7_hate_speech_1'])

Unnamed: 0,hate speech,counternarratives,Mark 0: non appropiate; 1: not sure; 2: appropiate counteranrrative
0,If Muslims counteracted child rapists in their...,,
1,0,The quran says that muslims should not seek ho...,
2,0,Apart from all the scientists that are also mu...,
3,0,Why are you attacking muslims? Many people com...,
4,1,Intolerance of other peoples religious beliefs...,
5,0,Had lunch with some Muslim friends recently. S...,
6,1,This is not true. How can you say this about a...,
7,0,Invasion? Don't you think this expression is s...,
8,1,In fact at no point does the Quran tell people...,
9,0,Except all of those which live and add to West...,
