In [1]:
from types import SimpleNamespace

# Notebook configuration, gathered in one namespace; the cells below read their
# settings as `args.<name>`.
#
# Settings used for the runs so far:
#   social group, coder training (batch 1)
#   - input_file  = '../../data/intermediate/social_group_mentions_ranked.tsv'
#   - skip_n      = 0
#   - sample_n    = 300
#   - output_file = '../../data/annotations/group_mention_categorization/social-group-mention-categorization-coder-training/qualtrix_input.txt'
args = SimpleNamespace(
    # input table and the columns read from it
    input_file='../../data/intermediate/social_group_mentions_ranked.tsv',
    text_col='text',
    text_id_col='text_id',
    mention_col='mention',
    mention_id_col='mention_nr',
    # template used to build the `mention_id` column
    mention_id_format='{text_id}-{mention_id}',

    # sampling: after ranking, skip the first `skip_n` rows, keep `sample_n`
    skip_n=0,
    sample_n=300,
    # number of mentions written per output file
    chunk_size=50,
    rank_by='informativeness_rank',
    # (sic) the misspelt name is what the ranking cell reads
    rank_higest_first=False,
    group_by=None,

    # YAML file with the attribute categories shown to annotators
    attributes_file='../../data/annotations/group_mention_categorization/group_attributes.yaml',

    # destination of the generated survey text
    output_file='../../data/annotations/group_mention_categorization/social-group-mention-categorization-coder-training/qualtrix_input.txt',
)

In [2]:
import os
import yaml

import pandas as pd

In [3]:
# Read the tab-separated mention table named in the configuration.
df = pd.read_csv(filepath_or_buffer=args.input_file, delimiter='\t')

In [4]:
# # TODO: this should be in the data
# df['manifesto_id'] = df['mention_id'].str.split('-').str[0]

In [5]:
# Build a per-mention identifier from the text id and the mention number.
# Iterating the two columns directly avoids a row-wise `DataFrame.apply`
# (slow, and it yields an empty DataFrame rather than a column when the table
# has no rows).
df['mention_id'] = [
    args.mention_id_format.format(text_id=text_id, mention_id=mention_nr)
    for text_id, mention_nr in zip(df[args.text_id_col], df[args.mention_id_col])
]

In [6]:
if args.group_by:
    # Grouped sampling was sketched here but never finished:
    #   df = df.groupby(args.group_by)
    #   df.ngroups
    #   df.apply(lambda g: g.sample(args.sample_n)).reset_index(drop=True)
    # Until it exists, say so instead of silently leaving `df` unranked: the
    # next cell would otherwise take its sample from the file's row order.
    import warnings
    warnings.warn(
        f'args.group_by={args.group_by!r} is set but grouped sampling is not '
        'implemented; the mentions are left in their original order.'
    )
else:
    # Rank the mentions; `ascending` is the inverse of the "highest first" flag.
    # Reassigning (rather than sorting in place) keeps the cell re-runnable.
    df = df.sort_values(by=args.rank_by, ascending=not args.rank_higest_first)

In [7]:
# Keep rows [skip_n, skip_n + sample_n) of the ranked table.
# A positional slice is used instead of head(...).tail(...): when the table has
# fewer than skip_n + sample_n rows, tail(sample_n) would reach back into the
# rows that were supposed to be skipped.
# `.copy()` detaches the sample so later column assignments do not write to a
# slice of the full table.
df = df.iloc[args.skip_n:args.skip_n + args.sample_n].copy()

In [8]:
import numpy as np  # kept: other cells may rely on `np` being in scope


def _split_context(value):
    """Split a newline-separated context string into a list of sentences.

    Missing or empty context becomes an empty list. (Splitting '' on a newline
    would give [''], which has length 1 and would later be rendered as an
    empty context row.) Values that are already lists are returned unchanged
    so the cell can be re-run.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value:
        return value.split('\n')
    return []


for _col in ('prev_texts', 'next_texts'):
    df[_col] = df[_col].map(_split_context)

### Format for Qualtrics

#### load and parse the attributes

In [9]:
# Load the attribute ontology and turn it into
# {dimension display name: [HTML answer label, ...]} for the survey questions.
fp = args.attributes_file
with open(fp, 'r', encoding='utf-8') as f:
    # NOTE(review): FullLoader can construct more than plain data types; if this
    # file only holds mappings/lists/strings, `yaml.safe_load(f)` is the safer call.
    ontology = yaml.load(f, Loader=yaml.FullLoader)

attributes = {}
for dim, data in ontology['social_group'].items():
    attrs = []
    for a, exs in data['attributes'].items():
        # The attribute name is always bold; the example list is appended only
        # when examples exist. (Previously the conditional applied to the whole
        # concatenation, so attributes without examples lost their <b> tags.)
        label = f'<b>{a}</b>'
        if exs:
            label += ' (e.g., ' + ', '.join(map(str, exs)) + ', etc.)'
        attrs.append(label)
    attributes[data['display_name']] = attrs

In [10]:
# block = make_mention_categorization_block(
#     item_id = 'TEST',
#     sentence = 'Sentence containing a MENTION.',
#     mention = 'MENTION',
#     attributes = {'red color': ['apple', 'cherry'], 'blue color': ['sky', 'ocean']}
# )
# print(block)

In [11]:
from utils.qualtrix import *
from typing import List, Optional

def make_mention_categorization_block(
        item_id: str,
        sentence: str,
        mention: str,
        attributes: dict[str, list[str]],
        pre_context: Optional[List[str]]=None,
        post_context: Optional[List[str]]=None,
        as_block: bool=True,
        add_pagebreak: bool=False
) -> str:
    """Assemble the survey text for categorizing one group mention.

    The result is the newline-joined sequence of: an optional opener, a text
    element showing the sentence (with context, if given), a data-quality
    matrix, the universal-attributes question, one matrix question per entry
    of `attributes`, the stance question and a free-text comment field.

    The individual questions are rendered by `only_text`, `matrix_question`,
    `mc_question` and `text_entry`, which are not defined in this notebook
    (presumably provided by the `utils.qualtrix` star import -- verify).

    Parameters
    ----------
    item_id : prefix for every question id (``{item_id}__text``, ...).
    sentence : focal sentence as HTML; inserted verbatim (not escaped).
    mention : mention text shown in the question wording; inserted verbatim.
    attributes : maps a dimension name (used in the question wording and, with
        spaces replaced by underscores, in the question id) to its answer labels.
    pre_context, post_context : sentences shown before/after the focal
        sentence; an empty list or None omits the corresponding table row.
    as_block : start the output with ``[[Block:{item_id}]]``.
    add_pagebreak : start the output with ``[[PageBreak]]``; only consulted
        when `as_block` is False.

    Returns
    -------
    str : the concatenated question definitions.
    """
    # opener: a block declaration takes precedence over a page break
    if as_block:
        elements = [f"[[Block:{item_id}]]\n"]
    elif add_pagebreak:
        elements = ['[[PageBreak]]\n']
    else:
        elements = []

    has_context = bool(post_context or pre_context)

    # table with the focal sentence and (optionally) its neighbours; it is
    # shown twice: in the opening text element and in the stance question
    table_rows = ['<table>']
    if pre_context:
        pre = '<br>'.join(pre_context)
        table_rows.append(f'  <tr><td><i>Prev. sentence(s)</i></td><td>{pre}</td></tr>')
    table_rows.append(f'  <tr><td><i>Focal sentence</i></td><td>{sentence}</td></tr>')
    if post_context:
        post = '<br>'.join(post_context)
        table_rows.append(f'  <tr><td><i>Next sentence(s)</i></td><td>{post}</td></tr>')
    table_rows.append('</table>')

    # display sentence and mention
    lead = '<b>Below the sentence'
    if has_context:
        lead += ' with some context'
    lead += '</b> (group mention marked bold):'
    q = only_text(
        question_text='\n'.join([lead, '<br>', '<br>', *table_rows]),
        question_id=f'{item_id}__text',
    )
    elements.append(q)

    # data quality
    q = matrix_question(
        question_text='Do you think the focal sentence has formatting issues or translation errors?',
        question_id=f'{item_id}__data_quality',
        choices=[
            '<b>Formatting issue(s)</b> (e.g., wrong sentence splitting, etc.)',
            '<b>Translation error(s)</b>'
        ],
        multiple_choice=True,
        answers={1: 'Yes', 99: '<i>Unsure</i>'},
    )
    elements.append(q)

    # universal attributes
    data = [
        f'<i>Sentence:</i> &ldquo;{sentence}&rdquo;',
        '<br>',
        '<br>',
        'Some group mentions use non-distinguishable attributes that we call universal (e.g., &ldquo;the people&rdquo;, &ldquo;consumers&rdquo;, &ldquo;humans&rdquo;, &ldquo;voters&rdquo;, etc.).',
        '<br>',
        '<br>',
        f'<b>Question:</b> Does the group mention &ldquo;<b>{mention}</b>&rdquo; use <b><i>universal attribute(s)</i></b>?',
    ]
    q = mc_question(
        question_text='\n'.join(data),
        question_id=f'{item_id}__universal_attributes',
        choices={1: 'Yes', 0: 'No', 99: '<i>Unsure</i>'},
        horizontal=True,
    )
    elements.append(q)

    # add attribute questions (one matrix per attribute dimension)
    for a, choices in attributes.items():
        data = [
            f'<i>Sentence:</i> &ldquo;{sentence}&rdquo;',
            '<br>',
            '<br>',
            f'<b>Question:</b> Which <b><i>{a}</i></b> does the group mention &ldquo;<b>{mention}</b>&rdquo; name, if any?',
            '<br>',
            '(multiple answers possible)'
        ]
        q = matrix_question(
            question_text='\n'.join(data),
            question_id=f'{item_id}__{a.replace(" ", "_")}',
            choices=choices,
            multiple_choice=True,
            answers={1: 'Yes', 99: '<i>Unsure</i>'},
        )
        elements.append(q)

    # # question for "other" attributes
    # tmp = '/'.join(list(attributes.keys()))
    # data = [
    #         f'<i>Sentence:</i> &ldquo;{sentence}&rdquo;',
    #         '<br>',
    #         '<br>',
    #         f'<b>Question:</b> Does the group mention &ldquo;<b>{mention}</b>&rdquo; name any <b><i>other attributes</i></b> that do not fit the {tmp} distinction?',
    #     ]
    # q = mc_question(
    #     question_text='\n'.join(data),
    #     question_id=f'{item_id}__other_attributes',
    #     choices={1: 'Yes', 0: 'No', 99: '<i>Unsure</i>'},
    #     horizontal=True,
    # )
    # elements.append(q)

    # stance: repeat the sentence table under a different lead, then ask
    if has_context:
        stance_lead = '<b>Here the <i>same</i> sentence again with some context:</b>'
    else:
        stance_lead = '<b>Here the <i>same</i> sentence again:</b>'
    stance_text = [
        stance_lead,
        '<br>',
        '<br>',
        *table_rows,
        '<br>',
        '<br>',
        # wording fixed: "How would your describe" -> "How would you describe"
        f'<b>Question:</b> How would you describe the sentence <b><i>author\'s stance</i></b> towards the group &ldquo;<b>{mention}</b>&rdquo;?'
    ]
    q = mc_question(
        question_text='\n'.join(stance_text),
        question_id=f'{item_id}__stance',
        choices={-1: 'Negative', 0: 'Neutral', 1: 'Positive', 99: '<i>Unsure</i>'},
        horizontal=True,
    )
    elements.append(q)

    # open text field for comments
    q = text_entry(
        question_text='Enter any comments you have here.',
        question_id=f'{item_id}__comments',
    )
    elements.append(q)

    return '\n'.join(elements)

In [12]:
def highlight_mention(text: str, s: int, e: int) -> str:
    """Return `text` with the character span ``text[s:e]`` wrapped in ``<b>`` tags."""
    before, span, after = text[:s], text[s:e], text[e:]
    return f'{before}<b>{span}</b>{after}'

def parse_row(row):
    """Render one row of the mention table as survey text.

    Reads the mention id, the character offsets ``start``/``end``, the sentence
    and mention text (columns named by ``args.text_col`` / ``args.mention_col``)
    and the ``prev_texts`` / ``next_texts`` context from `row`, bolds the
    mention inside the sentence and passes everything, together with the
    module-level `attributes`, to `make_mention_categorization_block`
    (with ``as_block=False``).
    """
    def _context(lines):
        # Normalise a context cell to a non-empty list of sentences or None.
        # Blank entries are dropped: '' split on a newline gives [''], which
        # would otherwise be rendered as an empty context row.
        if isinstance(lines, str):
            lines = lines.split('\n')
        if not isinstance(lines, (list, tuple)):
            return None
        kept = [ln for ln in lines if isinstance(ln, str) and ln.strip()]
        return kept or None

    # offsets may arrive as floats (e.g. 12.0); slicing needs ints
    start, end = int(row['start']), int(row['end'])
    return make_mention_categorization_block(
        item_id=row['mention_id'],
        sentence=highlight_mention(row[args.text_col], start, end),
        mention=row[args.mention_col],
        attributes=attributes,
        pre_context=_context(row['prev_texts']),
        post_context=_context(row['next_texts']),
        as_block=False
    )

In [13]:
# Intro block (welcome text, task description, annotator-name field) that is
# prepended to the first output file; it ends by opening the `data` block that
# the mention questions follow.
# NOTE(review): the link to the task instructions has an empty href.
header = """
[[Block:intro]]

[[Question:Text]]
<h1>Welcome to the annotation task</h1>

<p>This is the online annotation tool for our <b>social group mention categorization</b> task.</p>
<br>
<h3>Your task</h3>

<p><i>Note:</i> Please refer to the <a href="">task instructions</a> for detailed guidelines on how you should complete this task.</p>

<p>
  You will be presented with <b>a sentence that contains a group mention</b>. 
  Please read the sentence and try to understand which group is referred to (see the highlighted words).
</p>
<br>
<p>
  Below the sentence, you will then have the opportunity to indicate data quality issues, such as apparent translation or sentence splitting errors.
  If you find that a sentence has such issues, please nevertheless try to answer all of the following coding questions outlined next.
</p>
<br>
<p>Next, you will be shown <b>four coding questions</b> with dedicated answer categories:</p>
<br>
<ol>
	<li>Does the group mention use <b><i>universal attribute(s)</i></b>?<br>(single choice)</li>
  <li>Which <b><i>economic attributes</i></b> does the group mention name, if any?<br>(multiple choices possible)</li>
  <li>Which <b><i>non-economic attributes</i></b> does the group mention name, if any?<br>(multiple choices possible)</li>
  <li>How would you describe the sentence <b><i>author's stance</i></b> towards the group?<br>(single choice)</li>
</ol>
<br>
<p>
  Select the answers that best describe your understanding of the mention's meaning in its sentence context.
  <i>Note:</i> For questions 2. and 3., you might select none, one, or multiple of the available categories.
</p>
<br>
<p>At the end you will find a free text field where you can add any comments you have.</p>
<br>
<p>If you have answered all questions for a mention, click on the "Next" button to progress to the next sentence.</p>

[[PageBreak]]

[[Question:TextEntry:SingleLine]]
[[ID:annotator_name]]
Please enter your name so that we know who's coded the sentences shown on the next pages.

[[Block:data]]

"""

In [14]:
# Render every sampled mention as survey text, one string per row.
# Built explicitly as a Series: `df.apply(..., axis=1)` on an empty table
# returns an empty DataFrame, and joining that later would iterate its column
# labels instead of rendered blocks.
blocks = pd.Series(
    [parse_row(row) for _, row in df.iterrows()],
    index=df.index,
    dtype=object,
)

In [15]:
import math

# Write the survey text to disk, either as one file (chunk_size <= 0) or as
# numbered files of `args.chunk_size` mentions each. The chunk size is taken
# from the configuration cell; it is no longer overridden here.

# create the output folder (a bare filename has no directory part to create)
out_dir = os.path.dirname(args.output_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

block_sep = '\n\n[[PageBreak]]\n\n'
# every file starts with this marker, including the single-file output
file_prefix = '[[AdvancedFormat]]\n\n'

rendered = list(blocks) if len(blocks) else []

if args.chunk_size <= 0:
    out = file_prefix + header + block_sep.join(rendered)
    with open(args.output_file, 'w', encoding='utf-8') as f:
        f.write(out)
else:
    # computed only here: dividing by chunk_size before the check above would
    # raise ZeroDivisionError for chunk_size == 0
    pad = math.ceil(len(rendered) / args.chunk_size)
    fmt = f'-{{i:0{len(str(pad))}d}}'
    # insert the chunk number before the extension, whatever the extension is
    stem, ext = os.path.splitext(args.output_file)
    for i, idx in enumerate(range(0, len(rendered), args.chunk_size)):
        out = file_prefix
        if i == 0:
            # only the first file carries the intro block
            out += header
        out += block_sep.join(rendered[idx:idx + args.chunk_size])
        with open(stem + fmt.format(i=i) + ext, 'w', encoding='utf-8') as f:
            f.write(out)