In [49]:
from types import SimpleNamespace

# Notebook configuration, gathered in one namespace so the cells below can be
# read like a script that received command-line arguments.
args = SimpleNamespace(
    # --- input table and the columns identifying texts and mentions ---
    input_file='../../data/intermediate/social_group_mentions_ranked.tsv',
    text_col='text',
    text_id_col='text_id',
    mention_col='mention',
    mention_id_col='mention_nr',
    # str.format template used to build the combined mention id
    mention_id_format='{text_id}-{mention_id}',

    # --- sampling: rank the rows, skip the first `skip_n`, keep `sample_n` ---
    skip_n=0,
    sample_n=300,
    rank_by='informativeness_rank',
    # (sic) the misspelled attribute name is kept because other cells read it
    rank_higest_first=False,
    group_by=None,

    # --- attribute ontology (YAML) and the LLM used for annotation ---
    attributes_file='../../data/annotations/group_mention_categorization/group_attributes.yaml',
    model_name='meta/meta-llama-3-70b-instruct',
)

# social group, coder training (batch 1)
# - args.input_file = '../../data/intermediate/social_group_mentions_ranked.tsv'
# - args.skip_n = 0
# - args.sample_n = 300
# - args.output_file = '../../data/annotations/group_mention_categorization/social-group-mention-categorization-coder-training/qualtrix_input.txt'

In [5]:
import os
import yaml

import pandas as pd
from llms import PatientReplicateLlama3

In [50]:
# Instantiate the LLM client imported from the project-local `llms` module.
# The generic system prompt is only a placeholder: the categorization loop
# further down overwrites `model.system_prompt` for every attribute.
model = PatientReplicateLlama3(model=args.model_name, system_prompt='Act as a helpful assistant')

In [8]:
# Read the tab-separated mentions table configured in `args.input_file`.
df = pd.read_csv(args.input_file, sep='\t')

In [None]:
# # TODO: this should be in the data
# df['manifesto_id'] = df['mention_id'].str.split('-').str[0]

In [9]:
# Build one id per mention by combining the text id and the within-text mention
# number according to `args.mention_id_format`.
#
# The two id columns are zipped instead of using `df.apply(..., axis=1)`:
# row-wise apply packs every row into a single Series with one common dtype, so
# integer ids can be rendered as floats (e.g. "12.0-3.0") when the frame holds
# only numeric columns, and it is also considerably slower on large tables.
df['mention_id'] = [
    args.mention_id_format.format(text_id=text_id, mention_id=mention_id)
    for text_id, mention_id in zip(df[args.text_id_col], df[args.mention_id_col])
]

In [10]:
# Order the mentions by the configured rank column so that the sampling cell
# can take the top-ranked window.
if args.group_by:
    # Grouped sampling was only ever sketched, never implemented:
    #   df.groupby(args.group_by).apply(lambda g: g.sample(args.sample_n)).reset_index(drop=True)
    # Fail loudly instead of silently skipping both the grouping and the
    # ranking, which would make the sample below an arbitrary slice of the file.
    raise NotImplementedError(
        f'group-wise sampling (args.group_by={args.group_by!r}) is not implemented; '
        'set args.group_by = None'
    )

# Assign the sorted frame rather than sorting in place, so the step is explicit.
df = df.sort_values(by=args.rank_by, ascending=not args.rank_higest_first)

In [11]:
# Keep the window of ranked rows [skip_n, skip_n + sample_n).
#
# A positional slice replaces `head(skip_n + sample_n).tail(sample_n)`: when
# fewer than skip_n + sample_n rows are available, `tail(sample_n)` reaches back
# into the rows that were supposed to be skipped. With enough rows both forms
# select exactly the same records.
#
# `.copy()` detaches the sample from the full table so that the column
# assignments in later cells operate on an independent frame.
df = df.iloc[args.skip_n:args.skip_n + args.sample_n].copy()

In [12]:
import numpy as np
# Turn the newline-separated context columns into lists of sentences
# (missing context becomes `['']`, exactly as before).
for context_col in ('prev_texts', 'next_texts'):
    # Guard against re-running the cell: `.str.split` applied to values that
    # are already lists yields NaN, which would silently wipe the context.
    if df[context_col].map(lambda value: isinstance(value, list)).all():
        continue
    # Bracket access instead of attribute assignment (`df.prev_texts = ...`),
    # which only works for columns that already exist.
    df[context_col] = df[context_col].replace(np.nan, '').str.split('\n')

### Format for Qualtrics

#### load and parse the attributes

In [13]:
# Load the attribute ontology (YAML) that drives the categorization loop.
# `os.path.join` with a single argument returns it unchanged, so the configured
# path is used directly.
fp = args.attributes_file
# Explicit UTF-8 so the result does not depend on the platform's default
# encoding.
with open(fp, 'r', encoding='utf-8') as f:
    # NOTE(review): FullLoader can construct more than plain mappings, lists and
    # scalars. If the ontology only contains plain data, `yaml.safe_load(f)` is
    # the safer choice — confirm against the file before switching.
    ontology = yaml.load(f, Loader=yaml.FullLoader)

#attributes = {}
#for dim, data  in ontology['social_group'].items():
#    attrs = []
#    for a, exs in data['attributes'].items():
#        attrs.append( '<b>'+a+'</b>' + ' (e.g., ' + ', '.join(exs) + ', etc.)' if exs else a )
#    attributes[data['display_name']] = attrs

In [35]:
# System-prompt template for the per-attribute Yes/No coding task.
# It is rendered with `str.format`, which fills two placeholders:
#   {task_definition} - phrase naming the attribute being coded
#   {examples}        - comma-separated example mentions for that attribute
# Doubled braces (`{{` / `}}`) are `str.format` escapes: they come out as single
# literal braces in the rendered prompt (the input-format illustration and the
# JSON response scheme).
prompt_template = """# Instructions

## Input and input format

You will be presented with a sentence from an election manifesto of a political party.
The sentence mentions a social group.
The group mention is already highlighted for you in the sentence.

Specifically, the sentence and mention texts will be shown in separate lines and enclosed in tripple quotes like this:

```
sentence: '''{{sentence text}}'''
mention: '''{{mention}}'''
```

## Task background and objective

To appeal to or talk about social groups, politicians name various attributes that clarify what kind of people they are talking about.
These attributes can relate to people's status (rich, poor) their socio-demographic characteristics (age, gender, employment status, etc.), their values or attitudes (“freedom-loving people”), their behavior (“those who work hard”, “honest people”), or any other kind of categorization used to distinguish some people from others ("those with the broadest shoulders").

We want to know what attributes are used to name, describe, or define the group being mentioned.

## Your task

Your task is to indicate whether or not the mention uses {task_definition} to define, describe, or refer to the social group.

For example, group mentions that use {task_definition} to define, describe, or refer to the social group include: {examples}

### Step-by-step instructions

1. **Read the sentence** and try to understand what group is referred to in the highlighted passage (the “group mention”).
2. **Analyze the mention** to understand what attributes are used to name, define, or describe the group.
3. **Code the mention:** Indicate with "Yes" or "No" whether or not the mention uses {task_definition} to define, describe, or refer to the social group.
4. **Explain your reasoning** in a single sentence.

## Response format

Return a JSON object using the following scheme:

{{"label": <"Yes" or "No">, "reasoning": "<your reasoning>"}}
"""

In [46]:
def format_input(text, mention):
    """Render one sentence/mention pair as the model's user message.

    Both values are wrapped in triple single quotes on separate
    ``sentence:`` / ``mention:`` lines.

    Args:
        text: The sentence that contains the group mention.
        mention: The text of the group mention itself.

    Returns:
        The formatted two-line input string (with a trailing newline).
    """
    return f"sentence: '''{text}'''\nmention: '''{mention}'''\n"


# One formatted input per sampled mention, in row order.
# Zipping the two columns replaces `df.apply(..., axis=1).to_list()`: on an empty
# frame row-wise apply can hand back a DataFrame, which has no `to_list()`, and
# it is slower than iterating the columns directly.
inputs = [
    format_input(text, mention)
    for text, mention in zip(df[args.text_col], df[args.mention_col])
]

In [69]:
# NOTE(review): presumably makes the wrapper return parsed JSON objects rather
# than raw text — confirm in the `llms` module.
model.json_output = True

# Resume support: keep the `labels` collected by an earlier (possibly
# interrupted) run of this cell and start from scratch only on a fresh kernel.
try:
    len(labels)
except NameError:
    labels = {}

# Code every sampled mention once per attribute category of the ontology.
for dim, attrs in ontology['social_group'].items():
    dim_labels = labels.setdefault(dim, {})
    for cat, exs in attrs['attributes'].items():
        # Skip categories that are already coded and those whose name starts
        # with an '<i>' tag.
        if cat in dim_labels or cat.startswith('<i>'):
            continue
        print(f'processing "{dim}" > "{cat}"')

        if exs:
            examples = ', '.join(f'"{ex}"' for ex in exs)
        else:
            examples = '<no examples provided>'

        # The attribute-specific instructions become the system prompt.
        prompt = prompt_template.format(
            task_definition=f'*{cat}* as an attribute',
            examples=examples,
        )
        model.system_prompt = prompt
        dim_labels[cat] = model(inputs, seed=1234, temperature=0.01)

processing "economic_attributes" > "class membership"


  0%|          | 0/300 [00:00<?, ?it/s]

processing "economic_attributes" > "employment status"


  0%|          | 0/300 [00:00<?, ?it/s]

processing "economic_attributes" > "education level"


  0%|          | 0/300 [00:00<?, ?it/s]

processing "economic_attributes" > "income/wealth/economic status"


  0%|          | 0/300 [00:00<?, ?it/s]

processing "economic_attributes" > "occupation/profession"


  0%|          | 0/300 [00:00<?, ?it/s]

processing "economic_attributes" > "ecology of group"


  0%|          | 0/300 [00:00<?, ?it/s]

processing "non_economic_attributes" > "age"


  0%|          | 0/300 [00:00<?, ?it/s]

processing "non_economic_attributes" > "family"


  0%|          | 0/300 [00:00<?, ?it/s]

ReadTimeout: The read operation timed out

In [57]:
# Re-apply `prompt`, i.e. whatever value the categorization loop assigned last.
# NOTE(review): this depends on leftover kernel state (the execution counts are
# out of order) — confirm this scratch cell is still needed.
model.system_prompt = prompt

In [64]:
# NOTE(review): duplicates the `model.json_output = True` at the top of the
# categorization loop cell. The output displayed under this cell was produced by
# a model call that is no longer part of the cell's code.
model.json_output = True

  0%|          | 0/4 [00:00<?, ?it/s]

[{'label': 'No',
  'reasoning': "The mention 'People' is too general and does not specify any attribute related to income/wealth/economic status."},
 {'label': 'No',
  'reasoning': "The mention 'the people' does not use income/wealth/economic status as an attribute to define, describe, or refer to the social group."},
 {'label': 'No',
  'reasoning': "The mention 'the dominated' does not use income/wealth/economic status as an attribute to define, describe, or refer to the social group, but rather implies a power dynamic or social hierarchy."},
 {'label': 'No',
  'reasoning': "The mention 'Foreign educational personnel' does not use income/wealth/economic status as an attribute to define the group, but rather their nationality and occupation."}]