In [1]:
# TODO:
#  - decide which parts are reusable and should be moved into a module

In [1]:
import json
import os
from pprint import pprint
from typing import List, Literal

import numpy as np
import pandas as pd
import yaml
from pydantic import BaseModel, ValidationError
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

from llama_index.core.program import LLMTextCompletionProgram
from llama_index.llms.ollama import Ollama

## Prepare the data

In [2]:
# Read the consolidated annotations (tab-separated) into a data frame.
data_path = "../../../data/annotations/group_mention_categorization/social-group-mention-categorization-round02"
fp = os.path.join(data_path, "consolidated_annotations.tsv")
df = pd.read_csv(filepath_or_buffer=fp, sep="\t")

In [5]:
# # inspect stance annotations
# df.loc[df.attribute=='stance', 'label'].value_counts()
# df.loc[df.label=='Negative', ['text', 'mention']].drop_duplicates()

In [3]:
# Keep only the rows annotated for the two attribute dimensions used below.
attributes = ['economic', 'non-economic']
in_scope = df['attribute'].isin(attributes)
df = df.loc[in_scope]

In [19]:
# inspect distribution of econ attributes
econ_rows = df[df.attribute == attributes[0]]
tmp = (
    econ_rows
    .value_counts(['category', 'label'])
    .reset_index()
    .pivot(index='category', columns='label', values='count')
    .fillna(0)  # category/label combinations that never occur count as zero
)
# number of annotations per category and share labelled 'Yes'
tmp['n'] = tmp[['No', 'Yes']].sum(axis=1)
tmp['prevalence'] = tmp['Yes'] / tmp['n']
tmp.round(3)

label,No,Yes,n,prevalence
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
class membership,421.0,28.0,449.0,0.062
ecology of group,432.0,17.0,449.0,0.038
education level,435.0,14.0,449.0,0.031
employment status,403.0,46.0,449.0,0.102
income/wealth/economic status,403.0,46.0,449.0,0.102
occupation/profession,364.0,85.0,449.0,0.189
other,449.0,0.0,449.0,0.0


In [20]:
# inspect distribution of non-econ attributes
nonecon_rows = df[df.attribute == attributes[1]]
tmp = (
    nonecon_rows
    .value_counts(['category', 'label'])
    .reset_index()
    .pivot(index='category', columns='label', values='count')
    .fillna(0)  # category/label combinations that never occur count as zero
)
# number of annotations per category and share labelled 'Yes'
tmp['n'] = tmp[['No', 'Yes']].sum(axis=1)
tmp['prevalence'] = tmp['Yes'] / tmp['n']
tmp.round(3)

label,No,Yes,n,prevalence
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,384,65,449,0.145
crime,434,15,449,0.033
ethnicity,442,7,449,0.016
family,410,39,449,0.087
gender/sexuality,438,11,449,0.024
health,428,21,449,0.047
nationality,383,66,449,0.147
other,446,3,449,0.007
place/location,438,11,449,0.024
religion,444,5,449,0.011


In [393]:
# Label counts per category for the second attribute dimension (no NaN filling here).
# NOTE(review): the recorded output of this cell lists the economic categories although
# the code selects attributes[1]; the output looks stale -- re-run to confirm.
selected_rows = df[df.attribute == attributes[1]]
tmp = (
    selected_rows
    .value_counts(['category', 'label'])
    .reset_index()
    .pivot(index='category', columns='label', values='count')
)
tmp['n'] = tmp[['No', 'Yes']].sum(axis=1)
tmp

label,No,Yes,n
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
class membership,421.0,28.0,449.0
ecology of group,432.0,17.0,449.0
education level,435.0,14.0,449.0
employment status,403.0,46.0,449.0
income/wealth/economic status,403.0,46.0,449.0
occupation/profession,364.0,85.0,449.0
other,449.0,,449.0


### Reshape annotations into a format suitable for analysis

In [80]:
def _join_positive_categories(rows):
    """Return the categories labelled 'Yes' in `rows`, joined with '; '."""
    positive = rows.category[rows.label == 'Yes']
    return '; '.join(positive.tolist())

# gather labels and pivot wider
exemplars = (
    df
    .groupby(['mention_id', 'mention', 'attribute'])
    .apply(_join_positive_categories, include_groups=False)
    .reset_index()
    .rename(columns={0: 'labels'})
    .pivot(index=['mention_id', 'mention'], columns='attribute', values='labels')
    .reset_index()
)

In [81]:
# discard mentions with disagreeing labels on any of the attributes
def has_disagreements(x):
    """Return True if any attribute column of `x` holds more than one distinct value."""
    distinct_per_attribute = x[attributes].nunique()
    return (distinct_per_attribute > 1).any()

disagreements = exemplars.groupby('mention').apply(has_disagreements, include_groups=False)
conflicting_mentions = disagreements[disagreements].index
exemplars = exemplars[~exemplars.mention.isin(conflicting_mentions)]

In [82]:
# drop duplicates (mention ids are dropped first, so identical mentions collapse to one row)
columns_to_keep = ['mention', *attributes]
exemplars = exemplars.loc[:, columns_to_keep].drop_duplicates()

In [83]:
# split into list of labels
for a in attributes:
    label_parts = exemplars[a].str.split('; ')
    # an empty label string splits into [''], so drop empty entries to get []
    exemplars.loc[:, a] = label_parts.apply(lambda x: [e for e in x if e != ''])

## Prepare the prompt template

In [85]:
attributes_file = '../../../data/annotations/group_mention_categorization/group_attributes_v2.yaml'
# NOTE(review): yaml.safe_load would be the safer choice if the file only holds
# plain mappings/lists -- confirm before switching.
with open(attributes_file, 'r') as f:
    ontology = yaml.load(f, Loader=yaml.FullLoader)

# Build {dimension display name: {category name: example string or None}}.
scheme = {}
for dim, d in ontology['social_group'].items():
    dimension_name = d['display_name']
    dimension_categories = {}
    for k, v in d['attributes'].items():
        # the "<i>Other ..." entry is renamed to 'other ' + display name minus its last character
        if k.startswith('<i>Other'):
            category_name = 'other ' + dimension_name[:-1]
        else:
            category_name = k
        # examples are rendered as a quoted, comma-separated list ending in 'etc.'
        if v:
            examples = '"' + '", "'.join(v) + '", etc.'
        else:
            examples = None
        dimension_categories[category_name] = examples
    scheme[dimension_name] = dimension_categories

In [86]:
# Show the economic dimension of the scheme: category name -> example string (None if no examples).
pprint(scheme['economic attributes'])

{'class membership': '"upper class", "middle class", "lower class", "working '
                     'class", etc.',
 'ecology of group': '"coal miners", "green employers", "green workers", '
                     '"sustainable farmers", "those working in the fossil '
                     'sector", etc.',
 'education level': '"students", "apprentices", "graduates", etc.',
 'employment status': '"employers", "employees", "self-employed", '
                      '"unemployed", etc.',
 'income/wealth/economic status': '"high/medium/low income", "rich/poor", '
                                  '"homeowners/tenants/homeless", etc.',
 'occupation/profession': '"teachers", "farmers", "public servants", "police '
                          'officers", etc.',
 'other economic attribute': None}


In [87]:
# Base prompt shared by all attribute dimensions. Placeholders:
#   {attributes_dim}  - name of the attribute dimension
#   {attributes_list} - bullet list of the categories to choose from
#   {mention}         - the group mention to classify
# see template examples: https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/prompts/default_prompts.py
prompt_template = """\
Act as a multilabel classifier for social group attributes.

Below you will see a text that refers to a social group or a collective of people.

Your task is to classify which {attributes_dim} are explicitly used to define or describe the mentioned group, if any. Choose from the following categories:

{attributes_list}

Return your response as a JSON list of strings, where each string is a category that is used in the text to define or describe the mentioned group.
If you think that the group is not described by any of these {attributes_dim}, return an empty list `[]`.

INPUT: \"\"\"{mention}\"\"\"
RESPONSE:\
"""

def create_attributes_list(atrributes_dict):
    """Render a mapping of category name -> description as a markdown bullet list.

    Each entry becomes ``- **name**: description``. Entries whose description is
    falsy (e.g. ``None``) are rendered as ``- **name**`` only. Bullets are joined
    with newlines, without a trailing newline.
    """
    bullets = []
    for name, description in atrributes_dict.items():
        bullet = f'- **{name}**'
        if description:
            bullet += f': {description}'
        bullets.append(bullet)
    return '\n'.join(bullets)

# Pre-fill the dimension name and category list for every attribute dimension.
# The mention placeholder is passed through unchanged so it can be filled per input later.
prompt_templates = {}
for dimension_name, dimension_attrs in scheme.items():
    prompt_templates[dimension_name] = prompt_template.format(
        attributes_dim=dimension_name,
        attributes_list=create_attributes_list(dimension_attrs),
        mention='{mention}',
    )

In [89]:
# init LLM
# NOTE(review): `seed` is not among the declared constructor fields of llama_index's
# Ollama wrapper, so a top-level `seed=42` is likely dropped silently. It is passed via
# `additional_kwargs` instead, which the wrapper forwards to Ollama as model options
# -- confirm against the installed llama-index-llms-ollama version.
llm = Ollama(
    model='phi4',
    temperature=0.0,
    json_mode=True,
    additional_kwargs={'seed': 42},
)

In [92]:
# Example
# Quick check that the client responds; with json_mode=True the reply text is a JSON object.
llm.complete('How is the whether today?')

CompletionResponse(text='{  \n  "response": "I\'m not able to provide real-time weather updates. However, you can check the current weather by using a weather app or visiting a reliable weather website for your location."  \n}', additional_kwargs={'tool_calls': []}, raw={'model': 'phi4', 'created_at': '2025-01-19T12:29:34.178508Z', 'done': True, 'done_reason': 'stop', 'total_duration': 20563800917, 'load_duration': 1062888625, 'prompt_eval_count': 16, 'prompt_eval_duration': 16335000000, 'eval_count': 44, 'eval_duration': 3163000000, 'message': Message(role='assistant', content='{  \n  "response": "I\'m not able to provide real-time weather updates. However, you can check the current weather by using a weather app or visiting a reliable weather website for your location."  \n}', images=None, tool_calls=None), 'usage': {'prompt_tokens': 16, 'completion_tokens': 44, 'total_tokens': 60}}, logprobs=None, delta=None)

In [93]:
dim = 'economic attributes'

# Only the category names defined for this dimension are valid outputs.
Category = Literal[tuple(scheme[dim])]

class Categories(BaseModel):
    """Data model for selected categories (if any)"""
    categories: List[Category]

# Program that fills the dimension's prompt with a mention and parses the reply into `Categories`.
econ_attributes_classifier = LLMTextCompletionProgram.from_defaults(
    llm=llm,
    output_cls=Categories,
    prompt_template_str=prompt_templates[dim],
    verbose=True,
)

In [94]:
dim = 'non-economic attributes'

# Only the category names defined for this dimension are valid outputs.
# NOTE: this rebinds `Category`/`Categories` from the economic cell; the classifier
# created there keeps its own reference to the earlier class.
Category = Literal[tuple(scheme[dim])]

class Categories(BaseModel):
    """Data model for selected categories (if any)"""
    categories: List[Category]

# Program that fills the dimension's prompt with a mention and parses the reply into `Categories`.
nonecon_attributes_classifier = LLMTextCompletionProgram.from_defaults(
    llm=llm,
    output_cls=Categories,
    prompt_template_str=prompt_templates[dim],
    verbose=True,
)

In [96]:
# Compare human and LLM labels for the first exemplar.
ex = exemplars.iloc[0]
print('"""' + ex.mention + '"""')

econ_pred = econ_attributes_classifier(mention=ex.mention).categories
print(f" -     econ: {ex['economic']} (humans); {econ_pred} (llm)")

nonecon_pred = nonecon_attributes_classifier(mention=ex.mention).categories
print(f" - non-econ: {ex['non-economic']} (humans); {nonecon_pred} (llm)")

"""parents"""
 -     econ: [] (humans); [] (llm)
 - non-econ: ['family'] (humans); ['family'] (llm)


In [113]:
# Classifier to query for each attribute dimension (economic first, as before).
classifiers = {
    'economic': econ_attributes_classifier,
    'non-economic': nonecon_attributes_classifier,
}

# {attribute: {exemplar row label: predicted categories}}
llm_annotations = {a: {} for a in attributes}
# {attribute: {exemplar row label: error message}} for responses that failed validation.
# Previously these were dropped silently; they are recorded so they can be inspected or retried.
llm_failures = {a: {} for a in attributes}

for i, ex in tqdm(exemplars.iterrows(), total=len(exemplars)):
    for a, classifier in classifiers.items():
        try:
            llm_annotations[a][i] = classifier(mention=ex.mention).categories
        except ValidationError as err:
            # No entry is added to llm_annotations, so this row has no prediction downstream.
            llm_failures[a][i] = str(err)

# Report what was skipped instead of hiding it.
for a, failed in llm_failures.items():
    if failed:
        print(f'{a}: {len(failed)} response(s) failed validation (row labels: {list(failed)})')

# TODO: check how to handle validation errors in pydantic 
#       (see https://docs.pydantic.dev/2.9/errors/validation_errors 
#        and llama_index.core.types.BaseOutputParser class)

100%|██████████| 388/388 [1:18:08<00:00, 12.08s/it]


In [139]:
# Attach the LLM predictions as '<attribute>_pred' columns, aligned on the exemplars' index.
# NOTE: re-running this cell fails because the '_pred' columns already exist.
for attr in ['economic', 'non-economic']:
    predictions = pd.DataFrame(pd.Series(llm_annotations[attr]), columns=[attr + '_pred'])
    exemplars = exemplars.join(predictions)

### Compute agreement

In [155]:
# one-hot encode annotations
dim = 'economic'
categories = list(scheme[dim + ' attributes'].keys())

def _one_hot(labels, category_names):
    """Encode a list of category names as a 0/1 vector ordered like `category_names`."""
    return [int(c in labels) for c in category_names]

# Rows without a prediction hold NaN after the join and cannot be encoded;
# leave them out of the evaluation and say so.
has_pred = exemplars[dim + '_pred'].apply(lambda x: isinstance(x, list))
n_missing = int(len(has_pred) - has_pred.sum())
if n_missing:
    print(f'Skipping {n_missing} exemplar(s) without a valid LLM prediction')
scored = exemplars[has_pred]

# NOTE(review): the annotation table shows a category named 'other', while the scheme key
# is 'other economic attribute'; human 'other' labels would never match here -- confirm.
# The arrays are named y_true/y_pred so that the `llm` client object is not overwritten.
y_true = np.stack(scored[dim].apply(lambda x: _one_hot(x, categories)).values)
y_pred = np.stack(scored[dim + '_pred'].apply(lambda x: _one_hot(x, categories)).values)

# compute f1 score for multilabel classification
print(classification_report(y_true, y_pred, target_names=categories, zero_division=0.0))

                               precision    recall  f1-score   support

             class membership       0.39      0.35      0.37        20
            employment status       0.63      0.78      0.70        37
              education level       0.71      0.71      0.71        14
income/wealth/economic status       0.76      0.70      0.73        46
        occupation/profession       0.93      0.65      0.77        78
             ecology of group       0.69      0.64      0.67        14
     other economic attribute       0.00      0.00      0.00         0

                    micro avg       0.72      0.66      0.69       209
                    macro avg       0.59      0.55      0.56       209
                 weighted avg       0.76      0.66      0.70       209
                  samples avg       0.30      0.32      0.30       209



In [156]:
# one-hot encode annotations
dim = 'non-economic'
categories = list(scheme[dim + ' attributes'].keys())

def _one_hot(labels, category_names):
    """Encode a list of category names as a 0/1 vector ordered like `category_names`."""
    return [int(c in labels) for c in category_names]

# Rows without a prediction hold NaN after the join and cannot be encoded;
# leave them out of the evaluation and say so.
has_pred = exemplars[dim + '_pred'].apply(lambda x: isinstance(x, list))
n_missing = int(len(has_pred) - has_pred.sum())
if n_missing:
    print(f'Skipping {n_missing} exemplar(s) without a valid LLM prediction')
scored = exemplars[has_pred]

# NOTE(review): the annotation table shows a category named 'other', while the scheme key
# is 'other non-economic attribute'; human 'other' labels would never match here -- confirm.
# The arrays are named y_true/y_pred so that the `llm` client object is not overwritten.
y_true = np.stack(scored[dim].apply(lambda x: _one_hot(x, categories)).values)
y_pred = np.stack(scored[dim + '_pred'].apply(lambda x: _one_hot(x, categories)).values)

# compute f1 score for multilabel classification
print(classification_report(y_true, y_pred, target_names=categories, zero_division=0.0))

                              precision    recall  f1-score   support

                         age       0.84      0.86      0.85        49
                      family       0.92      0.71      0.80        31
            gender/sexuality       0.69      0.82      0.75        11
              place/location       0.37      0.64      0.47        11
                 nationality       0.98      0.63      0.77        63
                   ethnicity       0.50      0.50      0.50         6
                    religion       0.25      0.20      0.22         5
                      health       0.75      0.60      0.67        20
                       crime       0.65      0.79      0.71        14
   shared values/mentalities       0.79      0.34      0.48        67
other non-economic attribute       0.00      0.00      0.00         0

                   micro avg       0.76      0.61      0.68       277
                   macro avg       0.61      0.55      0.56       277
                we

## Export disagreement cases

In [157]:
# Replace the label lists (human and LLM) with 0/1 vectors ordered like the scheme's categories.
# NOTE: this overwrites the columns in place; re-running the cell on already encoded
# columns yields all-zero vectors. Rows whose prediction is missing (NaN) would raise here.
for dim in attributes:
    dim_categories = list(scheme[dim + ' attributes'].keys())
    for col in (dim, dim + '_pred'):
        exemplars[col] = exemplars[col].apply(lambda x: [int(a in x) for a in dim_categories])

In [165]:
# Flag exemplars where annotators and LLM differ on at least one of the two dimensions.
econ_differs = exemplars['economic'] != exemplars['economic_pred']
nonecon_differs = exemplars['non-economic'] != exemplars['non-economic_pred']
idxs = econ_differs | nonecon_differs

In [194]:
# Work on a copy of the exemplars flagged by `idxs`.
tmp = exemplars[idxs].copy()
# Add, per dimension, a '<dim>_category' column holding the scheme's category names
# (the same list in every row), so it can be exploded alongside the label columns.
# (Assumes the label columns were already one-hot encoded upstream.)
for dim in attributes:
    tmp.loc[:, dim+'_category'] = [list(scheme[dim+' attributes'].keys())]*len(tmp)
# pivot longer
tmp = tmp.melt(id_vars=['mention'], var_name='attribute', value_name='labels')
# Split the former column name on '_' into attribute and source: a missing suffix (None)
# is relabelled 'annotators', 'pred' becomes 'llm', and 'category' is kept as is.
tmp[['attribute', 'source']] = tmp.attribute.str.split('_', expand=True).replace({None: 'annotators', 'pred': 'llm'})
# pivot wider
tmp = tmp.pivot(index=['mention', 'attribute'], columns='source', values='labels').reset_index()
# explode: one row per (mention, attribute, category) with the two 0/1 labels side by side
tmp = tmp.explode(['category', 'annotators', 'llm', ])

In [195]:
# NOTE: can be subset to ~280 disagreement cases
labels_differ = tmp['annotators'] != tmp['llm']
tmp[labels_differ]

source,mention,attribute,annotators,category,llm
0,those who lead them,economic,1,occupation/profession,0
3,A multicultural society,non-economic,1,nationality,0
3,A multicultural society,non-economic,1,religion,0
3,A multicultural society,non-economic,1,shared values/mentalities,0
4,A professional group that sees the enormous in...,economic,1,occupation/profession,0
...,...,...,...,...,...
396,working households,economic,0,income/wealth/economic status,1
399,young people in Slovakia,non-economic,0,place/location,1
399,young people in Slovakia,non-economic,1,nationality,0
401,young people who are unable or unwilling to in...,non-economic,0,health,1


In [203]:
# Attach mention ids and source text to every compared (mention, attribute, category) row.
mention_contexts = df[['mention_id', 'text', 'mention']].drop_duplicates()
comparison_columns = ['mention', 'attribute', 'category', 'annotators', 'llm']
out = mention_contexts.merge(tmp[comparison_columns], how='inner', on='mention')

In [207]:
# Copy the rows where annotators and LLM disagree to the system clipboard.
disagreement_rows = out.query('annotators!=llm')
disagreement_rows.sort_values(['mention', 'mention_id']).to_clipboard()

In [208]:
# Write the comparison table for manual review (tab-separated, without the index).
# NOTE(review): all rows of `out` are written, not only the disagreement rows -- confirm this is intended.
data_path = "../../../data/annotations/group_mention_categorization/social-group-mention-categorization-llm-review"
fp = os.path.join(data_path, "cases.tsv")
out.to_csv(path_or_buf=fp, sep="\t", index=False)