In [233]:
import os
import numpy as np
import pandas as pd

In [234]:
# Base directory (relative path) of the coder-training annotation files read and written below.
data_path = '../../data/annotations/group_mention_categorization/social-group-mention-categorization-coder-training'

In [235]:
# Load the tab-separated expert-reviewed examples from the 'parsed' sub-folder.
fp = os.path.join(data_path, 'parsed',  'reviewed_examples.tsv')
reviewed = pd.read_csv(fp, sep='\t')

In [236]:
# Number of reviewed rows per question type (q_id).
reviewed.q_id.value_counts()

q_id
non-economic_attributes    109
economic_attributes         92
universal_attributes        61
stance                      50
Name: count, dtype: int64

## Parse the expert annotations

### parse 'universal' annotations

In [237]:
# Tabulate the raw 'universal' values on all non-stance rows; dropna=False also counts missing values.
reviewed[reviewed.q_id != 'stance'].universal.value_counts(dropna=False)

universal
.      250
Yes     12
Name: count, dtype: int64

In [238]:
# Expert decision on the 'universal' question: one row per distinct
# (mention_id, value) pair among the non-stance rows.
universal_df = (
    reviewed.loc[reviewed.q_id != 'stance', ['mention_id', 'universal']]
    .drop_duplicates()
    .rename(columns={'universal': 'value'})
    # only an explicit 'Yes' counts as universal; any other value becomes 'No'
    .assign(value=lambda d: d.value.apply(lambda x: 'Yes' if x == 'Yes' else 'No'))
    # binarising can make rows identical again, so deduplicate a second time
    .drop_duplicates()
    .assign(q_id='universal_attributes')
)
universal_df

Unnamed: 0,mention_id,value,q_id
0,11110_199109-390940-1,Yes,universal_attributes
2,11110_200609-393907-1,No,universal_attributes
5,11110_201009-394992-1,No,universal_attributes
6,11110_201809-399481-1,No,universal_attributes
9,11110_201809-399601-2,Yes,universal_attributes
...,...,...,...
299,96710_199206-378041-2,No,universal_attributes
303,96710_199809-378834-1,No,universal_attributes
304,96710_199809-378937-1,No,universal_attributes
306,96725_201603-381934-1,No,universal_attributes


In [239]:
# Sanity check: count rows per mention_id in universal_df; any mention listed below
# has more than one universal value recorded.
mention_ids = universal_df.groupby('mention_id').size()
mention_ids[mention_ids > 1] # must be empty !!!

Series([], dtype: int64)

### parse econ annotations

In [240]:
# Tabulate the raw 'econ' cell values on all non-stance rows; dropna=False also counts missing values.
reviewed[reviewed.q_id != 'stance'].econ.value_counts(dropna=False)

econ
.                                                             145
income/wealth/economic status                                  29
employment status                                              25
occupation/profession                                          20
ecology of group                                               11
education level                                                 7
employment status, class membership                             6
employment status, occupation/profession                        4
employment status, ecology of group, occupation/profession      3
income/wealth/economic status, class membership                 3
class membership                                                3
income/wealth/economic status, occupation/profession            2
Ecology of group                                                2
occupation/profession, ecology of group                         1
ecology of group, occupation/profession                         1
Name:

In [241]:
# Expert-coded economic categories in long format: one row per (mention_id, category).
econ_df = (
    reviewed.loc[reviewed.q_id != 'stance', ['mention_id', 'econ']]
    # rows whose econ cell is '.' carry no category and are handled separately
    .loc[lambda d: d.econ != '.']
    .reset_index(drop=True)
    # split multi-category cells at commas; the raw string makes sure `\s` reaches the
    # regex engine instead of being treated as an (invalid) Python string escape
    .assign(econ=lambda d: d.econ.str.split(r',\s*', regex=True))
    .explode('econ')
    # normalise case so differently capitalised spellings collapse into one category
    .assign(econ=lambda d: d.econ.str.lower())
    .drop_duplicates()
    .rename(columns={'econ': 'category'})
    .assign(q_id='economic_attributes', value='Yes')
)
econ_df.category.value_counts()

category
occupation/profession            20
employment status                18
income/wealth/economic status    17
ecology of group                 10
class membership                  6
education level                   3
Name: count, dtype: int64

In [242]:
# Category name -> numeric category ID (1-based, in the order listed).
econ_cat2id = {
    name: cat_id
    for cat_id, name in enumerate(
        [
            "class membership",
            "employment status",
            "education level",
            "income/wealth/economic status",
            "occupation/profession",
            "ecology of group",
            "other",
        ],
        start=1,
    )
}

# attach the numeric ID to each coded category; the tabulation exposes unmapped names as NaN
econ_df['q_category'] = econ_df['category'].map(econ_cat2id)
econ_df[['q_category', 'category']].value_counts(dropna=False)

q_category  category                     
5           occupation/profession            20
2           employment status                18
4           income/wealth/economic status    17
6           ecology of group                 10
1           class membership                  6
3           education level                   3
Name: count, dtype: int64

In [243]:
# Mentions whose econ cell is '.': emit an explicit 'No' row for every category
# in econ_cat2id.
econ_negative_decision = (
    reviewed.loc[reviewed.q_id != 'stance', ['mention_id', 'econ']]
    .drop_duplicates()
    .loc[lambda d: d.econ == '.']
    .reset_index(drop=True)
    .drop(columns='econ')
    .assign(q_id='economic_attributes')
    # every row gets the full list of category names, which explode unpacks into rows
    .assign(category=lambda d: [list(econ_cat2id)] * len(d))
    .explode('category')
    .reset_index(drop=True)
    .assign(
        q_category=lambda d: d.category.map(econ_cat2id),
        label='No',
    )
)

### parse non-econ annotations

In [244]:
# Tabulate the raw 'non-econ' cell values on all non-stance rows; dropna=False also counts missing values.
reviewed[reviewed.q_id != 'stance']['non-econ'].value_counts(dropna=False)

non-econ
.                                                              106
nationality                                                     47
shared values/mentalities                                       43
age                                                             17
gender/sexuality                                                 9
family                                                           8
place/location                                                   6
nationality, shared values/mentalities                           4
crime                                                            4
nationality, ethnicity, religion, shared values/mentalities      3
health                                                           3
religion                                                         2
age, family                                                      2
ethnicity, nationality, religion, shared values/mentalities      2
place/location, nationality                          

In [245]:
# Expert-coded non-economic categories in long format: one row per (mention_id, category).
nonecon_df = (
    reviewed.loc[reviewed.q_id != 'stance', ['mention_id', 'non-econ']]
    .rename(columns={'non-econ': 'category'})
    # rows whose cell is '.' carry no category and are handled separately
    .loc[lambda d: d.category != '.']
    .reset_index(drop=True)
    # split multi-category cells at commas; the raw string makes sure `\s` reaches the
    # regex engine instead of being treated as an (invalid) Python string escape
    .assign(category=lambda d: d.category.str.split(r',\s*', regex=True))
    .explode('category')
    # normalise case so differently capitalised spellings collapse into one category
    .assign(category=lambda d: d.category.str.lower())
    .drop_duplicates()
    .assign(q_id='non-economic_attributes', value='Yes')
)
nonecon_df.category.value_counts()

category
nationality                  33
shared values/mentalities    30
age                          12
family                        6
gender/sexuality              6
place/location                5
ethnicity                     3
religion                      3
health                        2
crime                         2
other                         1
Name: count, dtype: int64

In [246]:
# Category name -> numeric category ID (1-based, in the order listed).
nonecon_cat2id = {
    name: cat_id
    for cat_id, name in enumerate(
        [
            "age",
            "family",
            "gender/sexuality",
            "place/location",
            "nationality",
            "ethnicity",
            "religion",
            "health",
            "crime",
            "other",
            "shared values/mentalities",  # NOTE added after first round
        ],
        start=1,
    )
}

# attach the numeric ID to each coded category; the tabulation exposes unmapped names as NaN
nonecon_df['q_category'] = nonecon_df['category'].map(nonecon_cat2id)

nonecon_df[['q_category', 'category']].value_counts(dropna=False)

q_category  category                 
5           nationality                  33
11          shared values/mentalities    30
1           age                          12
2           family                        6
3           gender/sexuality              6
4           place/location                5
6           ethnicity                     3
7           religion                      3
8           health                        2
9           crime                         2
10          other                         1
Name: count, dtype: int64

In [247]:
# Mentions whose non-econ cell is '.': emit an explicit 'No' row for every category
# in nonecon_cat2id.
nonecon_negative_decision = (
    reviewed.loc[reviewed.q_id != 'stance', ['mention_id', 'non-econ']]
    .rename(columns={'non-econ': 'category'})
    .drop_duplicates()
    .loc[lambda d: d.category == '.']
    .reset_index(drop=True)
    .assign(q_id='non-economic_attributes')
    # replace the '.' placeholder by the full list of category names, then unpack into rows
    .assign(category=lambda d: [list(nonecon_cat2id)] * len(d))
    .explode('category')
    .reset_index(drop=True)
    .assign(
        q_category=lambda d: d.category.map(nonecon_cat2id),
        label='No',
    )
)

### parse 'stance' annotations

In [248]:
# Tabulate the raw 'stance' values on the stance rows; dropna=False also counts missing values.
reviewed[reviewed.q_id == 'stance'].stance.value_counts(dropna=False)

stance
positive    28
neutral     11
negative    10
unsure       1
Name: count, dtype: int64

In [249]:
# Expert stance labels, title-cased ('positive' -> 'Positive').
stance_df = (
    reviewed.loc[reviewed.q_id == 'stance', ['mention_id', 'q_id', 'stance']]
    .rename(columns={'stance': 'value'})
    .assign(value=lambda d: d.value.str.title())
)
stance_df.value.value_counts()

value
Positive    28
Neutral     11
Negative    10
Unsure       1
Name: count, dtype: int64

## Join with agreement cases

In [250]:
# Load the tab-separated gold examples from the 'parsed' sub-folder.
fp = os.path.join(data_path, 'parsed',  'gold_examples.tsv')
agreement_cases_df = pd.read_csv(fp, sep='\t')

In [251]:
# Rows per economic category among the gold examples.
agreement_cases_df[agreement_cases_df.q_id == 'economic_attributes'].category.value_counts()

category
class membership                 234
employment status                234
education level                  234
income/wealth/economic status    234
occupation/profession            234
ecology of group                 234
other                            234
Name: count, dtype: int64

In [252]:
# For every mention with at least one expert-coded economic category, build a complete
# Yes/No label for each category in econ_cat2id. Returning a dict from the per-group
# apply yields one row per (mention_id, q_id, dict key); after reset_index the dict keys
# sit in the column named 'level_2'.
econ_df_expanded = econ_df.groupby(['mention_id', 'q_id']).category.apply(lambda x: {c: 'Yes' if c in x.tolist() else 'No' for c in econ_cat2id}).reset_index()
# the dict values are the labels, the dict keys ('level_2') are the category names
econ_df_expanded.rename(columns={'category': 'label', 'level_2': 'category'}, inplace=True)
# append the all-'No' rows of the mentions in econ_negative_decision
econ_df_expanded = pd.concat([econ_df_expanded, econ_negative_decision[econ_df_expanded.columns]], ignore_index=True).reset_index(drop=True)
# cross-tabulate the Yes/No counts per category
econ_df_expanded[['category', 'label']].value_counts(dropna=False).unstack().reset_index()

label,category,No,Yes
0,class membership,143.0,6.0
1,ecology of group,139.0,10.0
2,education level,146.0,3.0
3,employment status,131.0,18.0
4,income/wealth/economic status,132.0,17.0
5,occupation/profession,129.0,20.0
6,other,149.0,


In [253]:
# For every mention with at least one expert-coded non-economic category, build a complete
# Yes/No label for each category in nonecon_cat2id. Returning a dict from the per-group
# apply yields one row per (mention_id, q_id, dict key); after reset_index the dict keys
# sit in the column named 'level_2'.
nonecon_df_expanded = nonecon_df.groupby(['mention_id', 'q_id']).category.apply(lambda x: {c: 'Yes' if c in x.tolist() else 'No' for c in nonecon_cat2id}).reset_index()
# the dict values are the labels, the dict keys ('level_2') are the category names
nonecon_df_expanded.rename(columns={'category': 'label', 'level_2': 'category'}, inplace=True)
# append the all-'No' rows of the mentions in nonecon_negative_decision
nonecon_df_expanded = pd.concat([nonecon_df_expanded, nonecon_negative_decision[nonecon_df_expanded.columns]], ignore_index=True).reset_index(drop=True)
# cross-tabulate the Yes/No counts per category
nonecon_df_expanded[['category', 'label']].value_counts(dropna=False).unstack().reset_index()

label,category,No,Yes
0,age,137,12
1,crime,147,2
2,ethnicity,146,3
3,family,143,6
4,gender/sexuality,143,6
5,health,147,2
6,nationality,116,33
7,other,148,1
8,place/location,144,5
9,religion,146,3


In [254]:
# # discard 
# agreement_cases_df = agreement_cases_df.merge(universal_df[key_cols[:2]], how='left', indicator=True)
# agreement_cases_df = agreement_cases_df[agreement_cases_df._merge == 'left_only'].drop(columns='_merge')
# 
# agreement_cases_df = agreement_cases_df.merge(econ_df_expanded[key_cols], how='left', indicator=True)
# agreement_cases_df = agreement_cases_df[agreement_cases_df._merge == 'left_only'].drop(columns='_merge')
# 
# agreement_cases_df = agreement_cases_df.merge(nonecon_df_expanded[key_cols], how='left', indicator=True)
# agreement_cases_df = agreement_cases_df[agreement_cases_df._merge == 'left_only'].drop(columns='_merge')
# 
# agreement_cases_df = agreement_cases_df.merge(stance_df[key_cols[:2]], how='left', indicator=True)
# agreement_cases_df = agreement_cases_df[agreement_cases_df._merge == 'left_only'].drop(columns='_merge')

In [255]:
# Stack the coder-agreement labels and the four expert-annotated frames into one long
# table, tagging each row with its source.
df = pd.concat(
    [
        agreement_cases_df[['mention_id', 'q_id', 'category', 'label']].assign(source='coders'),
        *(
            frame.assign(source='expert')
            for frame in (
                universal_df.rename(columns={'value': 'label'}),
                econ_df_expanded,
                nonecon_df_expanded,
                stance_df.rename(columns={'value': 'label'}),
            )
        ),
    ],
    ignore_index=True,
)

# numeric category ID: looked up in the mapping that belongs to the row's question type,
# NaN for every other question type
df['q_category'] = np.select(
    [
        (df.q_id == 'economic_attributes').to_numpy(),
        (df.q_id == 'non-economic_attributes').to_numpy(),
    ],
    [
        df.category.map(econ_cat2id).to_numpy(),
        df.category.map(nonecon_cat2id).to_numpy(),
    ],
    default=np.nan,
)
df = df.sort_values(['mention_id', 'q_id', 'q_category']).reset_index(drop=True)

In [256]:
# Number of distinct mentions in the combined table.
df.mention_id.nunique()

300

In [257]:
# check --- number of distinct mentions per question type
for q in df.q_id.unique():
    print(q, ':', df.loc[df.q_id == q, 'mention_id'].nunique())
df.head()

economic_attributes : 300
non-economic_attributes : 300
stance : 300
universal_attributes : 300


Unnamed: 0,mention_id,q_id,category,label,source,q_category
0,11110_198809-390636-1,economic_attributes,class membership,No,coders,1.0
1,11110_198809-390636-1,economic_attributes,employment status,No,coders,2.0
2,11110_198809-390636-1,economic_attributes,education level,No,coders,3.0
3,11110_198809-390636-1,economic_attributes,income/wealth/economic status,No,coders,4.0
4,11110_198809-390636-1,economic_attributes,occupation/profession,No,coders,5.0


In [258]:
# replace missing key values by sentinels ('' / -1.0)
df['category'] = df['category'].fillna('')
df['q_category'] = df['q_category'].fillna(-1.0)

### post-review disagreements

In [259]:
# flag every (mention, question, category) key that carries more than one distinct label,
# i.e. where the coders' agreed label and the expert annotation differ
key_cols = ['mention_id', 'q_id', 'category']
tmp = (
    df.groupby(key_cols)['label']
    .agg(lambda labels: len(set(labels)) > 1)
    .reset_index()
)
tmp[tmp.label] # yes, a few =(

Unnamed: 0,mention_id,q_id,category,label
1227,14820_197201-197410-1,universal_attributes,,True
1516,15111_201710-281205-1,universal_attributes,,True
1580,21111_198111-22981-1,economic_attributes,occupation/profession,True
1662,21111_199111-32560-1,non-economic_attributes,ethnicity,True
1666,21111_199111-32560-1,non-economic_attributes,nationality,True
1669,21111_199111-32560-1,non-economic_attributes,religion,True
1672,21111_199111-32560-1,universal_attributes,,True
2084,22110_201703-327644-2,universal_attributes,,True
2186,23113_199406-295593-1,economic_attributes,income/wealth/economic status,True
2579,41112_199012-126399-1,non-economic_attributes,crime,True


In [260]:
# fetch all rows of the flagged keys and spread their labels into one column per source
disagreements = (
    tmp.loc[tmp.label, key_cols]
    .merge(df, on=key_cols, how='left')
    .pivot(index=key_cols, columns='source', values='label')
    .reset_index()
)
# remove name of index
disagreements.columns.name = None
# add the sentence text and the mention string for manual review
disagreements = disagreements.merge(
    agreement_cases_df[['mention_id', 'text', 'mention']].drop_duplicates(),
    on='mention_id',
    how='left',
)

In [261]:
# Export the disagreement cases as TSV, but only if the file does not exist yet
# (an existing file is never overwritten).
fp = os.path.join(data_path, 'parsed',  'post_review_disagreements.tsv')
if not os.path.exists(fp):
    disagreements.to_csv(fp, sep='\t', index=False)

In [262]:
# read after manual review
fp = os.path.join(data_path, 'parsed',  'post_review_disagreements_resolved.tsv')
disagreements_resolved = pd.read_csv(fp, sep='\t')
# keep only the rows of round 6
# NOTE(review): the round number is hard-coded; presumably the round this notebook consolidates — confirm.
disagreements_resolved = disagreements_resolved[disagreements_resolved['round'] == 6]

In [263]:
# Use '' for a missing category, matching the sentinel used in df, so the key columns can be joined.
disagreements_resolved.loc[disagreements_resolved.category.isna(), 'category'] = ''

In [264]:
# look up q_category for each resolved key, then reshape to the column layout of df:
# the manual 'decision' becomes the label and the rows count as expert annotations
disagreements_resolved = (
    disagreements_resolved
    .merge(df[key_cols + ['q_category']].drop_duplicates(), on=key_cols, how='left')
    .loc[:, key_cols + ['decision', 'q_category']]
    .rename(columns={'decision': 'label'})
    .assign(source='expert')
)

In [265]:
# anti-join: keep only the rows of df whose key has no resolved disagreement
df = (
    df.merge(disagreements_resolved[key_cols], on=key_cols, how='left', indicator=True)
    .loc[lambda d: d._merge == 'left_only']
    .drop(columns='_merge')
)

# put the resolved rows in their place and restore the key ordering
df = (
    pd.concat([df, disagreements_resolved], ignore_index=True)
    .sort_values(key_cols)
    .reset_index(drop=True)
)

In [266]:
# verify that the resolution worked: no key may carry more than one distinct label
tmp = (
    df.groupby(key_cols)['label']
    .agg(lambda labels: len(set(labels)) > 1)
    .reset_index()
)
tmp[tmp.label] # should be none!

Unnamed: 0,mention_id,q_id,category,label


In [267]:
# check --- number of distinct mentions per question type
for q in df.q_id.unique():
    print(q, ':', df.loc[df.q_id == q, 'mention_id'].nunique())
df.head()

economic_attributes : 300
non-economic_attributes : 300
stance : 300
universal_attributes : 300


Unnamed: 0,mention_id,q_id,category,label,source,q_category
0,11110_198809-390636-1,economic_attributes,class membership,No,coders,1.0
1,11110_198809-390636-1,economic_attributes,ecology of group,No,coders,6.0
2,11110_198809-390636-1,economic_attributes,education level,No,coders,3.0
3,11110_198809-390636-1,economic_attributes,employment status,No,coders,2.0
4,11110_198809-390636-1,economic_attributes,income/wealth/economic status,No,coders,4.0


In [268]:
# drop duplicates (keeping expert annotations)
# Missing key values are replaced by sentinels first: groupby drops rows whose group key
# is NaN by default.
df.loc[df.category.isna(), 'category'] = ''
df.loc[df.q_category.isna(), 'q_category'] = -1.0
# Group on every column except 'source'; where the same row exists from both sources,
# 'max' keeps 'expert' because it sorts after 'coders'.
df = df.groupby(df.columns.drop('source').to_list()).agg({'source': 'max'}).reset_index()

In [269]:
# Number of rows per annotation source.
df.source.value_counts()

source
coders    2968
expert    2881
Name: count, dtype: int64

In [270]:
# check --- number of distinct mentions per question type
for q in df.q_id.unique():
    print(q, ':', df.loc[df.q_id == q, 'mention_id'].nunique())
df.head()

economic_attributes : 300
non-economic_attributes : 300
stance : 300
universal_attributes : 300


Unnamed: 0,mention_id,q_id,category,label,q_category,source
0,11110_198809-390636-1,economic_attributes,class membership,No,1.0,coders
1,11110_198809-390636-1,economic_attributes,ecology of group,No,6.0,coders
2,11110_198809-390636-1,economic_attributes,education level,No,3.0,coders
3,11110_198809-390636-1,economic_attributes,employment status,No,2.0,coders
4,11110_198809-390636-1,economic_attributes,income/wealth/economic status,No,4.0,coders


### re-review "universal" cases to check if they mention 'shared values/mentalities'

In [271]:
# collect every mention labelled universal ('Yes') together with the source of that label
tmp = df.loc[
    (df.q_id == 'universal_attributes') & (df.label == 'Yes'),
    ['mention_id', 'source'],
]

# add the sentence text and the mention string for manual review
tmp = tmp.merge(
    agreement_cases_df[['mention_id', 'text', 'mention']].drop_duplicates(),
    how='left',
)

# export for review; the file is only written when it does not exist yet
fp = os.path.join(data_path, 'parsed', 'universal_mentions.tsv')
if not os.path.exists(fp):
    tmp.loc[:, ['mention_id', 'text', 'mention', 'source']].to_csv(fp, sep='\t', index=False)

In [272]:
# Read the manually reviewed universal mentions and left-join them onto the mention_ids
# currently in tmp, so every mention in tmp is kept (with no `on=`, pandas joins on the
# columns both frames share).
fp = os.path.join(data_path, 'parsed',  'universal_mentions_resolved.tsv')
resolved = pd.read_csv(fp, sep='\t')
resolved = tmp[['mention_id']].merge(resolved, how='left')

In [273]:
# bring the review result into the layout of df: every row is an expert label for the
# non-economic category 'shared values/mentalities'
resolved = resolved.assign(
    q_id='non-economic_attributes',
    category='shared values/mentalities',
)
resolved = resolved.assign(
    q_category=resolved.category.map(nonecon_cat2id).astype(float),
    source='expert',
).rename(columns={'mentions_shared_values_mentalities': 'label'})

In [274]:
# mid = '11110_199109-390940-1'
# mid = '13951_199803-188190-1'
# resolved[np.logical_and(resolved.mention_id == mid, resolved.q_id == 'non-economic_attributes')]

In [275]:
# replace the rows of df that share a key with a re-reviewed case:
# anti-join them away first ...
df = (
    df.merge(resolved[key_cols], on=key_cols, how='left', indicator=True)
    .loc[lambda d: d._merge == 'left_only']
    .drop(columns='_merge')
)
# ... then append the reviewed rows, restore the key ordering and drop exact duplicates
df = (
    pd.concat([df, resolved[df.columns]], ignore_index=True)
    .sort_values(key_cols)
    .reset_index(drop=True)
    .drop_duplicates()
)

In [276]:
# check --- per question type, print how many (mention_id, category) pairs occur more than once
for d in df.q_id.unique():
    # size() yields the pair counts; after reset_index they sit in the column named 0
    tmp = df[df.q_id == d].groupby(['mention_id', 'category']).size().reset_index()
    print(d, len(tmp[tmp[0] > 1]))

economic_attributes 0
non-economic_attributes 0
stance 0
universal_attributes 0


In [277]:
# mentions whose non-economic rows lack the category 'shared values/mentalities' get an
# explicit 'No' row for it
tmp = (
    df[df.q_id == 'non-economic_attributes']
    .groupby('mention_id')
    .agg({'category': lambda x: 'shared values/mentalities' in x.tolist()})
    .reset_index()
)
tmp = tmp.loc[~tmp.category, ['mention_id']].assign(
    q_id='non-economic_attributes',
    category='shared values/mentalities',
    q_category=float(nonecon_cat2id['shared values/mentalities']),
    source='expert',
    label='No',
)

df = (
    pd.concat([df, tmp], ignore_index=True)
    .sort_values(key_cols)
    .reset_index(drop=True)
)

In [278]:
# Row counts per non-economic category.
df[df.q_id == 'non-economic_attributes'].category.value_counts()

category
age                          300
crime                        300
ethnicity                    300
family                       300
gender/sexuality             300
health                       300
nationality                  300
other                        300
place/location               300
religion                     300
shared values/mentalities    300
Name: count, dtype: int64

In [279]:
# check ---
for d in df.q_id.unique(): 
    print(d, ':', df[df.q_id == d].groupby('mention_id').ngroups)

economic_attributes : 300
non-economic_attributes : 300
stance : 300
universal_attributes : 300


In [280]:
# Row counts per question type; dropna=False also counts rows with a missing q_id.
df.q_id.value_counts(dropna=False)

q_id
non-economic_attributes    3300
economic_attributes        2100
stance                      300
universal_attributes        300
Name: count, dtype: int64

In [281]:
# per mention and non-stance question type: does any row carry the label 'Yes'?
# Result is one row per mention with one boolean column per question type.
agg = (
    df[df.q_id != 'stance']
    .groupby(['mention_id', 'q_id'])
    .agg({'label': lambda x: 'Yes' in x.tolist(), 'source': 'max'})
    .reset_index()
    .pivot(index='mention_id', columns='q_id', values='label')
    .reset_index()
)
agg.columns.name = None

In [282]:
# Count mentions per combination of the three boolean flags (universal / economic / non-economic).
agg[['universal_attributes', 'economic_attributes', 'non-economic_attributes']].value_counts(dropna=False, sort=False)

universal_attributes  economic_attributes  non-economic_attributes
False                 False                True                       145
                      True                 False                       99
                                           True                        15
True                  False                False                       41
Name: count, dtype: int64

In [283]:
# Mentions flagged as universal that also have a 'Yes' economic attribute.
agg[np.logical_and(agg.universal_attributes, agg.economic_attributes)] # should be empty

Unnamed: 0,mention_id,economic_attributes,non-economic_attributes,universal_attributes


In [284]:
# Mentions flagged as universal that also have a 'Yes' non-economic attribute.
agg[np.logical_and(agg.universal_attributes, agg['non-economic_attributes'])] # should be empty

Unnamed: 0,mention_id,economic_attributes,non-economic_attributes,universal_attributes


## Output combined

In [285]:
# Row counts per (q_id, category, q_category) combination, in key order.
df[['q_id', 'category', 'q_category']].value_counts(dropna=False, sort=False)

q_id                     category                       q_category
economic_attributes      class membership                1.0          300
                         ecology of group                6.0          300
                         education level                 3.0          300
                         employment status               2.0          300
                         income/wealth/economic status   4.0          300
                         occupation/profession           5.0          300
                         other                           7.0          300
non-economic_attributes  age                             1.0          300
                         crime                           9.0          300
                         ethnicity                       6.0          300
                         family                          2.0          300
                         gender/sexuality                3.0          300
                         health              

In [286]:
# Overall label distribution across all question types.
df.label.value_counts()

label
No          5353
Yes          347
Positive     242
Negative      34
Neutral       23
Unsure         1
Name: count, dtype: int64

In [287]:
# Number of 'Yes' labels per question type and category.
df.loc[df.label == 'Yes', ['q_id', 'q_category', 'category']].value_counts(dropna=False, sort=False)

q_id                     q_category  category                     
economic_attributes       1.0        class membership                  8
                          2.0        employment status                26
                          3.0        education level                   5
                          4.0        income/wealth/economic status    26
                          5.0        occupation/profession            50
                          6.0        ecology of group                 11
non-economic_attributes   1.0        age                              43
                          2.0        family                           20
                          3.0        gender/sexuality                  8
                          4.0        place/location                    7
                          5.0        nationality                      46
                          6.0        ethnicity                         3
                          7.0        religion            

In [288]:
# Label distribution of the stance question.
df.loc[df.q_id == 'stance', ['q_id', 'label']].value_counts(dropna=False, sort=False)

q_id    label   
stance  Negative     34
        Neutral      23
        Positive    242
        Unsure        1
Name: count, dtype: int64

In [289]:
# number of distinct question types each mention has rows for
idxs = df.groupby(['mention_id']).agg({'q_id': 'nunique'})
idxs.q_id.value_counts() # all 300 should be 4

q_id
4    300
Name: count, dtype: int64

In [290]:
# Load the mention texts with their surrounding context and build the mention_id key
# ("<text_id>-<mention_nr>") used throughout this notebook.
# NOTE(review): this path climbs three directory levels while data_path climbs two —
# confirm both resolve from the same working directory.
fp = '../../../data/intermediate/social_group_mentions_ranked.tsv'
texts = (
    pd.read_csv(fp, sep='\t')
    .assign(mention_id=lambda d: d.text_id.astype(str) + '-' + d.mention_nr.astype(str))
    .loc[:, ['mention_id', 'text', 'mention', 'prev_texts', 'next_texts']]
)

In [291]:
# Attach the sentence text, mention string and surrounding context to every annotation row.
# No merge indicator is requested: it was never inspected and only added a stray
# `_merge` column to df.
df = df.merge(texts, on='mention_id', how='left')

In [292]:
# final column layout: text/context columns first, then the annotation columns
cols = [*texts.columns, 'q_id', 'q_category', 'category', 'label', 'source']
df = df.loc[:, cols]

In [231]:
# Write the consolidated annotations as TSV. Unlike the exports guarded by
# os.path.exists elsewhere in this notebook, this overwrites an existing file.
# NOTE(review): this cell's execution count (231) is lower than that of the surrounding
# cells — confirm the file on disk was written after the cells above were last run.
fp = os.path.join(data_path, 'parsed',  'consolidated_annotations.tsv')
df.to_csv(fp, sep='\t', index=False)

In [295]:
# Preview the first two rows of the consolidated table.
df.head(2)

Unnamed: 0,mention_id,text,mention,prev_texts,next_texts,q_id,q_category,category,label,source
0,11110_198809-390636-1,Give parents the right to become municipal day...,parents,Extend parental insurance to two years with th...,Immediate legal right to reduce working hours ...,economic_attributes,1.0,class membership,No,coders
1,11110_198809-390636-1,Give parents the right to become municipal day...,parents,Extend parental insurance to two years with th...,Immediate legal right to reduce working hours ...,economic_attributes,6.0,ecology of group,No,coders


In [318]:
# in wider format

# work on a copy; give 'other' the ID 99.0 so that it sorts last when categories are
# ordered by q_category
tmp = df.copy(deep=True)
tmp['q_category'] = tmp['q_category'].where(tmp['category'] != 'other', 99.0)

# concate values in category where label=='Yes' into string of comma separated values, sorting by q_category
def concat(x):
    """Collapse the rows of one (mention, question) group into a single string.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; must provide the columns 'category', 'label' and
        'q_category'.

    Returns
    -------
    The label of the first row when the group has at most one distinct category
    (including the case where every category is missing). Otherwise the names of
    all categories labelled 'Yes', ordered by 'q_category' and joined with ', ';
    the empty string if no category is labelled 'Yes'.
    """
    # single-category groups carry exactly one label, which is passed through unchanged
    if x.category.nunique() == 1 or x.category.isna().all():
        return x.label.iloc[0]
    # sort_values returns a new frame: the input is never modified and no in-place
    # operation is performed on a filtered slice (which raises SettingWithCopyWarning)
    positives = x[x.label == 'Yes'].sort_values('q_category')
    if positives.empty:
        return ''
    return ', '.join(positives.category)
    
cols = ['mention_id', 'text', 'mention', 'prev_texts', 'next_texts']

# Collapse every (mention, question) group into one label string.
# * dropna=False keeps mentions whose text/context columns are missing; by default
#   groupby silently drops every group that has a NaN in one of its keys.
# * Selecting the payload columns keeps the grouping keys out of the frame handed to
#   `concat`; applying over the grouping columns is deprecated in newer pandas versions.
df_wider = (
    tmp.groupby(cols + ['q_id'], dropna=False)[['category', 'label', 'q_category']]
    .apply(concat)
    .reset_index()
    .rename(columns={0: 'labels'})
)

# pivot wider: one row per mention, one column per q_id holding its label string
df_wider = df_wider.pivot(index=cols, columns='q_id', values='labels').reset_index()
df_wider.columns.name = None

# fixed column order: context columns first, then the four question types
cols += ['universal_attributes', 'economic_attributes', 'non-economic_attributes', 'stance']
df_wider = df_wider[cols]

df_wider.head(2)

  df_wider = tmp.groupby(cols).apply(concat).reset_index().rename(columns={0: 'labels'})


Unnamed: 0,mention_id,text,mention,prev_texts,next_texts,universal_attributes,economic_attributes,non-economic_attributes,stance
0,11110_198809-390636-1,Give parents the right to become municipal day...,parents,Extend parental insurance to two years with th...,Immediate legal right to reduce working hours ...,No,,family,Positive
1,11110_199109-390940-1,It is only within the ecological framework tha...,a society for survival in prosperity and well-...,Congress adopted this election manifesto on Ma...,Ecological integrity must be the basis of all ...,Yes,,,Positive
