# Training a model for predicting annotations

**This notebook produces the ground truth datasets (train/dev/test) for training the mBERT, XLM-RoBERTa, and RobBERT models.**

## Making Ground Truth 

In [1]:
import pandas as pd

# Majority-vote sentence annotations; this file is produced in notebook 2.
anno_file = '../data/review_sentence_classification-majority_vote.tsv'

# The file is tab-separated; show the first two rows to check the load.
df = pd.read_csv(anno_file, delimiter='\t')
df.head(n=2)

Unnamed: 0,review_id,sent_num,sent_offset,sent_end,sent_text,Author,Classification,Content,Content--Narrative,Content--Other,...,Reader_response--Feelings,Reader_response--Identification_and_immersion,Reader_response--Reading_Context,Reader_response--Reception,Reader_response--Reflection,Recommendations,Style,Style--Context,Style--Structure,Style--Stylistic_features
0,impfic-review-10274,1,0,78,Jan van Mersbergen (1971) publiceerde sinds 20...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,impfic-review-10274,2,79,218,op 11 november 2011 verscheen 'Naar de overkan...,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


## Prepare input file

This should be a tab-separated text file with one sentence per line and the labels in one or more separate columns; when a single column holds several labels, they are joined with `|`. Each label column is a learning task.

In [2]:
# Flatten every sentence onto a single line (the output files hold one
# sentence per line). The vectorised `.str` accessor does a literal, non-regex
# replacement and propagates missing values instead of raising on them.
df['sent_text_clean'] = df['sent_text'].str.replace('\n', ' ', regex=False)

In [3]:
def map_cats(row, cats):
    """Build a lower-cased, ``|``-separated label string for one row.

    Args:
        row: Record indexable by category name (e.g. a DataFrame row).
        cats: Category names to inspect, in output order.

    Returns:
        The names in ``cats`` whose value in ``row`` equals 1, joined with
        ``|`` and lower-cased; ``'none'`` when no category is set.
    """
    active = [cat for cat in cats if row[cat] == 1]
    # Fall back to the literal label 'None' when nothing is switched on.
    return '|'.join(active or ['None']).lower()

    
# Top-level annotation categories.
main_cats = [
    'Author', 'Classification', 'Content',
    'Other_works', 'Reader_response',
    'Recommendations', 'Style',
]

# Sub-categories, named '<main category>--<sub category>'.
sub_cats = [
    'Content--Narrative',
    'Content--Other', 'Content--Quote', 'Content--Theme',
    'Reader_response--Evaluation_of_quality', 'Reader_response--Feelings',
    'Reader_response--Identification_and_immersion',
    'Reader_response--Reading_Context', 'Reader_response--Reception',
    'Reader_response--Reflection',
    'Style--Context', 'Style--Structure', 'Style--Stylistic_features'
]

all_cats = main_cats + sub_cats

# Identity mapping from category name to label name. It is derived from
# `all_cats` so it cannot drift from the lists above: the hand-written dict
# that used to live here was missing 'Style--Stylistic_features'.
cat_map = {cat: cat for cat in all_cats}

In [17]:
def map_cat_short(cat):
    short = cat[:3].lower()
    if '--' in cat:
        short = f"{short}_{cat.split('--')[-1][:3].lower()}"
    return short


def map_binary_cat(row, columns, cats):
    """Build the list of output values for one row, one entry per column.

    Args:
        row: Record indexable by column name.
        columns: Column names to emit, in order.
        cats: Names of the columns that are categories.

    Returns:
        A list with, for each field in ``columns``: the short code from
        ``map_cat_short(field)`` when the field is in ``cats`` (the row's own
        value for that field is not consulted), otherwise ``row[field]``.
    """
    return [
        map_cat_short(field) if field in cats else row[field]
        for field in columns
    ]

cols = list(df.columns)

# Short code per category, e.g. 'Content--Narrative' -> 'con_nar'.
short_cat_map = {cat: map_cat_short(cat) for cat in all_cats}

# Add one string column per category: the category's short code where the
# original column equals 1, the empty string everywhere else.
for cat, short in short_cat_map.items():
    df[short] = df[cat].apply(lambda flag, label=short: label if flag == 1 else '')

In [20]:
# Names of the short-code label columns, in `all_cats` order.
short_cats = [short_cat_map[cat] for cat in short_cat_map]


In [84]:


# NOTE(review): `cat_cols` was not defined anywhere in the visible notebook, so
# this cell raised a NameError on a fresh kernel. It is assumed to be the full
# list of annotation columns — TODO confirm.
cat_cols = all_cats

# One lower-cased, `|`-separated string of main categories per sentence
# ('none' when no main category is set).
df['cat_string'] = df.apply(lambda row: map_cats(row, main_cats), axis=1)

In [85]:
# Number of category columns set per sentence.
# NOTE(review): the original summed over `cat_cols`, which is not defined in the
# visible notebook; `all_cats` is assumed to be the intended list — TODO confirm.
df[all_cats].sum(axis=1)

0        1
1        2
2        2
3        2
4        2
        ..
11629    4
11630    2
11631    5
11632    1
11633    0
Length: 11634, dtype: int64

In [86]:
# Display the annotated frame, which at this point also carries the
# `sent_text_clean` and `cat_string` columns added above.
df

Unnamed: 0,review_id,sent_num,sent_offset,sent_end,sent_text,Author,Classification,Content,Content--Narrative,Content--Other,...,Reader_response--Reading_Context,Reader_response--Reception,Reader_response--Reflection,Recommendations,Style,Style--Context,Style--Structure,Style--Stylistic_features,sent_text_clean,cat_string
0,impfic-review-10274,1,0,78,Jan van Mersbergen (1971) publiceerde sinds 20...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Jan van Mersbergen (1971) publiceerde sinds 20...,author
1,impfic-review-10274,2,79,218,op 11 november 2011 verscheen 'Naar de overkan...,0,0,1,1,0,...,0,0,0,0,0,0,0,0,op 11 november 2011 verscheen 'Naar de overkan...,content
2,impfic-review-10274,3,219,387,"Hij ontmoet dwazen en wijzen, leert de ware aa...",0,0,1,1,0,...,0,0,0,0,0,0,0,0,"Hij ontmoet dwazen en wijzen, leert de ware aa...",content
3,impfic-review-10274,4,388,491,"Zo ontdekt hij zijn ware identiteit, geeft hij...",0,0,1,1,0,...,0,0,0,0,0,0,0,0,"Zo ontdekt hij zijn ware identiteit, geeft hij...",content
4,impfic-review-10274,5,492,732,"Zijn verleden als schipperskind, zijn heden al...",0,0,1,1,0,...,0,0,0,0,0,0,0,0,"Zijn verleden als schipperskind, zijn heden al...",content
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11629,impfic-review-84086,4,242,355,Cia is een meisje wat goed uitgeschreven wordt...,0,0,1,1,0,...,0,0,0,0,0,0,0,0,Cia is een meisje wat goed uitgeschreven wordt...,content|reader_response
11630,impfic-review-84086,5,356,446,Ik denk dat ik niet veel anders zou doen dan h...,0,0,0,0,0,...,0,0,1,0,0,0,0,0,Ik denk dat ik niet veel anders zou doen dan h...,reader_response
11631,impfic-review-84086,6,447,544,Ik kon me weer lekker laten gaan tijdens het l...,0,0,1,1,0,...,0,0,0,0,0,0,0,0,Ik kon me weer lekker laten gaan tijdens het l...,content|reader_response
11632,impfic-review-84086,7,545,572,Ik ga nu verder met deel 2!,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Ik ga nu verder met deel 2!,other_works


In [87]:
# Sentences with more than one main category: their label string contains
# the `|` joiner (matched literally, not as a regular expression).
df[df['cat_string'].str.contains('|', regex=False)]

Unnamed: 0,review_id,sent_num,sent_offset,sent_end,sent_text,Author,Classification,Content,Content--Narrative,Content--Other,...,Reader_response--Reading_Context,Reader_response--Reception,Reader_response--Reflection,Recommendations,Style,Style--Context,Style--Structure,Style--Stylistic_features,sent_text_clean,cat_string
5,impfic-review-10274,6,733,793,Van Mersbergen maakt de dronkenschap voelbaar ...,1,0,0,0,0,...,0,0,0,0,1,0,0,1,Van Mersbergen maakt de dronkenschap voelbaar ...,author|style
6,impfic-review-10274,7,794,881,"Hij schreef een knap geconstrueerde roman, zo ...",0,0,0,0,0,...,0,0,0,0,1,0,0,1,"Hij schreef een knap geconstrueerde roman, zo ...",reader_response|style
7,impfic-review-10274,8,882,992,alle details kloppen en hebben een literaire b...,1,0,0,0,0,...,0,0,0,0,1,0,0,1,alle details kloppen en hebben een literaire b...,author|reader_response|style
10,impfic-review-10429,1,0,161,Tweede roman van de Nederlandse schrijver (198...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Tweede roman van de Nederlandse schrijver (198...,author|other_works
12,impfic-review-10429,3,271,469,Hoofdpersoon is de student Philip Hofman ‚Äì Huf...,1,0,1,1,0,...,0,0,0,0,0,0,0,0,Hoofdpersoon is de student Philip Hofman ‚Äì Huf...,author|content|reader_response
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11623,impfic-review-83874,22,2989,3095,Er zijn aan het eind van het verhaal nog enkel...,0,0,1,1,0,...,0,0,1,0,0,0,0,0,Er zijn aan het eind van het verhaal nog enkel...,content|reader_response
11624,impfic-review-83874,23,3096,3222,Het is eigenlijk een boek als geen ander en da...,0,1,0,0,0,...,0,0,1,0,0,0,0,0,Het is eigenlijk een boek als geen ander en da...,classification|reader_response
11626,impfic-review-84086,1,0,77,Het andere boek wat ik gelezen heb van deze sc...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Het andere boek wat ik gelezen heb van deze sc...,author|other_works
11629,impfic-review-84086,4,242,355,Cia is een meisje wat goed uitgeschreven wordt...,0,0,1,1,0,...,0,0,0,0,0,0,0,0,Cia is een meisje wat goed uitgeschreven wordt...,content|reader_response


## Making Train/Dev/Test Split

In [21]:
# Seed passed to `DataFrame.sample` below so the splits are reproducible.
random_state = 98392

In [22]:

# Hold out one tenth of the sentences for test and another tenth for dev;
# everything that is left becomes the training set.
total = len(df)
chunk_size = total // 10

test_df = df.sample(chunk_size, random_state=random_state)
rest_df = df[~df.index.isin(test_df.index)]
dev_df = rest_df.sample(chunk_size, random_state=random_state)
train_df = rest_df[~rest_df.index.isin(dev_df.index)]

# Sanity check: the three parts must partition `df` (this relies on the index
# being unique, since membership is decided by index label).
assert len(df) == len(test_df) + len(dev_df) + len(train_df)
df.shape, test_df.shape, dev_df.shape, train_df.shape


((11634, 47), (1163, 47), (1163, 47), (9308, 47))

In [89]:
# Write the complete (unsplit) data set for the single-task setup: the cleaned
# sentence text plus the `|`-joined main-category string, with a header row.
df.to_csv(
    '../data/ground_truth-cats_main-task_type_single.tsv',
    columns=['sent_text_clean', 'cat_string'],
    sep='\t',
    index=False,
)

In [26]:
# Multi-task files for the current split: cleaned sentence text followed by one
# short-code column per category, written without header or index.
multi_task_outputs = {
    '../data/ground_truth/ground_truth_train-cats_all-task_type_multi.tsv': train_df,
    '../data/ground_truth/ground_truth_dev-cats_all-task_type_multi.tsv': dev_df,
    '../data/ground_truth/ground_truth_test-cats_all-task_type_multi.tsv': test_df,
}
for out_path, split_df in multi_task_outputs.items():
    split_df[['sent_text_clean'] + short_cats].to_csv(
        out_path, sep='\t', index=False, header=False
    )


### Different Split Ratios

In [88]:
# Same seed as the earlier split; it is passed to `DataFrame.sample` below.
random_state = 98392

In [89]:
# Total number of annotated sentences available for splitting.
len(df)

11634

In [91]:
# One fifth of the sentences for test, one fifth for dev, the rest for train.
total = len(df)
chunk_size = total // 5

test_df = df.sample(n=chunk_size, random_state=random_state)
rest_df = df.loc[~df.index.isin(test_df.index)]
dev_df = rest_df.sample(n=chunk_size, random_state=random_state)
train_df = rest_df.loc[~rest_df.index.isin(dev_df.index)]

# The three parts together must account for every row of `df`.
assert len(test_df) + len(dev_df) + len(train_df) == len(df)
(df.shape, test_df.shape, dev_df.shape, train_df.shape)


((11634, 48), (2326, 48), (2326, 48), (6982, 48))

In [104]:
#test_df

In [101]:
from itertools import combinations

# Distinct raw sentence texts per split.
split_frames = {'test': test_df, 'train': train_df, 'dev': dev_df}
sent_set = {name: set(frame.sent_text) for name, frame in split_frames.items()}

# For every pair of splits, report the sentence texts that occur in both.
for s1, s2 in combinations(sent_set, 2):
    overlap = sent_set[s1] & sent_set[s2]
    num_overlap = len(overlap)
    print(f"overlap in sentences between {s1} and {s2}: {num_overlap}")
    print(overlap)
    print('\n')

overlap in sentences between test and train: 14
{'Erg jammer.', 'Voor een grote lezerskring.', 'Vanaf ca. 15 jaar.', 'Mijn mening:', 'Het verhaal:', '"', 'Vrij kleine druk.', 'Kleine druk.', 'Een aanrader!', 'Zeker een aanrader.', 'Kortom:', 'Samenvatting:', 'Normale druk.', 'Paperback met normale druk.'}


overlap in sentences between test and dev: 10
{'Prachtig boek!', 'Vanaf ca. 15 jaar.', 'Een echte aanrader!', '"', 'Kleine druk.', 'Een aanrader!', 'Wat een heerlijk boek om te lezen!', 'Kortom:', 'Normale druk.', 'Paperback met normale druk.'}


overlap in sentences between train and dev: 16
{'Jammer.', 'Eerste deel van een serie.', 'Vanaf ca. 13 jaar.', 'Vanaf ca. 15 jaar.', '"', 'Conclusie', '‚ù§Ô∏è', 'Een aanrader.', 'Kleine druk.', 'Een aanrader!', 'Ja.', 'Kortom:', '.', 'Normale druk.', 'Echt een aanrader!', 'Paperback met normale druk.'}




In [29]:
# Multi-task files for the 0.2 split: cleaned sentence text followed by one
# short-code column per category, written without header or index.
split_0_2_outputs = {
    '../data/ground_truth/ground_truth_train-cats_all-task_type_multi-split_0.2.tsv': train_df,
    '../data/ground_truth/ground_truth_dev-cats_all-task_type_multi-split_0.2.tsv': dev_df,
    '../data/ground_truth/ground_truth_test-cats_all-task_type_multi-split_0.2.tsv': test_df,
}
for out_path, split_df in split_0_2_outputs.items():
    split_df[['sent_text_clean'] + short_cats].to_csv(
        out_path, sep='\t', index=False, header=False
    )


### Cross Validation

In [82]:
import random

random_state = 98392
total = len(df)

num_folds = 10
chunk_size = int(total / num_folds)

# Start with `chunk_size` slots for each fold, in fold order ...
fold_assignments = [
    fold_idx for fold_idx in range(num_folds) for _ in range(chunk_size)
]

# ... then hand the rows left over by the integer division to folds 0, 1, ...
missing = total - len(fold_assignments)
print(f"missing: {missing}")
fold_assignments += list(range(missing))

# Shuffle with a dedicated, seeded generator and attach one fold id per row.
random.Random(random_state).shuffle(fold_assignments)
df['fold'] = fold_assignments

missing: 4


In [83]:
# Sanity check: select rows whose fold id is above 9, the highest id a
# 10-fold assignment should produce.
df.loc[df['fold'] > 9]

Unnamed: 0,review_id,sent_num,sent_offset,sent_end,sent_text,Author,Classification,Content,Content--Narrative,Content--Other,...,rea_eva,rea_fee,rea_ide,rea_rea,rea_rec,rea_ref,sty_con,sty_str,sty_sty,fold


In [84]:
# Number of sentences assigned to each fold.
df['fold'].value_counts()

fold
2    1164
1    1164
0    1164
3    1164
5    1163
7    1163
4    1163
9    1163
6    1163
8    1163
Name: count, dtype: int64

In [85]:
import os

# Export one train/dev/test triple per fold: fold `k` is the test set, the
# next fold (wrapping around) is the dev set, all remaining folds are train.
fold_dir = f'../data/ground_truth/folds_{num_folds}/'
split_label = f"{1/num_folds:.1f}"

for fold in range(num_folds):
    test_fold = fold
    dev_fold = (fold+1) % num_folds
    used = [test_fold, dev_fold]
    train_folds = [f for f in range(num_folds) if f not in used]
    print(test_fold, dev_fold, train_folds)

    # makedirs creates missing parent directories too and tolerates an
    # existing directory, so the cell also works on a clean checkout.
    split_dir = os.path.join(fold_dir, f"split_{split_label}-fold_{fold}")
    os.makedirs(split_dir, exist_ok=True)

    test_df = df[df.fold == test_fold]
    dev_df = df[df.fold == dev_fold]
    train_df = df[df.fold.isin(train_folds)]
    print(fold, len(test_df), len(dev_df), len(train_df))

    # Same layout for all three parts: cleaned sentence text followed by the
    # short-code label columns, without header or index.
    for split_name, split_df in (('train', train_df), ('dev', dev_df), ('test', test_df)):
        out_fname = f'ground_truth_{split_name}-cats_all-task_type_multi-split_{split_label}.tsv'
        out_filepath = os.path.join(split_dir, out_fname)
        split_df[['sent_text_clean'] + short_cats].to_csv(
            out_filepath, sep='\t', index=False, header=False
        )


0 1 [2, 3, 4, 5, 6, 7, 8, 9]
0 1164 1164 9306
1 2 [0, 3, 4, 5, 6, 7, 8, 9]
1 1164 1164 9306
2 3 [0, 1, 4, 5, 6, 7, 8, 9]
2 1164 1164 9306
3 4 [0, 1, 2, 5, 6, 7, 8, 9]
3 1164 1163 9307
4 5 [0, 1, 2, 3, 6, 7, 8, 9]
4 1163 1163 9308
5 6 [0, 1, 2, 3, 4, 7, 8, 9]
5 1163 1163 9308
6 7 [0, 1, 2, 3, 4, 5, 8, 9]
6 1163 1163 9308
7 8 [0, 1, 2, 3, 4, 5, 6, 9]
7 1163 1163 9308
8 9 [0, 1, 2, 3, 4, 5, 6, 7]
8 1163 1163 9308
9 0 [1, 2, 3, 4, 5, 6, 7, 8]
9 1163 1164 9307


In [None]:
# Re-create a random test/dev/train split.
# NOTE(review): `chunk_size` and `random_state` are not set in this cell; they
# keep whatever value the most recently executed earlier cell gave them.
test_df = df.sample(n=chunk_size, random_state=random_state)
rest_df = df.loc[~df.index.isin(test_df.index)]
dev_df = rest_df.sample(n=chunk_size, random_state=random_state)
train_df = rest_df.loc[~rest_df.index.isin(dev_df.index)]
(df.shape, test_df.shape, dev_df.shape, train_df.shape)


### Single Task and Main-only

In [90]:


# Single-task files per split: cleaned sentence text plus the `|`-joined
# main-category string, written with a header row.
single_task_outputs = {
    '../data/ground_truth_test-cats_main-task_type_single.tsv': test_df,
    '../data/ground_truth_dev-cats_main-task_type_single.tsv': dev_df,
    '../data/ground_truth_train-cats_main-task_type_single.tsv': train_df,
}
for out_path, split_df in single_task_outputs.items():
    split_df[['sent_text_clean', 'cat_string']].to_csv(out_path, sep='\t', index=False)


In [91]:
# Multi-task files per split using the original category columns (full names),
# written with a header row.
full_name_outputs = {
    '../data/ground_truth_test-cats_all-task_type_multi.tsv': test_df,
    '../data/ground_truth_dev-cats_all-task_type_multi.tsv': dev_df,
    '../data/ground_truth_train-cats_all-task_type_multi.tsv': train_df,
}
for out_path, split_df in full_name_outputs.items():
    split_df[['sent_text_clean'] + all_cats].to_csv(out_path, sep='\t', index=False)


In [105]:
# Per category in the dev set: the sum of its column and that sum as a
# fraction of the number of dev sentences.
cat_totals = dev_df[all_cats].sum()
pos_freq = cat_totals.rename('pos_freq')
pos_frac = cat_totals.div(len(dev_df)).rename('pos_frac')
pd.concat([pos_freq, pos_frac], axis=1)

Unnamed: 0,pos_freq,pos_frac
Author,246,0.105761
Classification,73,0.031384
Content,1231,0.529235
Other_works,119,0.051161
Reader_response,1168,0.50215
Recommendations,53,0.022786
Style,232,0.099742
Content--Narrative,1057,0.454428
Content--Other,55,0.023646
Content--Quote,59,0.025365


In [93]:
# Total number of annotated sentences, for reference next to the split sizes.
len(df)

11634

In [94]:
# Per category: the column sum over the dev set, and the number of dev
# sentences minus that sum.
pos = dev_df[all_cats].sum()
neg = pos.rsub(len(dev_df))
neg

Author                                           1026
Classification                                   1133
Content                                           539
Other_works                                      1089
Reader_response                                   596
Recommendations                                  1142
Style                                            1050
Content--Narrative                                637
Content--Other                                   1132
Content--Quote                                   1126
Content--Theme                                   1142
Reader_response--Evaluation_of_quality            862
Reader_response--Feelings                        1083
Reader_response--Identification_and_immersion    1130
Reader_response--Reading_Context                 1103
Reader_response--Reception                       1158
Reader_response--Reflection                      1075
Style--Context                                   1160
Style--Structure            

In [95]:
# `neg` expressed as a fraction of the number of dev sentences.
neg.div(len(dev_df))

Author                                           0.882201
Classification                                   0.974205
Content                                          0.463457
Other_works                                      0.936371
Reader_response                                  0.512468
Recommendations                                  0.981943
Style                                            0.902837
Content--Narrative                               0.547721
Content--Other                                   0.973345
Content--Quote                                   0.968186
Content--Theme                                   0.981943
Reader_response--Evaluation_of_quality           0.741187
Reader_response--Feelings                        0.931212
Reader_response--Identification_and_immersion    0.971625
Reader_response--Reading_Context                 0.948409
Reader_response--Reception                       0.995701
Reader_response--Reflection                      0.924334
Style--Context

In [9]:
# Look up the full record(s) whose cleaned text contains a given phrase.
has_phrase = df['sent_text_clean'].str.contains('volledig uitgewerkt', regex=False)
df[has_phrase].to_dict(orient='records')

[{'review_id': 'impfic-review-206525',
  'sent_num': 2,
  'sent_offset': 172,
  'sent_end': 233,
  'sent_text': 'ü§∑üèª\u200d‚ôÄÔ∏èü§® is al het 2de boek dat niet volledig uitgewerkt is.\nIk',
  'Author': 0,
  'Classification': 0,
  'Content': 0,
  'Content--Narrative': 0,
  'Content--Other': 0,
  'Content--Quote': 0,
  'Content--Theme': 0,
  'None': 0,
  'Other_works': 0,
  'Reader_response': 1,
  'Reader_response--Evaluation_of_quality': 1,
  'Reader_response--Feelings': 0,
  'Reader_response--Identification_and_immersion': 0,
  'Reader_response--Reading_Context': 0,
  'Reader_response--Reception': 0,
  'Reader_response--Reflection': 0,
  'Recommendations': 0,
  'Style': 0,
  'Style--Context': 0,
  'Style--Structure': 0,
  'Style--Stylistic_features': 0,
  'sent_text_clean': 'ü§∑üèª\u200d‚ôÄÔ∏èü§® is al het 2de boek dat niet volledig uitgewerkt is. Ik',
  'cat_string': 'reader_response'}]

## Preparing configuration files

In [28]:
import json

# Existing configuration (main categories only) that serves as the template.
config_file = 'machamp_config-cat_all-task_type_main.json'

with open(config_file, mode='rt') as config_fh:
    config = json.load(config_fh)

config

{'MAIN_MULTITASK_SINGLE_CLASS': {'train_data_path': '/Users/marijnkoolen/Code/Papers/2025/CHR-2025-review-composition/data/ground_truth_train-cats_all-task_type_multi.tsv',
  'dev_data_path': '/Users/marijnkoolen/Code/Papers/2025/CHR-2025-review-composition/data/ground_truth_dev-cats_all-task_type_multi.tsv',
  'sent_idxs': [0],
  'tasks': {'author': {'task_type': 'classification', 'column_idx': 1},
   'classification': {'task_type': 'classification', 'column_idx': 2},
   'content': {'task_type': 'classification', 'column_idx': 3},
   'other_works': {'task_type': 'classification', 'column_idx': 4},
   'reader_response': {'task_type': 'classification', 'column_idx': 5},
   'recommendation': {'task_type': 'classification', 'column_idx': 6},
   'style': {'task_type': 'classification', 'column_idx': 7}}}}

In [36]:
# Key of the entry in the loaded config, and the key that entry gets in the
# new config.
old_name = 'MAIN_MULTITASK_SINGLE_CLASS'
new_name = 'ALL_CAT_MULTITASK_SINGLE_CLASS'
# Show which settings the existing entry defines.
config[old_name].keys()

dict_keys(['train_data_path', 'dev_data_path', 'sent_idxs', 'tasks'])

In [35]:
import copy

# Start the new configuration from an independent copy of the existing entry.
# Without the deep copy, `new_config[new_name]` is the very same dict object
# as `config[old_name]`, so editing the new config (for example replacing its
# 'tasks') would silently change the loaded `config` as well.
new_config = {
    new_name: copy.deepcopy(config[old_name])
}

new_config

{'ALL_CAT_MULTITASK_SINGLE_CLASS': {'train_data_path': '/Users/marijnkoolen/Code/Papers/2025/CHR-2025-review-composition/data/ground_truth_train-cats_all-task_type_multi.tsv',
  'dev_data_path': '/Users/marijnkoolen/Code/Papers/2025/CHR-2025-review-composition/data/ground_truth_dev-cats_all-task_type_multi.tsv',
  'sent_idxs': [0],
  'tasks': {'author': {'task_type': 'classification', 'column_idx': 1},
   'classification': {'task_type': 'classification', 'column_idx': 2},
   'content': {'task_type': 'classification', 'column_idx': 3},
   'other_works': {'task_type': 'classification', 'column_idx': 4},
   'reader_response': {'task_type': 'classification', 'column_idx': 5},
   'recommendation': {'task_type': 'classification', 'column_idx': 6},
   'style': {'task_type': 'classification', 'column_idx': 7}}}}

In [41]:
# One classification task per category. Label columns are numbered from 1 in
# `all_cats` order (the loaded config uses 'sent_idxs': [0] for the sentence).
tasks = {
    cat: {'task_type': 'classification', 'column_idx': column_idx}
    for column_idx, cat in enumerate(all_cats, start=1)
}

new_config[new_name]['tasks'] = tasks

In [42]:
new_config

{'ALL_CAT_MULTITASK_SINGLE_CLASS': {'train_data_path': '/Users/marijnkoolen/Code/Papers/2025/CHR-2025-review-composition/data/ground_truth_train-cats_all-task_type_multi.tsv',
  'dev_data_path': '/Users/marijnkoolen/Code/Papers/2025/CHR-2025-review-composition/data/ground_truth_dev-cats_all-task_type_multi.tsv',
  'sent_idxs': [0],
  'tasks': {'Author': {'task_type': 'classification', 'column_idx': 1},
   'Classification': {'task_type': 'classification', 'column_idx': 2},
   'Content': {'task_type': 'classification', 'column_idx': 3},
   'Other_works': {'task_type': 'classification', 'column_idx': 4},
   'Reader_response': {'task_type': 'classification', 'column_idx': 5},
   'Recommendations': {'task_type': 'classification', 'column_idx': 6},
   'Style': {'task_type': 'classification', 'column_idx': 7},
   'Content--Narrative': {'task_type': 'classification', 'column_idx': 8},
   'Content--Other': {'task_type': 'classification', 'column_idx': 9},
   'Content--Quote': {'task_type': 'cla

In [43]:
# Save the all-category configuration as JSON.
new_config_file = 'machamp_config-cat_all-task_type_multi_all.json'

with open(new_config_file, mode='wt') as out_fh:
    json.dump(new_config, out_fh)