In [1]:
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

In [2]:
# Load the discourse-level training table: one row per discourse, including
# its essay_id, discourse_type, numeric `score` and `fold` columns.
train = pd.read_csv('../input/feedback-effective-folds/train_folds.csv')
# Preview the first rows.
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,score,fold
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,1,0
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,1,0
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,1,0
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,1,0
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,1,0


In [3]:
# Load the essay-level table: one row per essay_id with `score`, `group`
# and `fold` columns.
essays = pd.read_csv('../input/feedback-effective-folds/essay_scores.csv')
# Preview the first rows.
essays.head()

Unnamed: 0,essay_id,score,group,fold
0,F98E8D4EA700,0.777778,2,0
1,66BB82BD76B2,0.875,2,0
2,85F4C57672EA,0.923077,2,0
3,06936C8AA35D,1.666667,3,0
4,61C3ADEA1DD5,0.666667,2,0


In [4]:
# Map essay_id -> raw essay text. Every essay id seen in `train` starts out as
# "" so that an id without a file on disk still has an entry.
texts = {x: "" for x in train.essay_id.unique().tolist()}
for f in os.listdir('../input/feedback-prize-effectiveness/train'):
    # Decode explicitly as UTF-8: the character offsets computed later must not
    # depend on whatever the platform's default locale encoding happens to be.
    with open(f'../input/feedback-prize-effectiveness/train/{f}', encoding='utf-8') as file:
        text = file.read()
    # The file name up to the first '.' is the essay id.
    essay_id = f.split('.')[0]
    texts[essay_id] = text

In [5]:
# Smallest and largest number of discourse rows belonging to a single essay.
discourses_per_essay = train.groupby('essay_id')['discourse_id'].count()
discourses_per_essay.min(), discourses_per_essay.max()

(1, 23)

In [6]:
# Name of the pretrained checkpoint whose tokenizer is used for the whole notebook.
checkpoint = 'allenai/longformer-base-4096'
# Alternative checkpoint, currently disabled:
# checkpoint = 'microsoft/deberta-v3-small'
# Load the tokenizer that matches the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
def get_start_and_end(essay_id):
    """Locate every discourse of one essay inside the essay's raw text.

    Reads the module-level ``train`` DataFrame and ``texts`` dict. Discourses
    are searched in the order they appear in ``train``; each search starts at
    the end of the previous discourse's span.

    Parameters
    ----------
    essay_id
        Key into ``texts`` and value matched against ``train.essay_id``.

    Returns
    -------
    list of tuple
        One ``(discourse_id, start, end)`` tuple per discourse row, in row
        order. ``start``/``end`` are character offsets into the essay text and
        ``end`` never exceeds ``len(text)``.

    Raises
    ------
    KeyError
        If ``essay_id`` is not a key of ``texts``.
    """
    # Filter the frame once and read both columns from the same subset.
    essay_rows = train[train.essay_id == essay_id]
    all_discourses = essay_rows['discourse_text'].values.tolist()
    all_ids = essay_rows['discourse_id'].values.tolist()
    text = texts[essay_id]

    discourse_id_2_startend = []
    end = 0
    for disc_id, discourse in zip(all_ids, all_discourses):
        # Search from the previous end without copying the tail of the text.
        start = text.find(discourse, end)
        # Special case if we can't find an exact match: drop one trailing
        # character at a time until some prefix of the discourse is found.
        # The empty prefix always matches at `end`, so the loop terminates.
        trim = -1
        while start == -1:
            start = text.find(discourse[:trim], end)
            trim -= 1
        # The span keeps the full discourse length even after a prefix match,
        # but is capped so it can never point past the end of the essay.
        end = min(start + len(discourse), len(text))
        discourse_id_2_startend.append((disc_id, start, end))
    return discourse_id_2_startend

In [8]:
from tqdm.auto import tqdm

# Flat list of (discourse_id, start, end) tuples gathered over every essay id
# listed in `essays`, with a progress bar over essays.
mappings = [
    span
    for esid in tqdm(essays.essay_id.unique().tolist())
    for span in get_start_and_end(esid)
]

  0%|          | 0/4191 [00:00<?, ?it/s]

In [9]:
# Sanity check: show the first (discourse_id, start, end) tuple and the total number of spans.
mappings[0], len(mappings)

(('3abf8b923b81', 0, 40), 36765)

In [10]:
# Number of discourse rows in `train`, for comparison with the number of spans found above.
len(train)

36765

In [11]:
# Lookup table: discourse_id -> (start, end) character span.
map_dict = {disc_id: (start, end) for disc_id, start, end in mappings}

In [12]:
# Attach each discourse's (start, end) span to its row; an id that is missing
# from map_dict raises KeyError.
train['start_end'] = train['discourse_id'].apply(map_dict.__getitem__)

In [13]:
# Integer id for every discourse type, numbered in order of first appearance in `train`.
type2id = {x: i for i,x in enumerate(train.discourse_type.unique().tolist())}
# 'Other' labels tokens that fall outside every discourse. Give it the next
# free id instead of a hard-coded 7, so it cannot collide with a real class
# when the number of discourse types is not exactly seven.
type2id.setdefault('Other', len(type2id))
type2id

{'Lead': 0,
 'Position': 1,
 'Claim': 2,
 'Evidence': 3,
 'Counterclaim': 4,
 'Rebuttal': 5,
 'Concluding Statement': 6,
 'Other': 7}

In [14]:
# Reverse lookup: class id -> discourse type name.
id2type = {type_id: type_name for type_name, type_id in type2id.items()}
# -100 is the label given to tokens with an empty offset span.
id2type[-100] = 'Mask'
id2type

{0: 'Lead',
 1: 'Position',
 2: 'Claim',
 3: 'Evidence',
 4: 'Counterclaim',
 5: 'Rebuttal',
 6: 'Concluding Statement',
 7: 'Other',
 -100: 'Mask'}

In [15]:
def process_essay(esid, max_length=1536):
    """Tokenize one essay and build token-level and discourse-level labels.

    Reads the module-level ``texts``, ``train``, ``essays``, ``tokenizer`` and
    ``type2id``.

    Parameters
    ----------
    esid
        Essay id: key into ``texts`` and value matched against the
        ``essay_id`` column of ``train`` and ``essays``.
    max_length : int
        Passed to the tokenizer as ``max_length`` together with
        ``truncation=True`` and ``padding='max_length'``.

    Returns
    -------
    dict
        ``essay_id``, ``fold`` (from ``essays``), the tokenizer's
        ``input_ids`` / ``attention_mask`` / ``offset_mapping``, three
        per-token lists (``token_class_labels``, ``token_scores_labels``,
        ``token_examples_mapping``) and two per-discourse lists
        (``examples_scores``, ``examples_classes``).

        Per-token values: a token with an empty offset span gets -100 in all
        three lists; a token lying fully inside a discourse span gets that
        discourse's class id, score and position within the essay; any other
        token gets ``type2id['Other']`` with -100 for score and position.

    Raises
    ------
    KeyError
        If ``esid`` is not in ``texts``.
    IndexError
        If ``esid`` has no row in ``essays``.
    """
    text = texts[esid]
    # Filter each frame once instead of once per column.
    essay_rows = train[train.essay_id == esid]
    fold = essays[essays.essay_id == esid].fold.values[0]
    tokens = tokenizer(text, return_offsets_mapping=True, max_length=max_length, truncation=True, padding='max_length')

    start_ends = essay_rows['start_end'].values.tolist()
    discourse_types = essay_rows['discourse_type'].values.tolist()
    scores = essay_rows['score'].values.tolist()
    discourses = list(zip(start_ends, discourse_types, scores))

    token_class_labels = []
    token_scores_labels = []
    token_examples_mapping = []

    for ts, te in tokens['offset_mapping']:
        if ts == te:
            # Empty span: masked out everywhere.
            labels = (-100, -100, -100)
        else:
            labels = None
            for i, ((ds, de), discourse_type, discourse_score) in enumerate(discourses):
                if ts >= ds and te <= de:
                    labels = (type2id[discourse_type], discourse_score, i)
                    # Stop at the first containing discourse. Without this, a
                    # token inside two overlapping spans was appended twice and
                    # the label lists drifted out of sync with input_ids.
                    break
            if labels is None:
                # Real token that is not fully inside any discourse span.
                labels = (type2id['Other'], -100, -100)
        class_label, score_label, example_index = labels
        token_class_labels.append(class_label)
        token_scores_labels.append(score_label)
        token_examples_mapping.append(example_index)

    # Discourse-level targets, in the same order as the rows in `train`.
    examples_scores = list(scores)
    examples_classes = [type2id[discourse_type] for discourse_type in discourse_types]

    # Best-effort consistency report: warn and carry on, as before, but with an
    # explicit check rather than asserts hidden behind a bare `except`.
    n_tokens = len(tokens['input_ids'])
    consistent = (
        len(token_class_labels) == n_tokens
        and len(token_scores_labels) == n_tokens
        and len(token_examples_mapping) == n_tokens
        and len(examples_scores) == len(examples_classes)
        and len(token_class_labels) == max_length
    )
    if not consistent:
        print('Wrong!!', esid)

    return {
        'essay_id': esid,
        'fold': fold,
        'input_ids': tokens['input_ids'],
        'attention_mask': tokens['attention_mask'],
        'offset_mapping': tokens['offset_mapping'],
        'token_class_labels': token_class_labels,
        'token_scores_labels': token_scores_labels,
        'token_examples_mapping': token_examples_mapping,
        'examples_scores': examples_scores,
        'examples_classes': examples_classes,
    }


In [16]:
from tqdm.auto import tqdm
# One processed record (dict) per essay id found in `train`, with a progress bar.
processed_data = [
    process_essay(esid)
    for esid in tqdm(train.essay_id.unique().tolist())
]

  0%|          | 0/4191 [00:00<?, ?it/s]

In [17]:
# Collect the per-essay dicts into a DataFrame: one row per essay, one column per dict key.
pdf = pd.DataFrame(processed_data)
# Display the frame.
pdf

Unnamed: 0,essay_id,fold,input_ids,attention_mask,offset_mapping,token_class_labels,token_scores_labels,token_examples_mapping,examples_scores,examples_classes
0,007ACE74B050,0,"[0, 30086, 6, 939, 437, 12370, 6, 939, 437, 16...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 2), (2, 3), (4, 5), (5, 7), (8, 1...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 1]","[0, 1, 2, 3, 4, 5, 3, 4, 6]"
1,00944C693682,4,"[0, 44222, 2838, 5, 9453, 9, 1677, 34, 1081, 8...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (3, 8), (9, 12), (13, 18), (1...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2]","[0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 6]"
2,00BD97EA4041,2,"[0, 31231, 7796, 1166, 5, 3722, 17528, 9, 521,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (7, 16), (17, 21), (22, 25), ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[1, 1, 1, 0, 1, 1, 1]","[0, 1, 2, 3, 3, 2, 6]"
3,00C6E82FE5BA,0,"[0, 100, 206, 14, 24, 1979, 75, 28, 923, 868, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 1), (2, 7), (8, 12), (13, 15), (1...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 1]","[1, 2, 3, 3, 2, 6]"
4,013B9AA6B9DB,0,"[0, 2264, 16, 14, 631, 15, 6507, 116, 50118, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (5, 7), (8, 12), (13, 18), (1...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 7, 1, 1, 1, 1, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, -100, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, -100, 1, 1, 1, ...","[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]","[0, 1, 3, 4, 5, 3, 4, 5, 3, 2, 6]"
...,...,...,...,...,...,...,...,...,...,...
4186,FDF0AEEB14C3,1,"[0, 27524, 7, 334, 7476, 64, 28, 1202, 13, 140...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 5), (6, 8), (9, 15), (16, 24), (2...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2]","[0, 1, 2, 3, 4, 3, 5, 3, 3, 2, 3, 4, 5, 3, 6]"
4187,FE3CA06DDCA1,1,"[0, 7608, 16, 24, 77, 951, 6990, 47, 13, 2949,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (4, 6), (7, 9), (10, 14), (15...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1, 1, 1, 1]","[0, 1, 2, 2, 3, 3, 6]"
4188,FEF42864AE28,0,"[0, 14229, 10, 251, 183, 23, 334, 6, 33, 47, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (7, 8), (9, 13), (14, 17), (1...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2]","[0, 1, 2, 2, 2, 3, 4, 5, 2, 3, 4, 3, 5, 6]"
4189,FF9E0379CD98,4,"[0, 6323, 334, 904, 7018, 4086, 2239, 25, 10, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (5, 11), (12, 17), (18, 22), ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]","[0, 1, 2, 2, 2, 2, 3, 2, 3, 2, 3, 4, 5, 3, 6, 4]"


In [18]:
# For every essay take the largest token -> discourse index, then show the
# smallest of those maxima. A result >= 0 means every essay has at least one
# token mapped to a discourse.
l = [max(pdf['token_examples_mapping'].loc[i]) for i in range(len(pdf))]
min(l)

0

In [19]:
import pickle
# Persist the processed frame to the working directory using the highest
# pickle protocol available.
with open('processed-longformer-1536.pickle', 'wb') as handle:
    pickle.dump(pdf, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Example of reading a pickle back (only unpickle files you trust):
# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)

In [20]:
# Spot check: length of the per-token score labels for the row labelled 4.
len(pdf.token_scores_labels.loc[4])

1536

In [21]:
# Print the input_ids length for the rows labelled 0..19 to confirm they are all padded to the same size.
for i in range(20):
    print(len(pdf.input_ids.loc[i]))

1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
