In [1]:
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

In [2]:
# Discourse-level table: one row per discourse element, with its essay id,
# text, type, effectiveness label, numeric score and CV fold (see preview below).
train = pd.read_csv('../input/feedback-effective-folds/train_folds.csv')
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,score,fold
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,1,0
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,1,0
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,1,0
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,1,0
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,1,0


In [3]:
# Essay-level table: one row per essay with a score, a group and the CV fold
# (see preview below). `process_essay` reads the fold from here.
essays = pd.read_csv('../input/feedback-effective-folds/essay_scores.csv')
essays.head()

Unnamed: 0,essay_id,score,group,fold
0,F98E8D4EA700,0.777778,2,0
1,66BB82BD76B2,0.875,2,0
2,85F4C57672EA,0.923077,2,0
3,06936C8AA35D,1.666667,3,0
4,61C3ADEA1DD5,0.666667,2,0


In [4]:
# Map essay_id -> full essay text. Every essay id present in `train` starts
# out as an empty string, so ids without a file on disk still have an entry.
texts = {x: "" for x in train.essay_id.unique().tolist()}
train_dir = '../input/feedback-prize-effectiveness/train'
for f in os.listdir(train_dir):
    # Decode explicitly as UTF-8 so character offsets do not depend on the
    # platform's default locale encoding.
    with open(os.path.join(train_dir, f), encoding='utf-8') as file:
        text = file.read()
    # The essay id is the file name up to the first dot.
    essay_id = f.split('.')[0]
    texts[essay_id] = text

In [5]:
# Smallest and largest number of discourse elements found in a single essay.
discourses_per_essay = train.groupby('essay_id')['discourse_id'].count()
discourses_per_essay.min(), discourses_per_essay.max()

(1, 23)

In [6]:
# Hugging Face checkpoint whose tokenizer is used to tokenize the essays.
# The part after the "/" is also used to name the output pickle in the last cell.
checkpoint = 'microsoft/deberta-v3-large'
# checkpoint = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def get_start_and_end(essay_id):
    """Locate every discourse of an essay inside the essay text.

    Reads the module-level ``train`` DataFrame (columns ``essay_id``,
    ``discourse_id``, ``discourse_text``) and the ``texts`` dict
    (essay id -> essay text).

    Discourses are searched in the order of their rows in ``train``, and each
    search starts at the end of the previously located discourse. If the
    exact discourse text is not found, progressively shorter prefixes
    (dropping one trailing character at a time) are tried. When even the
    empty prefix cannot be placed, the search position itself is used as the
    start. The end offset is always ``start + len(discourse)`` with the full,
    untrimmed discourse length, even when only a prefix matched.

    Args:
        essay_id: id of the essay; must be a key of ``texts``.

    Returns:
        A list of ``(discourse_id, start, end)`` tuples, one per discourse
        row of the essay, in row order.

    Raises:
        KeyError: if ``essay_id`` is not a key of ``texts``.
    """
    # Filter the frame once instead of once per column.
    rows = train[train.essay_id == essay_id]
    all_discourses = rows['discourse_text'].values.tolist()
    all_ids = rows['discourse_id'].values.tolist()
    text = texts[essay_id]

    discourse_id_2_startend = []
    end = 0
    for disc_id, discourse in zip(all_ids, all_discourses):
        # str.find with a start offset returns absolute positions and avoids
        # copying the tail of the essay for every lookup.
        start = text.find(discourse, end)
        # Special case if we can't find an exact match: retry with shorter
        # and shorter prefixes of the discourse.
        prefix = discourse
        while start == -1 and prefix:
            prefix = prefix[:-1]
            start = text.find(prefix, end)
        if start == -1:
            # Nothing could be placed (the search position is past the end
            # of the text): anchor the span at the search position.
            start = end
        end = start + len(discourse)
        discourse_id_2_startend.append((disc_id, start, end))
    return discourse_id_2_startend

In [8]:
from tqdm.auto import tqdm

# Flat list of (discourse_id, start, end) tuples collected over all essays.
mappings = [
    span
    for esid in tqdm(essays.essay_id.unique().tolist())
    for span in get_start_and_end(esid)
]

  0%|          | 0/4191 [00:00<?, ?it/s]

In [9]:
# Peek at the first (discourse_id, start, end) tuple and the total number of spans.
mappings[0], len(mappings)

(('3abf8b923b81', 0, 40), 36765)

In [10]:
# Number of discourse rows; compare with len(mappings) displayed in the previous cell.
len(train)

36765

In [11]:
# discourse_id -> (start, end) character span inside its essay.
map_dict = {disc_id: (start, end) for disc_id, start, end in mappings}

In [12]:
# Attach each discourse's (start, end) span; an id missing from map_dict raises KeyError.
train['start_end'] = train['discourse_id'].apply(map_dict.__getitem__)

In [13]:
# Discourse type -> integer class id, in order of first appearance in `train`.
type2id = {x: i for i, x in enumerate(train.discourse_type.unique().tolist())}
# 'Other' (used for tokens outside every discourse) takes the next free id
# instead of a hard-coded 7, so it cannot collide with a real discourse type.
type2id.setdefault('Other', len(type2id))
type2id

{'Lead': 0,
 'Position': 1,
 'Claim': 2,
 'Evidence': 3,
 'Counterclaim': 4,
 'Rebuttal': 5,
 'Concluding Statement': 6,
 'Other': 7}

In [14]:
# Inverse of type2id, plus a readable name for the -100 label value.
id2type = {class_id: type_name for type_name, class_id in type2id.items()}
id2type.update({-100: 'Mask'})
id2type

{0: 'Lead',
 1: 'Position',
 2: 'Claim',
 3: 'Evidence',
 4: 'Counterclaim',
 5: 'Rebuttal',
 6: 'Concluding Statement',
 7: 'Other',
 -100: 'Mask'}

In [15]:
def chck(a):
    """Return True if every integer from 0 to ``max(a)`` occurs in ``a``.

    When ``max(a)`` is negative there is nothing to check and the result is
    True. Any error raised while checking (for example a non-integer
    maximum) yields False.

    The check is an ordinary boolean expression rather than ``assert``
    statements, so it keeps working when Python runs with ``-O`` (which
    strips asserts).

    Args:
        a: non-empty collection of values.

    Returns:
        bool: whether ``0..max(a)`` are all present in ``a``.

    Raises:
        ValueError: if ``a`` is empty (raised by ``max``).
    """
    mx = max(a)
    try:
        return all(i in a for i in range(mx + 1))
    except Exception:
        return False

In [16]:
def process_essay(esid, max_length=1024):
    """Tokenize one essay and build token-level and discourse-level labels.

    Reads the module-level ``texts``, ``essays``, ``train``, ``tokenizer``,
    ``type2id`` and ``chck`` objects.

    Every token is labelled from its character offsets ``(ts, te)``:

    * an empty offset span (``ts == te``; presumably special and padding
      tokens) gets ``-100`` in all three token-level lists;
    * a token lying inside a discourse span ``(ds, de)`` gets that
      discourse's class id, its score and its position among the essay's
      rows in ``train``. The first matching discourse wins;
    * any other token gets class ``type2id['Other']`` and ``-100`` for the
      score and the example mapping.

    If the resulting lists are inconsistent, ``'Wrong!!'`` and the essay id
    are printed; the result is still returned.

    Args:
        esid: essay id; must be a key of ``texts`` and present in ``essays``.
        max_length: forwarded to the tokenizer, which is called with
            ``truncation=True`` and ``padding='max_length'``.

    Returns:
        dict with keys ``essay_id``, ``fold``, ``input_ids``,
        ``attention_mask``, ``offset_mapping``, ``token_class_labels``,
        ``token_scores_labels``, ``token_examples_mapping``,
        ``examples_scores`` and ``examples_classes``.

    Raises:
        KeyError: if ``esid`` is not a key of ``texts``.
        IndexError: if ``esid`` has no row in ``essays``.
    """
    text = texts[esid]
    fold = essays[essays.essay_id == esid].fold.values[0]
    tokens = tokenizer(text, return_offsets_mapping=True, max_length=max_length, truncation=True, padding='max_length')

    # Select the essay's discourse rows once and reuse them for every column.
    essay_rows = train[train.essay_id == esid]
    start_ends = essay_rows['start_end'].values.tolist()
    discourse_types = essay_rows['discourse_type'].values.tolist()
    scores = essay_rows['score'].values.tolist()
    # Loop-invariant: build the (span, type, score) triples once, not per token.
    discourses = list(zip(start_ends, discourse_types, scores))
    other_class = type2id['Other']

    token_class_labels = []
    token_scores_labels = []
    token_examples_mapping = []

    for ts, te in tokens['offset_mapping']:
        # Default: token outside every discourse.
        class_label, score_label, example_idx = other_class, -100, -100
        if ts == te:
            # Empty offset span: masked in all three lists.
            class_label = -100
        else:
            for i, ((ds, de), discourse_type, discourse_score) in enumerate(discourses):
                # A token may start one character before the discourse start
                # (presumably to absorb a leading space included in the
                # token's offsets - TODO confirm).
                if ts >= ds - 1 and te <= de:
                    class_label = type2id[discourse_type]
                    score_label = discourse_score
                    example_idx = i
                    break
        token_class_labels.append(class_label)
        token_scores_labels.append(score_label)
        token_examples_mapping.append(example_idx)

    # Discourse-level targets, in the same order as the indices stored in
    # token_examples_mapping.
    examples_scores = list(scores)
    examples_classes = [type2id[discourse_type] for discourse_type in discourse_types]

    # Explicit boolean checks instead of assert + bare except, so the
    # validation still runs under `python -O` and unrelated errors are not
    # swallowed.
    n_tokens = len(tokens['input_ids'])
    consistent = (
        len(token_class_labels) == n_tokens
        and len(token_scores_labels) == n_tokens
        and len(token_examples_mapping) == n_tokens
        and len(examples_scores) == len(examples_classes)
        and len(token_class_labels) == max_length
        and len(token_examples_mapping) > 0
        and chck(token_examples_mapping)
    )
    if not consistent:
        print('Wrong!!', esid)

    return {
        'essay_id': esid,
        'fold': fold,
        'input_ids': tokens['input_ids'],
        'attention_mask': tokens['attention_mask'],
        'offset_mapping': tokens['offset_mapping'],
        'token_class_labels': token_class_labels,
        'token_scores_labels': token_scores_labels,
        'token_examples_mapping': token_examples_mapping,
        'examples_scores': examples_scores,
        'examples_classes': examples_classes,
    }


In [17]:
# Inspect the discourse rows, including the computed start_end spans, of one
# essay that is used as a spot check in the following cells.
train[train.essay_id == '0F2199921C33']

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,score,fold,start_end
780,89554ddc22e1,0F2199921C33,Do you remember the summer projects that you h...,Lead,Adequate,1,0,"(0, 302)"
781,cec88a31dcf3,0F2199921C33,The summer projects should be teacher designed,Position,Adequate,1,0,"(302, 349)"
782,c81b32b22872,0F2199921C33,it gives the teacher a chance to gauge the stu...,Claim,Effective,2,0,"(357, 424)"
783,27ca36a0bda2,0F2199921C33,base it on what they will be learning that year,Claim,Effective,2,0,"(424, 472)"
784,9a6b9a33d90a,0F2199921C33,and learn the learning style of students. \n,Claim,Effective,2,0,"(472, 515)"
785,614cd99fec55,0F2199921C33,Some say that student designed summer projects...,Counterclaim,Effective,2,0,"(515, 675)"
786,a2c4308dd1bf,0F2199921C33,And while it may be good that the student is i...,Rebuttal,Effective,2,0,"(675, 939)"
787,655dee8bcbf1,0F2199921C33,A student who is knowable able about history m...,Evidence,Effective,2,0,"(940, 1954)"
788,6c1d9b65465f,0F2199921C33,Teacher-designed summer projects allow teacher...,Claim,Effective,2,0,"(1955, 2055)"
789,a5d11fe804cd,0F2199921C33,The teacher can create different problems base...,Evidence,Effective,2,0,"(2055, 2801)"


In [18]:
# Number of discourse rows belonging to the spot-check essay.
len(train[train.essay_id == '0F2199921C33'])

15

In [19]:
# Run the processing on the spot-check essay with the default max_length.
ttt = process_essay('0F2199921C33')

In [20]:
# Fields produced by process_essay for a single essay.
ttt.keys()

dict_keys(['essay_id', 'fold', 'input_ids', 'attention_mask', 'offset_mapping', 'token_class_labels', 'token_scores_labels', 'token_examples_mapping', 'examples_scores', 'examples_classes'])

In [21]:
import torch

# Spot check: decode the tokens assigned to each discourse and print them
# next to the original discourse text.
token_examples_mapping = ttt['token_examples_mapping']
examples_scores = ttt['examples_scores']
input_ids = ttt['input_ids']

# Highest discourse index that received at least one token.
num_ex = max(token_examples_mapping)

# Build the tensors once instead of on every loop iteration.
mapping_tensor = torch.tensor(token_examples_mapping)
input_ids_tensor = torch.tensor(input_ids)

# Iterate over the rows of the same essay that produced `ttt`.
for i, (_, row) in enumerate(train[train.essay_id == ttt['essay_id']].iterrows()):
    indices = mapping_tensor == i
    tokens = input_ids_tensor[indices]
    text = tokenizer.decode(tokens)
    text2 = row['discourse_text']
    # Exact comparison: any whitespace difference between the decoded text
    # and the raw discourse text makes this False.
    print(i, text==text2)
    print(text)
    print(text2)
    

0 False
Do you remember the summer projects that you had to do during summer break? Even now as fewer places require summer projects for all class,they are still an important thing for both students and teachers. They allow both so see how the other is either by teaching style or how they did on the content.
Do you remember the summer projects that you had to do during summer break? Even now as fewer places require summer projects for all class,they are still an important thing for both students and teachers. They allow both so see how the other is either by teaching style or how they did on the content. 
1 False
The summer projects should be teacher designed
The summer projects should be teacher designed 
2 False
it gives the teacher a chance to gauge the students academic level,
it gives the teacher a chance to gauge the students academic level 
3 False
base it on what they will be learning that year,
base it on what they will be learning that year 
4 False
and learn the learning sty

In [22]:
from tqdm.auto import tqdm

# One processed record (dict) per essay, in order of first appearance in `train`.
processed_data = [
    process_essay(esid)
    for esid in tqdm(train.essay_id.unique().tolist())
]

  0%|          | 0/4191 [00:00<?, ?it/s]

In [23]:
# One row per essay; each list-valued field of the processed records becomes
# an object column (see preview below).
pdf = pd.DataFrame(processed_data)
pdf

Unnamed: 0,essay_id,fold,input_ids,attention_mask,offset_mapping,token_class_labels,token_scores_labels,token_examples_mapping,examples_scores,examples_classes
0,007ACE74B050,0,"[1, 2684, 261, 584, 280, 358, 11759, 261, 584,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 2), (2, 3), (3, 5), (5, 6), (6, 7...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 1]","[0, 1, 2, 3, 4, 5, 3, 4, 6]"
1,00944C693682,4,"[1, 79853, 262, 4119, 265, 2020, 303, 753, 263...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 8), (8, 12), (12, 18), (18, 21), ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2]","[0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 6]"
2,00BD97EA4041,2,"[1, 3432, 4659, 623, 262, 2955, 10172, 265, 59...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (6, 16), (16, 21), (21, 25), ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[1, 1, 1, 0, 1, 1, 1]","[0, 1, 2, 3, 3, 2, 6]"
3,00C6E82FE5BA,0,"[1, 273, 428, 272, 278, 1804, 280, 297, 282, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 1), (1, 7), (7, 12), (12, 15), (1...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 1]","[1, 2, 3, 3, 2, 6]"
4,013B9AA6B9DB,0,"[1, 458, 269, 272, 576, 277, 7583, 302, 1369, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 7), (7, 12), (12, 18), (1...","[-100, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]","[0, 1, 3, 4, 5, 3, 4, 5, 3, 2, 6]"
...,...,...,...,...,...,...,...,...,...,...
4186,FDF0AEEB14C3,1,"[1, 7896, 264, 563, 3343, 295, 282, 1179, 270,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 5), (5, 8), (8, 15), (15, 24), (2...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2]","[0, 1, 2, 3, 4, 3, 5, 3, 3, 2, 3, 4, 5, 3, 6]"
4187,FE3CA06DDCA1,1,"[1, 1167, 269, 278, 335, 760, 5387, 274, 270, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (3, 6), (6, 9), (9, 14), (14,...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1, 1, 1, 1]","[0, 1, 2, 2, 3, 3, 6]"
4188,FEF42864AE28,0,"[1, 1717, 266, 455, 406, 288, 563, 261, 286, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (6, 8), (8, 13), (13, 17), (1...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2]","[0, 1, 2, 2, 2, 3, 4, 5, 2, 3, 4, 3, 5, 6]"
4189,FF9E0379CD98,4,"[1, 879, 563, 630, 50240, 12025, 1101, 283, 26...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 11), (11, 17), (17, 22), ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]","[0, 1, 2, 2, 2, 2, 3, 2, 3, 2, 3, 4, 5, 3, 6, 4]"


In [24]:
import pickle

# Name the output after the last path component of the checkpoint, so a
# checkpoint without an "org/" prefix does not raise IndexError.
model_name = checkpoint.split("/")[-1]
with open(f'processed-{model_name}.pickle', 'wb') as handle:
    pickle.dump(pdf, handle, protocol=pickle.HIGHEST_PROTOCOL)

# To read the file back. Only unpickle files you trust: pickle.load can
# execute arbitrary code.
# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)