This file is used to calculate the metrics on the CoNLL-2014 Shared Task test dataset.

In [1]:
conll_path = path/to/noalt/folder/from/conll/data

In [2]:
data = open(conll_path+'official-2014.1.sgml').read()

In [3]:
import re

In [4]:
text_capture = r"<TEXT>.*?</TEXT>"
texts = re.findall(text_capture, data, flags=re.DOTALL)
texts[0]

'<TEXT>\n<TITLE>\nKeeping the Secret of Genetic Testing\n</TITLE>\n<P>\nWhat is genetic risk? Genetic risk refers more to your chance of inheriting a disorder or disease. People get certain disease because of genetic changes. How much a genetic change tells us about your chance of developing a disorder is not always clear. If your genetic results indicate that you have gene changes associated with an increased risk of heart disease, it does not mean that you definitely will develop heart disease. The opposite is also true. If your genetic results show that you do not have changes associated with an increased risk of heart disease, it is still possible that you develop heart disease. However for some rare diseases, people who have certain gene changes are guaranteed to develop the disease. When we are diagonosed out with certain genetic disease, are we suppose to disclose this result to our relatives? My answer is no.\n</P>\n<P>\nOn one hand, we do not want this potential danger causing

In [5]:
annotation_capture = r"<ANNOTATION .*?</ANNOTATION>"
annotations = re.findall(annotation_capture, data, flags=re.DOTALL)

In [6]:
len(annotations), len(texts)

(50, 50)

In [7]:
for annotation in annotations:
    mistakes_capture = r"<MISTAKE .*?</MISTAKE>"
    mistakes = re.findall(mistakes_capture, annotation, flags=re.DOTALL)
print(mistakes[0])

<MISTAKE start_par="0" start_off="6" end_par="0" end_off="11">
<TYPE>Rloc-</TYPE>
<CORRECTION></CORRECTION>
</MISTAKE>


In [8]:
text_data = {}
for i in range(len(texts)):
    parts_ = texts[i].split('<P>')[1:]
    parts = [re.sub(r"<.*?>", "", p).strip() for p in parts_]
    text_data[i] = {'text' : parts}
    annotation = annotations[i]
    mistake_data = []
    mistakes = re.findall(r"<MISTAKE .*?</MISTAKE>", annotation, flags=re.DOTALL)
    for n, mistake in enumerate(mistakes):
        start_part = re.search(r'(?<=start_par=")[0-9]+?(?=")', mistake)[0]
        start_off = re.search(r'(?<=start_off=")[0-9]+?(?=")', mistake)[0]
        end_off = re.search(r'(?<=end_off=")[0-9]+?(?=")', mistake)[0]
        type_ = re.search(r'(?<=<TYPE>).+?(?=</TYPE>)', mistake)[0]
        correction = re.search(r'(?<=<CORRECTION>).*?(?=</CORRECTION>)', mistake, flags=re.DOTALL)[0]
        mistake_data.append({
            'part' : int(start_part),
            'start' : int(start_off),
            'end' : int(end_off),
            'type' : type_,
            'correction' : correction,
            })
        
    text_data[i]['mistakes'] = mistake_data
    
       

In [11]:
text_data[0]['mistakes'][:3]

[{'part': 1, 'start': 42, 'end': 46, 'type': 'Rloc-', 'correction': ''},
 {'part': 1, 'start': 118, 'end': 125, 'type': 'Nn', 'correction': 'diseases'},
 {'part': 1,
  'start': 597,
  'end': 604,
  'type': 'Vt',
  'correction': 'will develop'}]

In [12]:
errors = []
for text in text_data.values():
    if text['mistakes'][-1]['part'] == len(text['text']) or text['mistakes'][0]['part'] != 0:
        offset = 1
    else:
        offset = 0
    for mistake in text['mistakes']:
        error_span = text['text'][mistake['part']-offset][mistake['start']:mistake['end']] + ' → ' + mistake['correction']
        error_type = mistake['type']
        
        errors.append((error_span, error_type))
        

In [13]:
errors[567]

('support  → support,', 'Mec')

In [14]:
conll_to_realec = {
    'Vt' : 'Eliminated', # Choice of tense eliminated
    'Vm' : 'Eliminated', # Modals eliminated
    'Vform' : 'Tense_form',
    'V0' : 'Absence_comp_sent',
    'SVA' : 'Agreement_errors',
    'ArtOrDet' : 'Eliminated', # Articles, Determiners eliminated
    'Nn' : 'Eliminated', # Noun number eliminated
    'Npos' : 'Eliminated', # Possessive noun eliminated
    'Pform' : 'Eliminated', # Pronouns eliminated
    'Pref' : 'Ref_device', 
    'Prep' : 'Prepositions',
    'Wci' : 'lex_', # applies to both lexical tags in REALEC
    'Wa' : 'N/A', # Acronyms : no equivalent tag in REALEC
    'Wform' : 'Category_confusion',
    'Wtone' : 'Inappropriate_register',
    'Srun' : 'Eliminated', # Punctuation eliminated
    'Smod' : 'Absence_comp_sent', # Dangling modifiers
    'Spar' : 'Eliminated', # Parallel contruction eliminated
    'Sfrag' : 'N/A', # Sentence fragment : no equivalent tag in REALEC
    'Ssub' : 'Relative_clause',
    'WOinc' : 'Eliminated', # Word order eliminated
    'WOadv' : 'Eliminated', # Word order eliminated
    'Trans' : 'Linking_device',
    'Mec' : 'Eliminated', # Spelling, Punctuation, Capitalisation eliminated
    'Rloc-' : 'Redundant_comp',
    'Cit' : 'N/A', # citation : no equivalent in REALEC
    'Others' : 'N/A',
    'Um' : 'Eliminated' # Coherence eliminated
}

In [108]:
from transformers import (TextClassificationPipeline,
                          RobertaTokenizerFast, RobertaForSequenceClassification)
from tqdm.auto import tqdm

The following is the pipeline for the fine-tuned model.

In [109]:
general_model_path = ...
discourse_model_path = ...
grammar_model_path = ...
lexical_model_path = ...

In [110]:
general = TextClassificationPipeline(
    model=RobertaForSequenceClassification.from_pretrained(general_model_path),
    tokenizer=RobertaTokenizerFast.from_pretrained(general_model_path),
    top_k=None
)

discourse = TextClassificationPipeline(
    model=RobertaForSequenceClassification.from_pretrained(discourse_model_path),
    tokenizer=RobertaTokenizerFast.from_pretrained(discourse_model_path),
)

grammar = TextClassificationPipeline(
    model=RobertaForSequenceClassification.from_pretrained(grammar_model_path),
    tokenizer=RobertaTokenizerFast.from_pretrained(grammar_model_path),
)

lexical = TextClassificationPipeline(
    model=RobertaForSequenceClassification.from_pretrained(lexical_model_path),
    tokenizer=RobertaTokenizerFast.from_pretrained(lexical_model_path),
)

In [113]:
real = {
    'Verb_pattern': 'Verb_pattern',
 'Confusion_of_structures': 'Confusion_of_structures',
 'Comparison_degree': 'Voice',
 'Formational_affixes': 'Comparison_degree',
 'Prepositions': 'Formational_affixes',
 'Category_confusion': 'Prepositions',
 'Agreement_errors': 'Category_confusion',
 'Numerals': 'Agreement_errors',
 'Tense_form': 'Numerals',
 'Voice': 'Relative_clause',
 'Relative_clause': 'Tense_form'
}

In [160]:
answers = []
for sent, tag in tqdm(errors):
    general_class = general(sent)[0]
    general_label = general_class[0]['label']
    
    if general_class[0]['score'] > 0.85:
        if general_label[0] == 'l':
            pred = lexical(sent)[0]
            label, score = pred['label'], pred['score']
        elif general_label[0] == 'd':
            pred = discourse(sent)[0]
            label, score = pred['label'], pred['score']
        else:
            pred = grammar(sent)[0]
            label, score = real[pred['label']], pred['score']
            
        answers.append((label, score > 0.7))
        
    else:
        
        general_classes = {i['label']:i['score'] for i in general_class}
        
        lex_gen_lik = general_classes['lexical']
        disc_gen_lik = general_classes['discourse']
        gram_gen_lik = general_classes['gram']
        
        lex = lexical(sent)[0]
        disc = discourse(sent)[0]
        gram = grammar(sent)[0]
        
        lex_lik = lex['score'] * lex_gen_lik
        disc_lik = disc['score'] * disc_gen_lik
        gram_lik = gram['score'] * gram_gen_lik
        
        if max(lex_lik, disc_lik, gram_lik) == lex_lik:
            answers.append((lex['label'], lex_lik > 0.63))
        elif max(lex_lik, disc_lik, gram_lik) == disc_lik:
            answers.append((disc['label'], disc_lik > 0.63))
        else:
            answers.append((gram['label'], gram_lik > 0.63))
            

  0%|          | 0/3331 [00:00<?, ?it/s]

In [163]:
conll_answers = [i[1] for i in errors]

In [164]:
assert len(conll_answers) == len(answers)

In [165]:
import pandas as pd

In [166]:
df = pd.DataFrame(columns=['error', 'conll', 'pred', 'accepted', 'prediction correct'])
i, idx = 0, 0

for target, pred in zip(conll_answers, answers):
    if conll_to_realec[target] not in ['Eliminated', 'N/A']:
        df.loc[idx] = [errors[i][0], target, pred[0], pred[1], pred[0].startswith(conll_to_realec[target])]
        idx += 1
    i += 1

In [173]:
len(conll_answers)

3331

In [167]:
df

Unnamed: 0,error,conll,pred,accepted,prediction correct
0,more →,Rloc-,lex_item_choice,False,False
1,diagonosed out → diagnosed,Rloc-,lex_item_choice,True,False
2,causing → to have,Wci,lex_item_choice,True,True
3,in → on,Prep,Prepositions,True,True
4,caring → caring about,Prep,Verb_pattern,True,False
...,...,...,...,...,...
1710,". So → , and so",Trans,Linking_device,True,True
1711,does → do,SVA,Agreement_errors,True,True
1712,in → for,Prep,Prepositions,True,True
1713,is → are,SVA,Agreement_errors,True,True


In [170]:
accepted = df[df['accepted'] == True]

In [171]:
accepted[accepted['prediction correct']]

Unnamed: 0,error,conll,pred,accepted,prediction correct
2,causing → to have,Wci,lex_item_choice,True,True
3,in → on,Prep,Prepositions,True,True
6,ways → things,Wci,lex_item_choice,True,True
10,go → do,Wci,lex_item_choice,True,True
14,absolute → absolutely,Wform,Category_confusion,True,True
...,...,...,...,...,...
1710,". So → , and so",Trans,Linking_device,True,True
1711,does → do,SVA,Agreement_errors,True,True
1712,in → for,Prep,Prepositions,True,True
1713,is → are,SVA,Agreement_errors,True,True


In [186]:
accepted[accepted['prediction correct'] == False]

Unnamed: 0,error,conll,pred,accepted,prediction correct
1,diagonosed out → diagnosed,Rloc-,lex_item_choice,True,False
4,caring → caring about,Prep,Verb_pattern,True,False
9,from the scope of → with,Prep,Category_confusion,True,False
11,health conditions → health,Rloc-,lex_part_choice,True,False
18,or → it or,Pref,Absence_comp_sent,True,False
...,...,...,...,...,...
1686,as → in view of,Prep,lex_part_choice,True,False
1693,the help of →,Rloc-,lex_item_choice,True,False
1696,area →,Rloc-,lex_item_choice,True,False
1708,young → younger,Wform,Comparison_degree,True,False


In [210]:
realec_to_conll = {v:k for k,v in conll_to_realec.items()}
realec_to_conll['lex_item_choice'] = 'Wci'
realec_to_conll['lex_part_choice'] = 'Wci'
realec_to_conll

{'Eliminated': 'Um',
 'Tense_form': 'Vform',
 'Absence_comp_sent': 'Smod',
 'Agreement_errors': 'SVA',
 'Ref_device': 'Pref',
 'Prepositions': 'Prep',
 'lex_': 'Wci',
 'N/A': 'Others',
 'Category_confusion': 'Wform',
 'Inappropriate_register': 'Wtone',
 'Relative_clause': 'Ssub',
 'Linking_device': 'Trans',
 'Redundant_comp': 'Rloc-',
 'lex_item_choice': 'Wci',
 'lex_part_choice': 'Wci'}

In [250]:
def prec_rec_f1(cl):
    
    total = len(accepted[accepted['conll'] == cl])
    corr = accepted[accepted['prediction correct']]
    correct = len(corr[corr['pred'].str.startswith(conll_to_realec[cl])])
    
    TP = len(accepted[(accepted['conll'] == cl) & (accepted['pred'].str.startswith(conll_to_realec[cl]))])
    FP = len(accepted[(accepted['conll'] != cl) & (accepted['pred'].str.startswith(conll_to_realec[cl]))])
    TN = len(accepted[(accepted['conll'] != cl) & ~(accepted['pred'].str.startswith(conll_to_realec[cl]))])
    FN = len(accepted[(accepted['conll'] == cl) & ~(accepted['pred'].str.startswith(conll_to_realec[cl]))])
    
    
    try:
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = TP / (TP + .5 * (FP + FN))
        f05 = 1.25 * TP / (1.25 * TP + 0.25 * FN + FP)    
        return precision, recall, f1, f05, total, correct
    except Exception:
        print(f'for class {cl} TP = {TP}, FP = {FP}, TN = {TN}, FN = {FN}')
        return 0, 0, 0, 0, total, correct
    

In [251]:
prec_rec_f1('Wci')

(0.7061143984220908,
 0.9445910290237467,
 0.8081264108352144,
 0.7436643124221022,
 379,
 358)

In [252]:
corr = accepted[accepted['prediction correct']]
corr[corr['pred'].str.startswith(conll_to_realec['Wci'])]
# corr

Unnamed: 0,error,conll,pred,accepted,prediction correct
2,causing → to have,Wci,lex_item_choice,True,True
6,ways → things,Wci,lex_item_choice,True,True
10,go → do,Wci,lex_item_choice,True,True
19,system → code,Wci,lex_item_choice,True,True
23,result → come,Wci,lex_item_choice,True,True
...,...,...,...,...,...
1658,→ sending,Wci,lex_item_choice,True,True
1666,hat ti → complete,Wci,lex_item_choice,True,True
1673,ke t → look,Wci,lex_item_choice,True,True
1694,wherever → whenever,Wci,lex_item_choice,True,True


In [255]:
classes = set(df['conll'].tolist())    

print(len(classes), classes)
pr, rec, f1, f05, totals, corrects  = [], [], [], [], [], []
for cl in classes:
        m1, m2, m3, m4, m5, m6 = prec_rec_f1(cl)
        pr.append(round(m1, 2))
        rec.append(round(m2, 2))
        f1.append(round(m3, 2))
        f05.append(round(m4, 2))
        totals.append(m5)
        corrects.append(m6)

    
metrics = pd.DataFrame(data = list(zip(classes, pr, rec, f1, f05, totals)), 
                       columns=['class', 'precision', 'recall', 'F1', 'F0.5', 'in CoNLL dataset'])

metrics

12 {'Vform', 'Ssub', 'Smod', 'Pref', 'V0', 'Rloc-', 'SVA', 'Wtone', 'Prep', 'Wform', 'Trans', 'Wci'}
for class Vform TP = 0, FP = 0, TN = 1198, FN = 48
for class Smod TP = 0, FP = 57, TN = 1189, FN = 0


Unnamed: 0,class,precision,recall,F1,F0.5,in CoNLL dataset
0,Vform,0.0,0.0,0.0,0.0,48
1,Ssub,0.0,0.0,0.0,0.0,3
2,Smod,0.0,0.0,0.0,0.0,0
3,Pref,0.91,0.77,0.84,0.88,96
4,V0,0.28,0.8,0.42,0.32,20
5,Rloc-,1.0,0.01,0.02,0.05,98
6,SVA,0.91,0.97,0.94,0.92,149
7,Wtone,0.5,0.09,0.15,0.26,11
8,Prep,0.99,0.86,0.92,0.96,321
9,Wform,0.34,0.38,0.36,0.35,68


In [272]:
tag = accepted[accepted['conll'] == 'Wtone']
tag[tag['pred'] != 'Inappropriate_register']

Unnamed: 0,error,conll,pred,accepted,prediction correct
225,for sure →,Wtone,lex_item_choice,True,False
446,In a nutshell → In summary,Wtone,lex_part_choice,True,False
770,In a nutshell → In summary,Wtone,lex_part_choice,True,False
857,To put it in the nutshell → In summary,Wtone,Linking_device,True,False
1128,guy → man,Wtone,lex_item_choice,True,False
1190,persons → people,Wtone,lex_item_choice,True,False
1543,stuff → things,Wtone,Numerals,True,False
1577,showing up → appearing,Wtone,lex_item_choice,True,False
1606,recently → recent,Wtone,Category_confusion,True,False
1709,To put it in the nutshell → In summary,Wtone,Linking_device,True,False


In [271]:
tag = accepted[accepted['conll'] == 'Vform']
tag[tag['pred'] != 'Tense_form']

Unnamed: 0,error,conll,pred,accepted,prediction correct
32,pass → passing,Vform,Category_confusion,True,False
44,indicates → indicate,Vform,Agreement_errors,True,False
46,carry → carrying,Vform,Category_confusion,True,False
71,intake → taking in,Vform,lex_item_choice,True,False
110,have → having,Vform,Category_confusion,True,False
128,eat → eating,Vform,Category_confusion,True,False
145,keeping → keep,Vform,Category_confusion,True,False
155,dishearten → disheartening,Vform,Formational_affixes,True,False
171,knowing → to know,Vform,Verb_pattern,True,False
176,adding → add,Vform,Category_confusion,True,False


In [200]:
smod = [e for e in errors if e[1] == 'Smod']
smod

[('Especially for the young people without marrige, if → This is especially the case for young people who are unmarried. If',
  'Smod'),
 ('By telling  → If they tell', 'Smod'),
 ('By making sure that they are aware of this genetic problem → If they are made aware of this genetic problem',
  'Smod'),
 ('without informing → without their having informed', 'Smod'),
 ('After realizing  → After he or she realizes', 'Smod')]

In [201]:
len(errors)

3331