This file contains the code that was used to train the model. It is intended to be run in Google Colab.

The following cell downloads the training data for whatever model you want to train. Files named "split_train_{gram/disc/lexical}_eo.pickle" are the training files used to train the smaller models; the general model is trained on "split_train_eo.pickle". 

In [None]:
!wget https://raw.githubusercontent.com/lightcarrieson/error_classification/main/train/split_train_gram_eo.pickle

--2023-05-15 19:26:45--  https://raw.githubusercontent.com/lightcarrieson/dip/main/train/split_train_gram_eo.pickle
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 597336 (583K) [application/octet-stream]
Saving to: ‘split_train_gram_eo.pickle’


2023-05-15 19:26:45 (28.2 MB/s) - ‘split_train_gram_eo.pickle’ saved [597336/597336]



In [None]:
!pip install transformers==4.28.0 datasets evaluate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import ClassLabel, Features, Dataset
from transformers import (BertTokenizer, BertForSequenceClassification, 
                          Trainer, TrainingArguments,
                          DataCollatorWithPadding, TextClassificationPipeline)
import pandas as pd
from tqdm.auto import tqdm
import torch
import pickle
import numpy as np
import evaluate

In [None]:
# Load the list of (text, label) training pairs.
# NOTE: pickle can execute arbitrary code while loading, so only unpickle files from a source you
# trust (here: the repository file fetched by the wget cell at the top of the notebook).
# The context manager guarantees the file handle is closed once the data is loaded.
with open('split_train_gram_eo.pickle', 'rb') as f:
    train = pickle.load(f)

In [None]:
train[:5]

[('for → over', 5),
 ('begin do → begin to do', 0),
 ('one decimal → one-tenth', 8),
 ('percents → percent', 8),
 ('housekeepers → housekeeping', 6)]

In [None]:
len(train)

20767

In the following cell, uncomment the `id2label` and `label2id` definitions that match the model being trained (they are different for every model) and comment out the others.

In [None]:
# for general case
# id2label = {0 : 'lexical', 1: 'discourse', 2: 'gram'}
# label2id = {n : tag for tag, n in id2label.items()}

# for discourse errors
# id2label = {n: tag for n, tag in enumerate(['Absence_comp_sent', 'Absence_explanation', 
#                                             'Inappropriate_register', 'Linking_device', 
#                                             'Redundant_comp', 'Ref_device'])}
# label2id = {n : tag for tag, n in id2label.items()}

# for grammar errors
# Class index -> tag name; the position in the list is the numeric label used in the training data.
id2label = dict(enumerate([
    'Verb_pattern', 'Confusion_of_structures', 'Voice', 'Comparison_degree',
    'Formational_affixes', 'Prepositions', 'Category_confusion',
    'Agreement_errors', 'Numerals', 'Tense_form', 'Relative_clause',
]))

# Inverse mapping (tag name -> class index), derived from id2label so the two always agree.
label2id = {tag: idx for idx, tag in id2label.items()}

# for lexical errors
# label2id = {'lex_item_choice': 0, 'lex_part_choice': 1}
# id2label = {0: 'lex_item_choice', 1: 'lex_part_choice'}

In [None]:
id2label

{0: 'Verb_pattern',
 1: 'Confusion_of_structures',
 2: 'Voice',
 3: 'Comparison_degree',
 4: 'Formational_affixes',
 5: 'Prepositions',
 6: 'Category_confusion',
 7: 'Agreement_errors',
 8: 'Numerals',
 9: 'Tense_form',
 10: 'Relative_clause'}

In [None]:
label2id

{'Verb_pattern': 0,
 'Confusion_of_structures': 1,
 'Voice': 2,
 'Comparison_degree': 3,
 'Formational_affixes': 4,
 'Prepositions': 5,
 'Category_confusion': 6,
 'Agreement_errors': 7,
 'Numerals': 8,
 'Tense_form': 9,
 'Relative_clause': 10}

In [None]:
# One row per training example: the text and its numeric label.
data = pd.DataFrame.from_records(train, columns=['text', 'label'])

In [None]:
# Human-readable tag for every row, looked up from its numeric label (unknown labels raise KeyError).
data['tag'] = [id2label[label_id] for label_id in data['label']]

In [None]:
data

Unnamed: 0,text,label,tag
0,for → over,5,Prepositions
1,begin do → begin to do,0,Verb_pattern
2,one decimal → one-tenth,8,Numerals
3,percents → percent,8,Numerals
4,housekeepers → housekeeping,6,Category_confusion
...,...,...,...
20762,makes → make,7,Agreement_errors
20763,time → times,7,Agreement_errors
20764,till → to,5,Prepositions
20765,in → by,5,Prepositions


In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast

In [None]:
# Name of the pretrained checkpoint that is fine-tuned below.
# Alternative kept for reference: MODEL = 'bert-large-uncased'
MODEL = 'roberta-base'

In [None]:
# Load the tokenizer that belongs to the MODEL checkpoint.
# For the BERT alternative use: tokenizer = BertTokenizer.from_pretrained(MODEL)
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
tokenizer(data['text'].tolist()[0])

{'input_ids': [0, 1990, 42484, 81, 2], 'attention_mask': [1, 1, 1, 1, 1]}

In [None]:
# Tokenize all texts in a single batched call instead of one Python-level call per example.
# Padding settings are unchanged: every example is padded up to 40 tokens, and no truncation is
# applied, so longer texts keep their full length.
encodings = tokenizer(data['text'].tolist(), padding='max_length', max_length=40)
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
# The RoBERTa tokenizer output contains no token_type_ids, so none are collected here.

  0%|          | 0/20767 [00:00<?, ?it/s]

In [None]:
# Attach the tokenizer outputs as new columns (one list of ints per row).
data = data.assign(input_ids=input_ids, attention_mask=attention_mask)
# data = data.assign(token_type_ids=token_type_ids)

In [None]:
data

Unnamed: 0,text,label,tag,input_ids,attention_mask
0,for → over,5,Prepositions,"[0, 1990, 42484, 81, 2, 1, 1, 1, 1, 1, 1, 1, 1...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,begin do → begin to do,0,Verb_pattern,"[0, 43230, 109, 42484, 1642, 7, 109, 2, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,one decimal → one-tenth,8,Numerals,"[0, 1264, 46421, 42484, 65, 12, 90, 28249, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
3,percents → percent,8,Numerals,"[0, 1741, 438, 4189, 42484, 135, 2, 1, 1, 1, 1...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,housekeepers → housekeeping,6,Category_confusion,"[0, 3138, 15214, 42484, 790, 12609, 2, 1, 1, 1...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
20762,makes → make,7,Agreement_errors,"[0, 39082, 42484, 146, 2, 1, 1, 1, 1, 1, 1, 1,...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20763,time → times,7,Agreement_errors,"[0, 958, 42484, 498, 2, 1, 1, 1, 1, 1, 1, 1, 1...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20764,till → to,5,Prepositions,"[0, 90, 1873, 42484, 7, 2, 1, 1, 1, 1, 1, 1, 1...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20765,in → by,5,Prepositions,"[0, 179, 42484, 30, 2, 1, 1, 1, 1, 1, 1, 1, 1,...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Keep only the columns that are fed to the Trainer.
tr = data[['label', 'input_ids', 'attention_mask']]

# Declare the label column as a ClassLabel whose names follow id2label order, so that class
# index i keeps meaning id2label[i].
# class_encode_column('label') must NOT be used on these integer labels: it stringifies them and
# sorts the strings lexicographically ('0', '1', '10', '2', ...), which silently remaps labels
# 2-10 and desynchronises the training targets from the id2label/label2id given to the model.
label_feature = ClassLabel(names=[id2label[i] for i in range(len(id2label))])
dataset = Dataset.from_pandas(tr).cast_column('label', label_feature)

# Hold out 20% of the examples for evaluation; the fixed seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

Stringifying the column:   0%|          | 0/20767 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/20767 [00:00<?, ? examples/s]

In [None]:
dataset, dataset['train'].features

(DatasetDict({
     train: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 16613
     })
     test: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 4154
     })
 }),
 {'label': ClassLabel(names=['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9'], id=None),
  'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
  'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)})

In [None]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy metric for one evaluation pass.

    eval_pred unpacks into (per-class scores, reference labels); the predicted
    class of each example is the index of its highest score along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Classification model initialised from the MODEL checkpoint, with one output per tag.
# BERT alternative: call BertForSequenceClassification.from_pretrained with the same arguments.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
training_args = TrainingArguments(
    # checkpointing: where checkpoints are written, how often, and how many are kept
    output_dir='seqclassifier',
    save_strategy='epoch',
    save_total_limit=5,
    # evaluation: run once per epoch and load the best checkpoint when training ends
    evaluation_strategy='epoch',
    load_best_model_at_end=True,
    # optimisation
    num_train_epochs=20,
    learning_rate=2e-6,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
)

In [None]:
# Collator that pads the examples of a batch using the tokenizer's padding settings.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.521848,0.833414
2,No log,0.513141,0.834136
3,No log,0.506522,0.838469
4,0.497900,0.499062,0.83558
5,0.497900,0.499193,0.840154
6,0.497900,0.492814,0.837747
7,0.497900,0.487265,0.840636
8,0.448500,0.489189,0.84858
9,0.448500,0.48276,0.841117


Save the model to Google Drive first and then download it locally from Drive: downloading it directly from Colab takes too long, and Colab may disconnect during that time.

In [None]:
# Write the trained model to the 'gram_rb' directory (relative to the current working directory).
trainer.save_model('gram_rb')

In [None]:
!zip -r /content/gram_rb.zip /content/gram_rb

  adding: content/gram_rb/ (stored 0%)
  adding: content/gram_rb/training_args.bin (deflated 48%)
  adding: content/gram_rb/tokenizer.json (deflated 72%)
  adding: content/gram_rb/tokenizer_config.json (deflated 50%)
  adding: content/gram_rb/vocab.json (deflated 59%)
  adding: content/gram_rb/special_tokens_map.json (deflated 52%)
  adding: content/gram_rb/pytorch_model.bin (deflated 16%)
  adding: content/gram_rb/merges.txt (deflated 53%)
  adding: content/gram_rb/config.json (deflated 57%)


In [None]:
# Mount Google Drive at /content/drive so files can be copied to it from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil

# Archive produced by the zip cell, and the Drive folder it is copied into.
colab_link = '/content/gram_rb.zip'
gdrive_link = "/content/drive/MyDrive/models/"

# Create the destination folder first: shutil.copy fails if the target directory does not exist.
os.makedirs(gdrive_link, exist_ok=True)
shutil.copy(colab_link, gdrive_link)

'/content/drive/MyDrive/models/gram_rb.zip'