<a href="https://colab.research.google.com/github/iyves/ru_col_suggest/blob/master/train_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Colab notebook walks through the process of training static and dynamic word embedding models for the task of erroneous collocation correction. The models are trained on text from the CAT and cybercat corpora, with the text of each corpus stored in .txt files. Each file should consist of sentences delimited by newlines.

Training data can be in raw tokens or lemmatized tokens, and may contain PoS tags (i.e. lemma_N). Additionally, lemmatization can be done using MyStem, UDPipe, or TreeTagger.

**Note:** Run this colab with GPUs and High RAM. During training, do not leave this page and every hour or so click on a cell to ensure that the session remains active and doesn't prematurely disconnect.

In [None]:
# This code assumes that the training data is stored in gdrive at `./drive/MyDrive/Training_Data/`
# Mount Google Drive at `/content/drive/` so the cells below can read the
# training data and write the trained models.
from google.colab import drive
drive.mount('/content/drive/')

## Training of static word embeddings (w2v, fastText, GloVe)

In [None]:
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import FastText, Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

import tempfile
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load the training data
# `Path` is imported here because no earlier cell imports it; without this the
# cell fails with a NameError when the notebook is run from top to bottom.
from pathlib import Path

# One sentence per line in every file. Exactly one group of entries should be
# active at a time; every entry ends with a comma so groups can be
# (un)commented freely without producing a syntax error.
paths = [
  # Lemmatized w/ treetagger
  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_treetagger_lemma_1.txt')),
  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_treetagger_lemma_2.txt')),
  str(Path('./drive/MyDrive/Training_Data/cybercat_sentences_full_treetagger_lemma.txt')),

  # Lemmatized w/ UDPipe
  #  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_lemma_1.txt')),
  #  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_lemma_2.txt')),
  #  str(Path('./drive/MyDrive/Training_Data/cybercat_sentences_full_lemma.txt')),
]

class Corpus:
    '''Restartable stream of tokenized sentences read from the files in `paths`.

    Implemented as a class with `__iter__` (rather than a one-shot generator)
    so that the corpus can be iterated from the beginning as many times as the
    consumer needs.
    '''
    def __iter__(self):
        for data_path in paths:
            # `with` closes each file as soon as it has been consumed, or when
            # the iteration is abandoned, instead of leaking a handle per pass.
            with open(data_path, "r", encoding="utf-8") as data_file:
                for line in data_file:
                    # One line holds one sentence; yield it as a list of tokens.
                    yield utils.simple_preprocess(line)

sentences = Corpus()

# For evaluating the quality of the word2vec model during training after each epoch
# https://datascience.stackexchange.com/questions/9819/number-of-epochs-in-gensim-word2vec-implementation
class LossLogger(CallbackAny2Vec):
    '''Print and record the training loss of each epoch.

    gensim's `get_latest_training_loss()` returns a running total that keeps
    growing over the epochs of a training run, so the loss of a single epoch
    is the difference between two consecutive readings.

    Attributes:
        epoch: 1-based number of the epoch currently being trained.
        losses: loss accumulated during each individual epoch.
        cumulative_losses: the raw running totals reported by gensim.
    '''
    def __init__(self):
        self.epoch = 1
        self.losses = []
        self.cumulative_losses = []
        self._previous_total = 0.0

    def on_epoch_begin(self, model):
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        total = model.get_latest_training_loss()
        # A total smaller than the previous reading means the counter was
        # restarted (e.g. the logger is reused for a new training run).
        if total < self._previous_total:
            self._previous_total = 0.0
        epoch_loss = total - self._previous_total
        self._previous_total = total

        self.cumulative_losses.append(total)
        self.losses.append(epoch_loss)
        print(f'  Loss: {epoch_loss}')
        self.epoch += 1

In [None]:
# Train w2v embeddings
import os

CONTEXT_WINDOW = 5 # 5, 10
MIN_COUNT = 5
EPOCHS = 30
SIZE = 500 # 200, 300, 500
W2V_MODEL_PATH = './drive/MyDrive/models/lemma/w2v/w2v_treetagger.model'

# Create the output directory before training starts, so that a missing
# folder cannot make the save fail after the whole training run has finished.
os.makedirs(os.path.dirname(W2V_MODEL_PATH), exist_ok=True)

# The logger prints the loss after every epoch and keeps the values for inspection.
w2v_loss_logger = LossLogger()
w2v_model = Word2Vec(sentences=sentences, size=SIZE, window=CONTEXT_WINDOW, min_count=MIN_COUNT, workers=10, iter=EPOCHS,
                     callbacks=[w2v_loss_logger], compute_loss=True,)
w2v_model.save(W2V_MODEL_PATH)
print(w2v_loss_logger.losses)

In [None]:
# Train fastText embeddings
import os

CONTEXT_WINDOW = 5 # 5, 10
MIN_COUNT = 5
EPOCHS = 30
SIZE = 500 # 200, 300, 500
FASTTEXT_MODEL_PATH = './drive/MyDrive/models/lemma/fastText/fastText_treetagger.model'

# Create the output directory before training starts, so that a missing
# folder cannot make the save fail after the whole training run has finished.
os.makedirs(os.path.dirname(FASTTEXT_MODEL_PATH), exist_ok=True)

#note: the LossLogger doesn't work for fastText at this moment in time
fastText_model = FastText(sentences=sentences, size=SIZE, window=CONTEXT_WINDOW, min_count=MIN_COUNT, workers=10, iter=EPOCHS)
fastText_model.save(FASTTEXT_MODEL_PATH)

In [None]:
# Train GloVe embeddings
# https://stackoverflow.com/questions/48962171/how-to-train-glove-algorithm-on-my-own-corpus
import os

# Imported under an alias so that it does not replace the notebook's own
# `Corpus` class (the sentence stream defined for the gensim models).
from glove import Corpus as GloveCorpus, Glove

CONTEXT_WINDOW = 5 # 5, 10
# MIN_COUNT = 5 note: in future, figure out how to set min_count for GloVe via this library
EPOCHS = 30
SIZE = 500 # 200, 300, 500
GLOVE_MODEL_PATH = './drive/MyDrive/models/lemma/glove/glove_treetagger.model'

# Create the output directory before training starts, so that a missing
# folder cannot make the save fail after the whole training run has finished.
os.makedirs(os.path.dirname(GLOVE_MODEL_PATH), exist_ok=True)

# Fit the corpus to build the co-occurrence matrix that GloVe is trained on
corpus = GloveCorpus()
corpus.fit(sentences, window=CONTEXT_WINDOW)

glove = Glove(no_components=SIZE, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=EPOCHS, no_threads=4, verbose=True)
# Attach the word -> id mapping so the vectors can be looked up by word
glove.add_dictionary(corpus.dictionary)
glove.save(GLOVE_MODEL_PATH)

In [None]:
# Convert GloVe embeddings into w2v format for use with the Gensim library
# https://edumunozsala.github.io/BlogEms/jupyter/nlp/classification/embeddings/python/2020/08/15/Intro_NLP_WordEmbeddings_Classification.html#Word-Embeddings,-GloVe-and-Text-classification
from glove import Glove
from gensim.scripts.glove2word2vec import glove2word2vec

glove_filename = "./drive/MyDrive/models/lemma/glove/glove_treetagger.model"
glove_text_file = glove_filename+'.txt'
word2vec_output_file = glove_filename+'.word2vec'

# `Glove.save()` writes a pickle, whereas `glove2word2vec` expects the plain-text
# GloVe format (one `word v1 v2 ... vN` line per word). Export the trained
# vectors to that text format first, then let gensim add the word2vec header.
glove_model = Glove.load(glove_filename)
with open(glove_text_file, "w", encoding="utf-8") as text_file:
    for word, index in glove_model.dictionary.items():
        vector = " ".join(f"{value:.8f}" for value in glove_model.word_vectors[index])
        text_file.write(f"{word} {vector}\n")

glove2word2vec(glove_text_file, word2vec_output_file)

## Training of dynamic word embeddings (RoBERTa)

In [None]:
# Remove TensorFlow from the runtime
# NOTE(review): presumably so that transformers runs on PyTorch only — confirm.
!pip uninstall -y tensorflow
# Install the current development version of transformers straight from GitHub
# (unpinned, so the installed version changes over time).
!pip install git+https://github.com/huggingface/transformers
# Show which transformers / tokenizers versions ended up installed
!pip list | grep -E 'transformers|tokenizers'

In [None]:
# Train a tokenizer on the corpus
# This step is necessary for training from scratch or if the pretrained model doesn't have a tokenizer

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer


# Every entry ends with a comma so that the groups below can be (un)commented
# freely without producing a syntax error.
paths = [
  # Lemmatized w/ treetagger
  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_treetagger_lemma_1.txt')),
  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_treetagger_lemma_2.txt')),
  str(Path('./drive/MyDrive/Training_Data/cybercat_sentences_full_treetagger_lemma.txt')),

  # Lemmatized w/ UDPipe
  #  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_lemma_1.txt')),
  #  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_lemma_2.txt')),
  #  str(Path('./drive/MyDrive/Training_Data/cybercat_sentences_full_lemma.txt')),
]

# Where the tokenizer files are written; pick the directory matching `paths`:
# ./drive/MyDrive/models/lemma/RuBERT_treetagger_lemma
# ./drive/MyDrive/models/lemma/RuBERT_udpipe_lemma
tokenizer_dir = "./drive/MyDrive/models/lemma/RuBERT_treetagger_lemma"

# Create the output directory before training, so that a missing folder
# cannot make the save fail once the tokenizer has been trained.
Path(tokenizer_dir).mkdir(parents=True, exist_ok=True)

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save_model(tokenizer_dir)

In [None]:
# Prepare the RoBERTa model for training
## Option A: learn a RoBERTa base model from scratch
import torch
from pathlib import Path
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, RobertaConfig


# Directory holding the tokenizer trained above; the model is saved alongside it.
# ./drive/MyDrive/models/lemma/RuBERT_treetagger_lemma
# ./drive/MyDrive/models/lemma/RuBERT_udpipe_lemma
model_input = save_dir = "./drive/MyDrive/models/lemma/RuBERT_treetagger_lemma"

# Load the tokenizer that was trained on the corpus
tokenizer = RobertaTokenizerFast.from_pretrained(model_input, max_len=512)

# Architecture of the freshly initialized model; `vocab_size` matches the
# vocabulary size the tokenizer was trained with.
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Build the masked-language model from the config alone (no pretrained weights)
model = RobertaForMaskedLM(config=config)

In [None]:
# Prepare the RoBERTa model for training
## Option B: fine-tune a pre-trained model
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig

# Available (model_input -> save_dir) pairs:
#   DeepPavlov/rubert-base-cased-sentence -> ./drive/MyDrive/models/token/RuBERT_deeppavlov
#   sberbank-ai/sbert_large_nlu_ru        -> ./drive/MyDrive/models/token/RuBERT_sberbank
model_input = "DeepPavlov/rubert-base-cased-sentence"
save_dir = "./drive/MyDrive/models/token/RuBERT_deeppavlov"

# Fetch the pretrained tokenizer, config and masked-LM weights for the chosen model
tokenizer = AutoTokenizer.from_pretrained(model_input)
config = AutoConfig.from_pretrained(model_input)
model = AutoModelForMaskedLM.from_pretrained(model_input)

In [None]:
%%time
from transformers import LineByLineTextDataset # for loading in dataset
from transformers import DataCollatorForLanguageModeling # for batching
from transformers import Trainer, TrainingArguments

epochs = 5

# Load the training data
paths = [  
  # Lemmatized w/ treetagger
  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_treetagger_lemma_1.txt')),
  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_treetagger_lemma_2.txt')),
  str(Path('./drive/MyDrive/Training_Data/cybercat_sentences_full_treetagger_lemma.txt'))

  # Lemmatized w/ UDPipe
  #  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_lemma_1.txt')),
  #  str(Path('./drive/MyDrive/Training_Data/CAT_sentences_full_lemma_2.txt')),
  #  str(Path('./drive/MyDrive/Training_Data/cybercat_sentences_full_lemma.txt'))
]

for e in range(1, epochs+1) {
    curEpoch = f'Epoch[{e}/{epochs}] -'
    for file in paths {
        # At this moment, this class does not allow for loading multiple files at one :/
        print(curEpoch, f"Loading `{file}`")
        dataset = LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=file,
            block_size=128,
        )
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15
        )
        print(curEpoch, f"Finished loading `{file}`")

        training_args = TrainingArguments(
          output_dir=save_dir,
          overwrite_output_dir=True,
          num_train_epochs=1,
          per_gpu_train_batch_size=32, # Restart runtime & modify this if GPU crashes from low memory: 32,16,8,4,1
          per_device_train_batch_size=32, # Restart runtime & modify this if GPU crashes from low memory: 32,16,8,4,1
          save_steps=10_000,
          save_total_limit=2,
      )

      # Make sure we are using the most recent model. I don't know if this is necessary; I know know if 'model' updates as it trains
      if e != 0 {
        model = AutoModelForMaskedLM.from_pretrained(save_dir)
      }

      trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=dataset
      )

      trainer.train()

      print(curEpoch, f"Saving model at `{save_dir}`")
      trainer.save_model(save_dir)
    }
}
print("Finished training!")