In [1]:
import os
import re
import shutil
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from IPython.display import clear_output
from IPython.utils import io
from torch import optim
from torch.nn import functional as F

with io.capture_output() as captured:
  !pip install transformers sentencepiece sentence_transformers

from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import SentenceEvaluator
from torch.utils.data import DataLoader
from datetime import datetime
import math

from transformers import AdamW, AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

with io.capture_output() as captured:
  !pip install lingtrain_aligner dateparser razdel

from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, helper, vis_helper, metrics

sns.set()

In [4]:
# Load the parallel corpus; later cells read its "from" and "to" columns.
df = pd.read_csv("train_texts.csv")

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [7]:
# Collect the parallel pairs as {"ru": <"from" value>, "ro": <"to" value>} records.
# Zipping the two columns walks the frame once and avoids iterrows(), which
# materialises a full Series for every row and leaves loop variables behind
# in the kernel namespace.
train_data = [
  {"ru": src, "ro": tgt}
  for src, tgt in zip(df["from"], df["to"])
]

In [8]:
# Wrap every pair as an InputExample whose texts are [ru, ro] and whose label is 1.
train_examples = [InputExample(texts=[x['ru'], x['ro']], label=1) for x in train_data]

In [9]:
# Base model to fine-tune. The dataset, the loss and model.fit() all need it,
# and no earlier cell creates it, so a fresh "Restart & Run All" would stop
# here with a NameError unless it is loaded at first use.
# NOTE(review): the checkpoint id is inferred from model_name = 'labse' in the
# training cell and from the saved module layout (Pooling/Dense/Normalize) —
# confirm this is the checkpoint the original run started from.
model = SentenceTransformer('sentence-transformers/LaBSE')

# Dataset/loader over the sentence pairs; batches of 6 are reshuffled every epoch.
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=6)

In [10]:
# Training objective, bound to the model that is being fine-tuned.
# NOTE(review): MultipleNegativesRankingLoss is understood to treat the other
# pairs of a batch as negatives, so the DataLoader batch size influences
# training — confirm against the sentence-transformers documentation.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

In [None]:
def remove_utf_tags(text):
    """Return ``text`` without byte-order marks and literal unicode-escape residue.

    Two clean-ups are applied, in this order:

    1. every U+FEFF (BOM / zero-width no-break space) character is dropped;
    2. every literal six-character sequence made of a backslash, ``u`` and four
       hexadecimal digits is dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; all other characters are left untouched.
    """
    without_bom = text.replace('\ufeff', '')
    escape_residue = re.compile(r'\\u[0-9a-fA-F]{4}')
    return escape_residue.sub('', without_bom)

In [11]:
# Concatenate every "from" entry into a single space-separated string.
text1 = df["from"].str.cat(sep=' ')

In [12]:
# Concatenate every "to" entry into a single space-separated string.
text2 = df["to"].str.cat(sep=' ')

In [18]:
class ChainScoreEvaluator(SentenceEvaluator):
  """Evaluate a model by the lingtrain chain score of a test alignment.

  On every call the two texts are split into sentences, written to a fresh
  alignment database, aligned with the model under evaluation and scored with
  ``metrics.chain_score`` (a coefficient of unbrokenness of the alignment).

  Args:
    db_path: Path of the alignment database file; it is deleted and rebuilt
      on every evaluation.
    lang_from: Language code of the first text.
    lang_to: Language code of the second text.
    text1: First text; split on newlines before sentence splitting.
    text2: Second text; split on newlines before sentence splitting.
    scores: List that receives one score per evaluation (mutated in place).
    model_name: Model name forwarded to ``aligner.align_db``.
    save_threshold: A new best score must exceed this value before the model
      is saved.
    best_model_path: Directory the best model is saved to.
  """

  def __init__(self, db_path, lang_from, lang_to, text1, text2, scores,
               model_name="labse", save_threshold=0.35,
               best_model_path='output/best_model'):
    # Harmless when the base class defines no __init__; otherwise it lets the
    # base class set up whatever state the training loop expects.
    super().__init__()
    self.db_path = db_path
    self.lang_from = lang_from
    self.lang_to = lang_to
    self.text1 = text1
    self.text2 = text2
    self.scores = scores
    self.model_name = model_name
    self.save_threshold = save_threshold
    self.best_model_path = best_model_path
    self.best_score = 0.0

  def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Align the two texts with ``model`` and return the resulting chain score.

    Args:
      model: Model under evaluation; handed to ``aligner.align_db``.
      output_path: Accepted for evaluator-interface compatibility; not used.
      epoch: Current epoch, only used for logging.
      steps: Current step, used for logging and to decide when to plot.

    Returns:
      The chain score of this evaluation. It is also appended to
      ``self.scores``.
    """
    # Everything below reads the values given to __init__ rather than
    # notebook globals, so the evaluator keeps working if those globals are
    # reassigned or were never defined.
    lines_from = self.text1.split('\n')
    lines_to = self.text2.split('\n')

    splitted_from = splitter.split_by_sentences_wrapper(lines_from, self.lang_from)
    splitted_to = splitter.split_by_sentences_wrapper(lines_to, self.lang_to)

    # Start from an empty database so earlier evaluations do not leak in.
    if os.path.isfile(self.db_path):
      os.unlink(self.db_path)
    aligner.fill_db(self.db_path, self.lang_from, self.lang_to, splitted_from, splitted_to)

    # Only the first batch (id 0) is aligned.
    batch_ids = range(0, 1)
    aligner.align_db(self.db_path,
                    self.model_name,
                    batch_size=200,
                    window=50,
                    batch_ids=batch_ids,
                    save_pic=False,
                    embed_batch_size=10,
                    normalize_embeddings=False,
                    show_progress_bar=False,
                    shift=0,
                    model=model
                    )

    print("\n--> epoch:", epoch, "steps:", steps)

    score1 = metrics.chain_score(self.db_path)

    if score1 > self.best_score:
      self.best_score = score1
      print("\nscore 1:", score1, "<-- new best score.")
      # Early, low scores are not worth a checkpoint on disk.
      if self.best_score > self.save_threshold:
        print("saving...")
        model.save(self.best_model_path)
        print("\nsaved")
    else:
      print("\nscore 1:", score1, "\n")

    self.scores.append(score1)

    vis_helper.visualize_alignment_by_db(self.db_path,
            output_path="alignment_vis.png",
            batch_size=200,
            size=(600,600),
            lang_name_from=self.lang_from,
            lang_name_to=self.lang_to,
            plt_show=True)

    # Refresh the score curve on every 100th step; calls with the default
    # steps=-1 never plot because -1 % 100 == 99.
    if steps%100==0:
      ax = sns.lineplot(data=self.scores)
      ax.set(xlabel='Step/100', ylabel='Chain Score')
      plt.show()

    return score1

# Evaluation settings handed to ChainScoreEvaluator when it is constructed.
scores = []  # receives one chain score per evaluation
lang_from = "ru"
# NOTE(review): the training pairs use the key "ro" for this side; confirm that
# "roma" is the code lingtrain's sentence splitter expects.
lang_to = "roma"
db_path = "alignment.db"

In [15]:
# Strip BOM characters and literal \uXXXX residue from the "from"-side text.
text1 = remove_utf_tags(text1)

In [16]:
# Strip BOM characters and literal \uXXXX residue from the "to"-side text.
text2 = remove_utf_tags(text2)

In [19]:
# Evaluator instance that is passed to model.fit() in the training cell.
evaluator = ChainScoreEvaluator(db_path, lang_from, lang_to, text1, text2, scores)

In [20]:
# Release cached, currently unused GPU memory held by PyTorch before training.
torch.cuda.empty_cache()

In [None]:
# Model identifier; keep in sync with the name the evaluator passes to aligner.align_db.
model_name = 'labse'
# NOTE(review): not read anywhere in this cell — the DataLoader is built with a
# literal batch_size=6. Keep the two values in sync or wire this one in.
train_batch_size = 6
model_save_path = 'output/labse_continue_training'
num_epochs = 8

# Warm up over 10% of all training steps (batches per epoch * epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * 0.1 * num_epochs)

# Fine-tune the model; the evaluator is requested every 100 training steps.
# NOTE(review): save_best_model/output_path and use_amp (mixed precision) follow
# sentence-transformers' fit() semantics — confirm for the installed version.
model.fit(train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=num_epochs,
        evaluation_steps=100,
        output_path=model_save_path,
        save_best_model=True,
        use_amp=True,
        warmup_steps=warmup_steps)

In [None]:
# Archive the best-model directory so it can be downloaded as one file.
# NOTE(review): assumes the notebook's working directory is /content (Colab).
!zip -r /content/file.zip /content/output/best_model

  adding: content/output/best_model/ (stored 0%)
  adding: content/output/best_model/config.json (deflated 53%)
  adding: content/output/best_model/tokenizer_config.json (deflated 46%)
  adding: content/output/best_model/2_Dense/ (stored 0%)
  adding: content/output/best_model/2_Dense/config.json (deflated 26%)
  adding: content/output/best_model/2_Dense/pytorch_model.bin (deflated 8%)
  adding: content/output/best_model/vocab.txt (deflated 43%)
  adding: content/output/best_model/tokenizer.json (deflated 65%)
  adding: content/output/best_model/pytorch_model.bin (deflated 7%)
  adding: content/output/best_model/modules.json (deflated 68%)
  adding: content/output/best_model/1_Pooling/ (stored 0%)
  adding: content/output/best_model/1_Pooling/config.json (deflated 49%)
  adding: content/output/best_model/README.md (deflated 54%)
  adding: content/output/best_model/sentence_bert_config.json (deflated 4%)
  adding: content/output/best_model/3_Normalize/ (stored 0%)
  adding: content/outp

In [None]:
# Colab-only: send the archive to the browser as a download.
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Colab-only: mount Google Drive so the results can be copied there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create the destination folder on Drive; -p makes this a no-op if it already exists.
!mkdir -p "/content/drive/My Drive/RuskaRomaLabse"

In [None]:
# Copy the whole output directory (every saved model) into the Drive folder.
!sudo cp -r "/content/output" "/content/drive/My Drive/RuskaRomaLabse"