In [None]:
# Clone repo, install requirements
!python --version
!git clone https://github.com/OpenNMT/OpenNMT-py.git
%cd OpenNMT-py
!python setup.py install
!pip install -r /content/OpenNMT-py/requirements.opt.txt

# Remove default data
import shutil
shutil.rmtree('data')

# Create new data directory
!mkdir data
!mkdir data/src_tgt
!mv src_tgt.yaml data

In [None]:
# Create corpus object

import pandas as pd
import os
import csv

class Corpus:
  """
  Represents a parallel text dataset.

  The corpus is read from a tab-separated text file without a header row;
  each line holds one sentence pair.

  Parameters
  ----------
  path : text
    Path to the tab-separated corpus file
  firstcol : text
    Name given to the first (left) column
  secondcol : text
    Name given to the second (right) column
  """

  def __init__(self, path, firstcol, secondcol):
    self.path = path
    self.firstcol = firstcol
    self.secondcol = secondcol

  def _read(self):
    """Load the corpus file into a DataFrame and strip all double quotes."""
    options = dict(engine='python',
                   sep='\t',
                   encoding='utf-8',
                   header=None,
                   names=[self.firstcol, self.secondcol],
                   quoting=csv.QUOTE_NONE)
    try:
      # pandas >= 1.3: malformed lines are reported as warnings and skipped
      frame = pd.read_csv(self.path, on_bad_lines='warn', **options)
    except TypeError:
      # Older pandas does not accept on_bad_lines; use the legacy flags instead
      frame = pd.read_csv(self.path,
                          warn_bad_lines=True,
                          error_bad_lines=False,
                          **options)
    return frame.replace('"', '', regex=True)

  @staticmethod
  def _write_lines(sentences, path):
    """Write one sentence per line to path; return the line count read back."""
    # Plain text output: to_csv would wrap sentences containing commas in quotes
    with open(path, 'w', encoding='utf-8') as f:
      for sentence in sentences:
        f.write(str(sentence) + '\n')
    with open(path, encoding='utf-8') as f:
      return sum(1 for _ in f)

  def _write_pair(self, frame, name, label, message):
    """Write the source and target files of one split and report their sizes."""
    src_path = os.path.join(self.outpath, 'src-%s.txt' % name)
    tgt_path = os.path.join(self.outpath, 'tgt-%s.txt' % name)
    src_len = self._write_lines(frame[self.src], src_path)
    tgt_len = self._write_lines(frame[self.tgt], tgt_path)

    assert src_len == tgt_len, message
    print('Source %s dataset created at %s (%d lines)' % (label, src_path, src_len))
    print('Target %s dataset created at %s (%d lines)' % (label, tgt_path, tgt_len))

  def validationsplit(self, src, tgt, outpath, val_size=5000, test_size=500, seed=None):
    """
    Split corpus into train, validation, and test data files.

    Rows with a missing value are dropped and the remaining rows are
    shuffled. The last `test_size` rows become the test set, the
    `val_size` rows before them the validation set, and everything else
    the training set. Six files are written to `outpath`:
    src-train.txt, tgt-train.txt, src-val.txt, tgt-val.txt, src-test.txt
    and tgt-test.txt, one sentence per line.

    Parameters
    ----------
    src : text
      Name of source language (left column in corpus)
    tgt : text
      Name of target language (right column in corpus)
    outpath : text
      Directory the split files are written to (created if missing)
    val_size : int
      Number of sentence pairs in the validation set (default 5000)
    test_size : int
      Number of sentence pairs in the test set (default 500)
    seed : int or None
      Seed for the shuffle; None (default) gives a different order each run

    Raises
    ------
    ValueError
      If a size is negative or the corpus is too small to leave any
      training rows.
    """
    self.src = src
    self.tgt = tgt
    self.outpath = outpath

    if val_size < 0 or test_size < 0:
      raise ValueError('val_size and test_size must be non-negative')

    # Initialize DataFrame: drop incomplete pairs, then shuffle
    complete = self._read().dropna()
    self.df = complete.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Positive boundaries avoid the empty-slice pitfall of iloc[:-0]
    total = len(self.df)
    train_end = total - val_size - test_size
    val_end = total - test_size
    if train_end <= 0:
      raise ValueError(
          'Corpus has %d usable lines; more than %d are needed for the '
          'validation and test sets' % (total, val_size + test_size))

    os.makedirs(outpath, exist_ok=True)

    # Generate train, validation and test datasets
    self._write_pair(self.df.iloc[:train_end], 'train', 'training',
                     "Train Datasets must be equal length")
    self._write_pair(self.df.iloc[train_end:val_end], 'val', 'validation',
                     "Validation Datasets must be equal length")
    self._write_pair(self.df.iloc[val_end:], 'test', 'test',
                     "Test Datasets must be equal length")

In [None]:
# Build the EN/SV corpus from en_sv.txt and write its train/validation/test files
data = Corpus(path='en_sv.txt', firstcol='EN', secondcol='SV')
data.validationsplit(
    src='EN',
    tgt='SV',
    outpath='/content/OpenNMT-py/data/src_tgt',
)

In [None]:
# Train BPE
!python tools/learn_bpe.py -i data/src_tgt/src-train.txt -o data/src_tgt/src.bpe -s 16000
print('Trained src.bpe')
!python tools/learn_bpe.py -i data/src_tgt/tgt-train.txt -o data/src_tgt/tgt.bpe -s 16000
print('Trained tgt.bpe')

# Apply BPE  
!python tools/apply_bpe.py -c data/src_tgt/src.bpe -i data/src_tgt/src-train.txt -o data/src_tgt/src-train-bpe.txt
print('BPE applied to src-train')
!python tools/apply_bpe.py -c data/src_tgt/src.bpe -i data/src_tgt/src-val.txt -o data/src_tgt/src-val-bpe.txt
print('BPE applied to src-val')
!python tools/apply_bpe.py -c data/src_tgt/src.bpe -i data/src_tgt/src-test.txt -o data/src_tgt/src-test-bpe.txt
print('BPE applied to src-test')
!python tools/apply_bpe.py -c data/src_tgt/tgt.bpe -i data/src_tgt/tgt-train.txt -o data/src_tgt/tgt-train-bpe.txt
print('BPE applied to tgt-train')
!python tools/apply_bpe.py -c data/src_tgt/tgt.bpe -i data/src_tgt/tgt-val.txt -o data/src_tgt/tgt-val-bpe.txt
print('BPE applied to tgt-val')

In [None]:
# View GPU information before training (output of nvidia-smi is shown in the cell)
!nvidia-smi

In [None]:
# Generate vocab files using the settings in data/src_tgt.yaml
# NOTE(review): -n_sample 50000 presumably limits how many lines are sampled to build
# the vocabulary -- confirm against the onmt_build_vocab documentation
!onmt_build_vocab -config data/src_tgt.yaml -n_sample 50000

In [None]:
# Load tensorboard
%load_ext tensorboard
%tensorboard --logdir runs

# Create train directory and begin training
!mkdir data/src_tgt/train
!onmt_train -config data/src_tgt.yaml -tensorboard -tensorboard_log_dir runs

In [None]:
# Run inference on test data, based on best performing model checkpoint -- here I use model_step_200000.pt for demonstration
!onmt_translate -model data/src_tgt/train/model_step_200000.pt -src data/src_tgt/src-test.txt -output data/src_tgt/pred.txt -gpu 0 -replace_unk -verbose

# Detokenize byte-pair-encoded output
!sed -i 's/@@ //g' data/src_tgt/pred.txt

In [None]:
# Evaluate model performance using multi-bleu metric:
# tgt-test.txt is passed as the reference, pred.txt is fed on stdin as the hypothesis
!perl tools/multi-bleu.perl data/src_tgt/tgt-test.txt < data/src_tgt/pred.txt

In [None]:
# Create model directory
!mkdir /OpenNMT-py/data/src_tgt/models/en_sv

In [None]:
# Copy src.bpe into the model directory (cp keeps the original file in data/src_tgt)
!cp /OpenNMT-py/data/src_tgt/src.bpe /OpenNMT-py/data/src_tgt/models/en_sv/

In [None]:
# Install dependencies for model conversion:
# pip itself is upgraded first, then ctranslate2, then torch 1.6.0 / torchvision 0.7.0
# (looked up via the PyTorch stable wheel index given with -f), then OpenNMT-py
!pip install --upgrade pip
!pip install ctranslate2
!pip install torch===1.6.0 torchvision===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
!pip install OpenNMT-py

In [None]:
# Imports

# For model conversion
import ctranslate2

# For attention visualization
import seaborn as sns
import pandas as pd

# For tokenization
import codecs
from subword_nmt.apply_bpe import BPE

In [None]:
# Turn the saved OpenNMT-py checkpoint into a CTranslate2 model directory
# (model_step_200000.pt is used here for demonstration)
converter = ctranslate2.converters.OpenNMTPyConverter(
    "/OpenNMT-py/data/src_tgt/train/model_step_200000.pt"
)
output_dir = converter.convert(
    '/OpenNMT-py/models/en_sv',
    model_spec="TransformerBase",
    force=True,
    quantization=None,
)

In [None]:
# Select translation and tokenization models

def init_model(translator_path,bpe_path):
  """
  Load a translation model and its BPE tokenizer.

  Parameters
  ----------
  translator_path : text
    Path passed to ctranslate2.Translator
  bpe_path : text
    Path to the BPE codes file (read as UTF-8)

  Returns
  -------
  tuple
    (translator, bpe): the ctranslate2.Translator and the subword_nmt BPE object
  """

  # Initialize Translator
  translator = ctranslate2.Translator(translator_path)

  # Initialize BPE model. The codes file is opened with an explicit encoding and
  # closed once the BPE object has been constructed, instead of leaking the handle.
  with codecs.open(bpe_path, encoding='utf-8') as codes:
    bpe = BPE(codes)

  return translator, bpe

In [None]:
# Load the translator and BPE tokenizer used by the interactive cells below.
# The BPE path must be a string literal; unquoted it is evaluated as an arithmetic
# expression on undefined names.
# NOTE(review): the translator path points at an en_ru model on Drive while the BPE
# codes come from this notebook's data directory -- confirm both paths for your setup.
translator, bpe = init_model('/content/drive/My Drive/translation/models/russian/en_ru/',
                             'OpenNMT-py/data/src_tgt/src.bpe')

In [None]:
# Run inference on cell input and return prediction with heatmap visualization of attention vectors

def translate(src=None):
  """
  Translate one sentence, print the result and draw an attention heatmap.

  Uses the module-level `translator` and `bpe` objects.

  Parameters
  ----------
  src : text or None
    Sentence to translate. When None (default) the sentence is read
    interactively with input().
  """
  if src is None:
    src = input('Enter input: ')
  src_bpe = bpe.segment(src).strip()

  src_tokens = src_bpe.split(' ')
  args = translator.translate_batch([src_tokens],return_attention=True)
  # Subword tokens of the first hypothesis for the first (only) batch entry
  tgt_tokens = args[0][0]['tokens']
  result = ' '.join(tgt_tokens).replace('@@ ','')

  print('\n')
  print(result)

  attention_vectors = args[0][0]['attention']
  # Label the axes with the subword tokens on both sides. The attention matrix is
  # expected to hold one row per generated token, so labels taken from the
  # detokenized sentence would not line up with the rows when a word is split.
  sns.heatmap(attention_vectors,
              xticklabels=src_tokens,
              yticklabels=tgt_tokens,
              cmap='Blues')

In [None]:
# Driver: run the interactive translate() defined in this notebook

translate()

ValueError: 'seagreen' is not a valid value for name; supported values are 'Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'flare', 'flare_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', '...