In [1]:
!pip install transformers
!pip install seqeval
!pip install sentencepiece
import sentencepiece as spm

from seqeval.metrics import precision_score as seq_precision, recall_score as seq_recall, f1_score as seq_f1
from transformers import AutoTokenizer, XLMRobertaModel, XLMRobertaForMaskedLM
import json
import logging
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import os
import torch
import numpy as np
import torch.nn as nn 
from torch.nn import functional as F
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from easydict import EasyDict
import gc
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from torch.optim import Adam
import pickle
import re

logger = logging.getLogger(__name__)

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 8.8MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 28.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=00d1b45

# Model

In [2]:

def num_parameters(parameters):
    """Count the total number of scalar values in a collection of tensors.

    Args:
        parameters: Iterable of tensors, e.g. the result of
            ``model.parameters()``.

    Returns:
        int: Sum of ``numel()`` over every tensor; ``0`` for an empty iterable.
    """
    # len(tensor) only reports the size of the first dimension (and raises on
    # 0-dim tensors), so it under-counts every multi-dimensional weight.
    # numel() gives the true element count.
    return sum(p.numel() for p in parameters)
class Detector(nn.Module):
    """Per-token scorer: embedding -> bidirectional LSTM -> linear -> sigmoid.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        output_dim: Number of sigmoid outputs produced for each token.
        embedding_dim: Width of each token embedding.
        num_layers: Number of stacked LSTM layers.
        hidden_size: Hidden width of the LSTM in each direction.
    """

    def __init__(self, input_dim,output_dim,  embedding_dim, num_layers, hidden_size):
        super().__init__()

        # Keep the hyper-parameters on the instance.
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # Sub-modules are created in a fixed order (embedding, LSTM, linear)
        # so seeded initialisation stays reproducible.
        self.embedding = nn.Embedding(
            num_embeddings=self.input_dim,
            embedding_dim=self.embedding_dim,
        )
        self.LSTM = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=0.1,
            bidirectional=True,
        )
        # The bidirectional LSTM concatenates both directions, hence 2 * hidden_size.
        self.linear = nn.Linear(2 * self.hidden_size, self.output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score every token of a batch of id sequences.

        Args:
            x: Integer tensor of token ids laid out batch-first, i.e.
                ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, output_dim)`` with values in (0, 1).
        """
        embedded = self.embedding(x)
        # Only the per-step outputs are needed; the final (h, c) state is dropped.
        sequence_states, _ = self.LSTM(embedded)
        return self.sigmoid(self.linear(sequence_states))


class HardMasked(nn.Module):
    """Two-stage text corrector: a detector flags suspicious tokens, which are
    replaced by ``<mask>`` and then re-predicted by a masked language model.

    Args:
        detector: Module mapping a ``(1, seq_len)`` tensor of token ids to
            per-token probabilities; values above 0.5 mark a token for masking.
        MaskedLM: Masked language model called with ``input_ids`` and
            ``attention_mask``; its output must expose ``['logits']``.
        detector_tokenizer: Tokenizer providing ``encode(s, out_type=int)``,
            ``id_to_piece(ids)`` and ``decode(pieces)``.
        maskedlm_tokenzier: Tokenizer for ``MaskedLM``; callable on a string and
            providing ``decode(ids)``. The misspelled parameter name is kept on
            purpose so existing keyword callers keep working.
        device: Device on which both models are placed and inputs are created.
    """

    def __init__(self, detector, MaskedLM, detector_tokenizer, maskedlm_tokenzier,device ):
        super(HardMasked, self).__init__()

        self.detector = detector.to(device)
        self.MaskedLM = MaskedLM.to(device)
        self.detector_tokenizer = detector_tokenizer
        # Bind the constructor argument. The previous code read a same-named
        # notebook global here and silently ignored what the caller passed in.
        self.maskedlm_tokenizer = maskedlm_tokenzier
        self.use_device = device

        # This wrapper only performs inference (forward returns a decoded
        # string), so switch both models to eval mode; otherwise dropout in the
        # detector's LSTM can make the masking decision non-deterministic.
        self.eval()

    def forward(self, s):
        """Correct a single sentence.

        Args:
            s: Raw input sentence.

        Returns:
            str: Decoded arg-max prediction of the masked LM for every position
            of the masked sentence (all positions are re-predicted, not only
            the masked ones).
        """
        maskedlm_features = self.prepare_input(s)
        input_ids = torch.tensor([maskedlm_features['input_ids']],
                                 dtype = torch.long, device = self.use_device)
        attention_mask = torch.tensor([maskedlm_features['attention_mask']],
                                      dtype = torch.long, device = self.use_device)

        # Use the models/tokenizers owned by this instance (not notebook
        # globals) and skip autograd bookkeeping: nothing is trained here.
        with torch.no_grad():
            outputs = self.MaskedLM(input_ids = input_ids, attention_mask = attention_mask)

        logits = outputs['logits'][0]
        output_ids = torch.argmax(logits, dim = -1)
        return self.maskedlm_tokenizer.decode(output_ids)

    def prepare_input(self, s):
        """Mask the tokens flagged by the detector and tokenize for the masked LM.

        Args:
            s: Raw input sentence.

        Returns:
            The result of calling the masked-LM tokenizer on the masked
            sentence (``forward`` reads its ``'input_ids'`` and
            ``'attention_mask'`` entries).
        """
        detector_input_ids = self.detector_tokenizer.encode(s, out_type = int)
        detector_input_pieces = self.detector_tokenizer.id_to_piece(detector_input_ids)

        if detector_input_ids:
            with torch.no_grad():
                probabilities = self.detector(
                    torch.tensor([detector_input_ids], dtype = torch.long, device = self.use_device))
            # Flatten to one flag per token.
            # NOTE(review): this assumes the detector emits exactly one
            # probability per token (output_dim == 1) -- confirm.
            flagged = (probabilities[0].reshape(-1) > 0.5).tolist()
        else:
            # Empty input: nothing to score, and a zero-length sequence should
            # not be pushed through the detector.
            flagged = []

        # NOTE(review): the scan starts at index 1, so the first piece is never
        # masked. Kept as in the original; confirm this is intentional.
        for i in range(1, len(detector_input_pieces)):
            if flagged[i]:
                detector_input_pieces[i] = ' <mask>'

        masked_s = self.detector_tokenizer.decode(detector_input_pieces)
        # Collapse any run of whitespace-separated masks into a single mask in
        # one pass. The previous five-pass pairwise merge left runs longer than
        # 32 masks only partially collapsed.
        masked_s = re.sub(r'<mask>(?:\s<mask>)+', '<mask>', masked_s)

        return self.maskedlm_tokenizer(masked_s)

        
            





In [6]:
# Load detector and XLM-R masked language model to create Hard-Masked XLM-R
# NOTE: the two paths below are hard-coded Google Drive locations; adjust them
# when running outside the original Colab environment.
detector_path = '/content/drive/MyDrive/nlp_projects/Text_correction/all_data/Detector.pkl'

MaskedLM = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base')

maskedlm_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

detector_tokenizer_path = '/content/drive/MyDrive/nlp_projects/Text_correction/spm_tokenizer.model'

detector_tokenizer = spm.SentencePieceProcessor(detector_tokenizer_path, )

# SECURITY: torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints from a trusted source.
# The loaded object is used directly as a module further down, so the Detector
# class must already be defined when this cell runs.
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
# machine; HardMasked moves the model to the target device afterwards.
detector = torch.load(detector_path, map_location = 'cpu')

In [7]:
# Build the pipeline on the GPU when one is available, otherwise fall back to
# the CPU instead of crashing on a hard-coded 'cuda' device.
model = HardMasked(detector, MaskedLM, detector_tokenizer, maskedlm_tokenizer,
                   'cuda' if torch.cuda.is_available() else 'cpu')

In [31]:
# Sample Vietnamese sentence containing the informal spellings 'iu' and 'mk'
# (the recorded output below shows them rewritten as 'yêu' and 'mình').
s = 'Tôi vẫn luôn iu cô ấy với hết tấm lòng của mk'

In [32]:
# Run the Hard-Masked pipeline on the sample sentence; the returned string is
# displayed as the cell output.
model(s)

'<s> Tôi vẫn luôn yêu cô ấy với hết tấm lòng của mình</s>'