In [2]:
!pip install numpy requests nlpaug



In [3]:
!pip install nltk>=3.4.5

In [4]:
pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [5]:
from nlpaug.util.file.download import DownloadUtil
DownloadUtil.download_glove(model_name='glove.840B.300d', dest_dir='.') # Download GloVe model
!pip install gensim>=4.1.2

In [6]:
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install pyskiplist
!pip install fitlog

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip (553 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.6/553.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: kenlm
  Building wheel for kenlm (pyproject.toml) ... [?25ldone
[?25h  Created wheel for kenlm: filename=kenlm-0.2.0-cp310-cp310-linux_x86_64.whl size=592712 sha256=a986a7bc811a1b4a66a3c67809971219b94fd5cff7e25d571317270b306ce525
  Stored in directory: /tmp/pip-ephem-wheel-cache-tiz0wi5w/wheels/a5/73/ee/670fbd0cee8f6f0b21d10987cb042291e662e26e1a07026462
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.2.0
Collecting pyskiplist
  Downloading pyskiplist-1.0.0-py2.py

In [7]:
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu
from pyskiplist import SkipList
import fitlog
import kenlm
import statistics

In [8]:
import os
import sys
import time
import argparse

import torch
import torch.nn as nn
from torch import cuda
import torch.nn.functional as F
from transformers import T5Tokenizer

sys.path.append(".")

In [9]:
def collate_fn(insts, pad_token_id=1):
    ''' Pad the instance to the max seq length in batch '''

    max_len = max(len(inst) for inst in insts)
    max_len = max_len if max_len > 4 else 5

    batch_seq = np.array([
        inst + [pad_token_id] * (max_len - len(inst))
        for inst in insts])
    batch_seq = torch.LongTensor(batch_seq)

    return batch_seq

In [10]:
class CNNDataset(torch.utils.data.Dataset):
    def __init__(self, insts, label):
        self.insts = insts
        self.label = label

    def __getitem__(self, index):
        return self.insts[index], self.label[index]

    def __len__(self):
        return len(self.insts)

In [11]:
#to evaluate the style classifier 
def evaluate_sc(model, valid_loader, loss_fn, epoch):
    '''Evaluation function for style classifier'''
    model.eval()
    total_acc = 0.
    total_num = 0.
    total_loss = 0.
    with torch.no_grad():
        for batch in valid_loader:
            x_batch, y_batch = map(lambda x: x.to(device), batch)
            logits = model(x_batch)
            total_loss += loss_fn(logits, y_batch)
            _, y_hat = torch.max(logits,dim=-1)
            same = [float(p == q) for p, q in zip(y_batch, y_hat)]
#             print("same for evaluation:",same)
            total_acc += sum(same)
            total_num += len(y_batch)
    model.train()
    print('[Info] Epoch {:02d}-valid: {}'.format(
                epoch, 'acc {:.4f}% | loss {:.4f}').format(
        total_acc / total_num * 100, total_loss / total_num))

    return total_acc / total_num, total_loss / total_num

In [12]:
# to iterate over the style classifier
def SCIterator(insts_0, insts_1, opt, pad_token_id=1, shuffle=True):
    '''Data iterator for style classifier'''

    def cls_fn(insts):
        insts, labels = list(zip(*insts))
        seq = collate_fn(insts, pad_token_id)
        labels = torch.LongTensor(labels)
        return (seq, labels)

    num = len(insts_0) + len(insts_1)
    loader = torch.utils.data.DataLoader(
        CNNDataset(
            insts=insts_0 + insts_1,
            label=[0 if i < len(insts_0)
                   else 1 for i in range(num)]),
        shuffle=shuffle,
        num_workers=2,
        collate_fn=cls_fn,
        batch_size=opt.batch_size)

    return loader

In [13]:

def load_embedding(tokenizer, embed_dim, embed_path=None):
    '''Parse an embedding text file into an array.'''
    sys.path.append(".")
    embedding = np.random.normal(scale=embed_dim ** -0.5,
                                 size=(len(tokenizer), embed_dim))
    if embed_path is None:
        return embedding

    print('[Info] Loading embedding')
    embed_dict = {}
    with open(embed_path, 'r') as file:
        for i, line in enumerate(file):
            if i == 0:
                continue
            tokens = line.rstrip().split()
            try:
                embed_dict[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
            except:
                continue

    for i in range(len(tokenizer)):
        try:
            word = tokenizer.decode(i)
            if word in embed_dict:
                embedding[i] = embed_dict[word]
        except:
            print(i)

    return embedding

In [14]:
import math
import numpy as np


class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, lr, decay_step = 1000, 
                       decay_rate=0.9, steps=0):
        self.init_lr = lr
        self.steps = steps
        self._optimizer = optimizer
        self.decay_rate = decay_rate
        self.decay_step = decay_step

    def step(self):
        '''Step with the inner optimizer'''
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''
        self.steps += 1
        if self.steps >= self.decay_step:
            lr = self.init_lr * math.pow(self.decay_rate, 
                                         int(self.steps / self.decay_step))
            for param_group in self._optimizer.param_groups:
                param_group['lr'] = lr
        else:
            for param_group in self._optimizer.param_groups:
                param_group['lr'] = self.init_lr

In [15]:
sys.path.append(".")

filter_sizes = [1, 2, 3, 4, 5]
num_filters = [128, 128, 128, 128, 128]
device = 'cuda' if cuda.is_available() else 'cpu'
special_tokens = [{'bos_token': '<bos>'},
                  {'eos_token': '<eos>'}, {'sep_token': '<sep>'},
                  {'pad_token': '<pad>'}, {'unk_token': '<unk>'}]


class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embed_dim, embeding):
        super(EmbeddingLayer, self).__init__()
        self.embeding = nn.Embedding(vocab_size, embed_dim)
        if embeding is not None:
            self.embeding.weight.data = torch.FloatTensor(embeding)

    def forward(self, x):
        if len(x.size()) == 2:
            y = self.embeding(x)
        else:
            y = torch.matmul(x, self.embeding.weight)
        return y

In [16]:
class TextCNN(nn.Module):
    '''A style classifier TextCNN'''

    def __init__(self, embed_dim, vocab_size, filter_sizes, 
                 num_filters, embedding=None, dropout=0.0):
        super(TextCNN, self).__init__()

        self.feature_dim = sum(num_filters)
        self.embeder = EmbeddingLayer(vocab_size, embed_dim, embedding)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, n, (f, embed_dim))
            for (n, f) in zip(num_filters, filter_sizes)
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Sequential(
            self.dropout,
            nn.Linear(self.feature_dim, int(self.feature_dim / 2)), nn.ReLU(),
            nn.Linear(int(self.feature_dim / 2), 2)
        )
    def forward(self, inp):
        inp = self.embeder(inp).unsqueeze(1)
        convs = [F.relu(conv(inp)).squeeze(3) for conv in self.convs]
        pools = [F.max_pool1d(conv, conv.size(2)).squeeze(2) for conv in convs]
        out = torch.cat(pools, 1)
        logit = self.fc(out)

        return logit

    def build_embeder(self, vocab_size, embed_dim, embedding=None):
        embeder = nn.Embedding(vocab_size, embed_dim)
        nn.init.normal_(embeder.weight, mean=0, std=embed_dim ** -0.5)
        if embedding is not None:
            embeder.weight.data = torch.FloatTensor(embedding)

        return embeder

In [17]:
parser = argparse.ArgumentParser('Style Classifier TextCNN')
parser.add_argument('-lr', default=1e-3, type=float, help='learning rate')
parser.add_argument('-dataset', default='em', type=str, help='the name of dataset')
parser.add_argument('-embed_dim', default=300, type=int, help='the embedding size')
parser.add_argument('-seed', default=42, type=int, help='pseudo random number seed')
parser.add_argument('-min_count', default=0, type=int, help='minmum number of corpus')
parser.add_argument("-dropout", default=0.5, type=float, help="Keep prob in dropout.")
parser.add_argument('-max_len', default=50, type=int, help='maximum tokens in a batch')
parser.add_argument('-log_step', default=100, type=int, help='print log every x steps')
parser.add_argument('-eval_step', default=1000, type=int, help='early stopping training')
parser.add_argument('-batch_size', default=32, type=int, help='maximum sents in a batch')
parser.add_argument('-epoch', default=50, type=int, help='force stop at specified epoch')

opt = parser.parse_args(['-lr', '0.001', '-dataset', 'em', '-embed_dim', '300', '-seed', '42', '-min_count', '0', '-dropout', '0.5', '-max_len', '50', '-log_step', '100', '-eval_step', '1000', '-batch_size', '32', '-epoch', '50'])
torch.manual_seed(opt.seed)

tokenizer = T5Tokenizer.from_pretrained('t5-base')

train_src, train_tgt, valid_src, valid_tgt = [], [], [], []
train_dict={}
with open('/kaggle/input/dataproject/data/{}/train/informal'.format(opt.dataset),'r') as f:
    for line in f.readlines():
        t=tokenizer.encode(line.strip())[:opt.max_len]
        train_dict[line]=t
        train_src.append(t)
with open('/kaggle/input/dataproject/data/{}/train/formal'.format(opt.dataset),'r') as f:
    for line in f.readlines():
        train_tgt.append(tokenizer.encode(line.strip())[:opt.max_len])
with open('/kaggle/input/dataproject/data/{}/tune/informal'.format(opt.dataset),'r') as f:
    for line in f.readlines():
        valid_src.append(tokenizer.encode(line.strip())[:opt.max_len])
with open('/kaggle/input/dataproject/data/{}/tune/formal'.format(opt.dataset),'r') as f:
    for line in f.readlines():
        valid_tgt.append(tokenizer.encode(line.strip())[:opt.max_len])


print('[Info] {} instances from train set'.format(len(train_src)))
print('[Info] {} instances from valid set'.format(len(valid_tgt)))
train_loader = SCIterator(train_src, train_tgt, opt)
valid_loader = SCIterator(valid_src, valid_tgt, opt)
print(train_loader)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[Info] 52595 instances from train set
[Info] 2356 instances from valid set
<torch.utils.data.dataloader.DataLoader object at 0x7eecd4970940>


In [18]:
for i in range(5):
    print(train_src[i])
    print(train_tgt[i])
    print("-"*25)

[8, 1974, 37, 86, 18, 3612, 210, 7, 59, 1776, 3, 9, 2297, 1974, 68, 6613, 11, 207, 55, 1]
[37, 86, 18, 3612, 210, 7, 1974, 19, 29, 31, 17, 3, 9, 2297, 1974, 6, 68, 34, 31, 7, 8957, 5, 1]
-------------------------
[24, 543, 410, 59, 428, 140, 5931, 8115, 7, 599, 23, 317, 61, 1]
[27, 278, 31, 17, 317, 24, 543, 1891, 140, 19601, 5, 1]
-------------------------
[13, 4301, 7, 15, 3, 23, 36, 3, 10674, 77, 34, 3, 15, 208, 651, 239, 6, 82, 3, 89, 9, 208, 3, 4059, 1836, 449, 19, 86, 76, 3198, 9, 1]
[27, 1605, 34, 4604, 6, 82, 1305, 3, 4059, 1836, 449, 19, 86, 76, 3198, 9, 5, 1]
-------------------------
[661, 15, 12002, 5, 287, 41, 2258, 1082, 333, 34, 61, 3, 184, 694, 1939, 77, 5, 287, 41, 29117, 138, 61, 1]
[9259, 1939, 77, 5, 287, 11, 661, 15, 12002, 5, 287, 33, 248, 21, 384, 694, 5, 1]
-------------------------
[27, 7, 3, 88, 16998, 58, 3845, 47, 30, 13294, 5190, 28, 1193, 152, 411, 31, 279, 3483, 11, 3, 88, 3776, 1134, 16998, 1]
[216, 47, 30, 8, 13294, 5190, 504, 28, 1193, 152, 411, 31, 27

In [19]:
# Assuming 'train_loader' is your DataLoader object
for batch_idx, batch in enumerate(train_loader):
    if batch_idx < 2:  # Print only the first three batches
        print(f"Batch {batch_idx}:")
        # Assuming the batch contains features and labels
        features, labels = batch
        print("Features:")
        print(features)
        print("Labels:")
        print(labels)
        print("-" * 50)  # Print separator
    else:
        break  # Stop after printing the first three batches

Batch 0:
Features:
tensor([[  156,    34,    19,  ...,     1,     1,     1],
        [   34,   373,  1330,  ...,     1,     1,     1],
        [ 9034,  7105,    19,  ...,     1,     1,     1],
        ...,
        [   27,  1971,  3355,  ...,     1,     1,     1],
        [  696,  2039,    19,  ...,     1,     1,     1],
        [16497,    27,    54,  ...,     1,     1,     1]])
Labels:
tensor([1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 1, 0, 1, 0])
--------------------------------------------------
Batch 1:
Features:
tensor([[   27,   317,    25,   228,   169,  3306, 13601,    15,     5,   299,
             3,    99,    24,   744,    31,    17,   161,     6,    25,    54,
          3442,  3190,     7,    11,  2025,   135,    12,    39,  1218,     5,
             1],
        [ 1129,   503,     6, 18981,   138, 15343,     5,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1, 

In [20]:
if os.path.exists(f'/kaggle/working/checkpoints/{opt.dataset}_embedding.pt'):
        embedding = torch.load(f'./checkpoints/{opt.dataset}_embedding.pt')
else:
    embed_path = '/kaggle/working/glove.840B.300d.txt'
    embedding = load_embedding(tokenizer, 300, embed_path)
    torch.save(embedding, f'/kaggle/working/{opt.dataset}_embedding.pt')

[Info] Loading embedding


2024-05-06 03:08:31.546781: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 03:08:31.546894: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 03:08:31.673610: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [21]:
model = TextCNN(opt.embed_dim, len(tokenizer), filter_sizes, 
                    num_filters, embedding=embedding, dropout=opt.dropout)
model.to(device).train()

optimizer = ScheduledOptim(
    torch.optim.Adam(filter(lambda x:x.requires_grad, model.parameters()),
                     betas=(0.9, 0.98), eps=1e-09), opt.lr)

loss_fn = nn.CrossEntropyLoss()
print('[Info] Built a model with {} parameters'.format(
       sum(p.numel() for p in model.parameters())))
print('[Info]', opt)

[Info] Built a model with 10412402 parameters
[Info] Namespace(lr=0.001, dataset='em', embed_dim=300, seed=42, min_count=0, dropout=0.5, max_len=50, log_step=100, eval_step=1000, batch_size=32, epoch=50)


In [22]:
def main():
    
    c_path='/kaggle/working/checkpoints/t5_textcnn_{}.chkpt'.format(opt.dataset)
    output_dir = os.path.dirname(c_path)
    os.makedirs(output_dir, exist_ok=True)
    
    # Write to the file
    with open(c_path, 'a') as file:
        pass

    tab = 0
    avg_acc = 0
    total_acc = 0.
    total_num = 0.
    total_loss = 0.
    start = time.time()
    for e in range(opt.epoch):
        model.train()
        for idx, batch in enumerate(train_loader):
            x_batch, y_batch = map(lambda x: x.to(device), batch)
            optimizer.zero_grad()
            logits = model(x_batch)
            loss = loss_fn(logits, y_batch)
            total_loss += loss
            loss.backward()
            optimizer.step()

            y_hat = logits.argmax(dim=-1)
            same = [float(p == q) for p, q in zip(y_batch, y_hat)]
            total_acc += sum(same)
            total_num += len(y_batch)

            if optimizer.steps % opt.log_step == 0:
                lr = optimizer._optimizer.param_groups[0]['lr']
#                 print('[Info] Epoch {:02d}-{:05d}: | average acc {:.4f}% | '
#                     'average loss {:.4f} | lr {:.6f} | second {:.2f}'.format(
#                     e, optimizer.steps, total_acc / total_num * 100,
#                     total_loss / (total_num), lr, time.time() - start))
                start = time.time()

            if optimizer.steps % opt.eval_step == 0:
                valid_acc, valid_loss = evaluate_sc(model, valid_loader, loss_fn, e)
                if avg_acc < valid_acc:
                    avg_acc = valid_acc
                    save_path = '/kaggle/working/checkpoints/t5_textcnn_{}.chkpt'.format(opt.dataset)
                    torch.save(model.state_dict(), save_path)
                    print('[Info] The checkpoint file has been updated.')
                    tab = 0
                else:
                    tab += 1
                    if tab == 10:
                        break

if __name__ == '__main__':
    main()

[Info] Epoch 00-valid: acc 90.9230% | loss 0.0066
[Info] The checkpoint file has been updated.
[Info] Epoch 00-valid: acc 91.6300% | loss 0.0061
[Info] The checkpoint file has been updated.
[Info] Epoch 00-valid: acc 92.4709% | loss 0.0058
[Info] The checkpoint file has been updated.
[Info] Epoch 01-valid: acc 92.6046% | loss 0.0057
[Info] The checkpoint file has been updated.
[Info] Epoch 01-valid: acc 90.7701% | loss 0.0070
[Info] Epoch 01-valid: acc 92.4326% | loss 0.0056
[Info] Epoch 02-valid: acc 92.7957% | loss 0.0058
[Info] The checkpoint file has been updated.
[Info] Epoch 02-valid: acc 92.5664% | loss 0.0057
[Info] Epoch 02-valid: acc 92.0696% | loss 0.0061
[Info] Epoch 03-valid: acc 92.3753% | loss 0.0063
[Info] Epoch 03-valid: acc 93.1779% | loss 0.0054
[Info] The checkpoint file has been updated.
[Info] Epoch 03-valid: acc 92.6811% | loss 0.0059
[Info] Epoch 03-valid: acc 92.9104% | loss 0.0057
[Info] Epoch 04-valid: acc 93.1015% | loss 0.0060
[Info] Epoch 04-valid: acc 92.

In [23]:
#for testing the text cnn classifier 
filter_sizes = [1, 2, 3, 4, 5]
num_filters = [128, 128, 128, 128, 128]
device = 'cuda' if cuda.is_available() else 'cpu'
special_tokens = [{'bos_token': '<bos>'},
                  {'eos_token': '<eos>'}, {'sep_token': '<sep>'},
                  {'pad_token': '<pad>'}, {'unk_token': '<unk>'}]
tokenizer = T5Tokenizer.from_pretrained('t5-base')
def main():
    parser = argparse.ArgumentParser('Evaluating Style Strength')
    parser.add_argument('-order', default=0, type=str, help='order')
    parser.add_argument('-style', default=0, type=int, help='from 0 to 1')
    parser.add_argument('-max_len', default=50, type=int, help='max tokens in a batch')
    parser.add_argument('-embed_dim', default=300, type=int, help='the embedding size')
    parser.add_argument('-dataset', default='em', type=str, help='the name of dataset')
    parser.add_argument('-model', default='textcnn', type=str, help='the name of model')
    parser.add_argument('-seed', default=42, type=int, help='pseudo random number seed')
    parser.add_argument('-batch_size', default=32, type=int, help='max sents in a batch')
    parser.add_argument("-dropout", default=0.5, type=float, help="Keep prob in dropout")
    parser.add_argument("-file",default="/kaggle/input/dataproject/data/fr/test/formal", type=str)
    opt = parser.parse_args([])
    torch.manual_seed(opt.seed)

    test_src, test_tgt = [], []

    if opt.file is not None:
        with open(opt.file, 'r') as f:
            for line in f.readlines():
                test_tgt.append(tokenizer.encode(line.strip())[:opt.max_len])

    print('[Info] {} instances from src test set'.format(len(test_src)))
    print('[Info] {} instances from tgt test set'.format(len(test_tgt)))
    test_loader = SCIterator(test_src, test_tgt, opt, tokenizer.pad_token_id, shuffle=False)

    loss_fn = nn.CrossEntropyLoss()
    model = TextCNN(opt.embed_dim, len(tokenizer), filter_sizes,
                    num_filters, None, dropout=opt.dropout)
    model.to(device).eval()
    model.load_state_dict(torch.load('/kaggle/working/checkpoints/t5_textcnn_{}.chkpt'.format(
        opt.dataset)))

    total_num = 0.
    total_acc = 0.
    total_loss = 0.
    with torch.no_grad():
        for i,batch in enumerate(test_loader):
            x_batch, y_batch = map(lambda x: x.to(device), batch)
            logits = model(x_batch)
            F.softmax(logits, dim=-1)
            total_loss += loss_fn(logits, y_batch)
            _, y_hat = torch.max(logits,dim=-1)
            same = [float(p == q) for p, q in zip(y_batch, y_hat)]
            total_acc += sum(same)
            total_num += len(y_batch)
#             print(i,"done")

    print('Test: {}'.format('acc {:.4f}% | loss {:.4f}').format(
        (total_acc / total_num) * 100, total_loss / total_num))


if __name__ == '__main__':
    main()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[Info] 0 instances from src test set
[Info] 1019 instances from tgt test set
Test: acc 96.6634% | loss 0.0028


In [24]:
def optimize(opt, loss, retain_graph=False):
    opt.zero_grad()
    loss.backward(retain_graph=retain_graph)
    opt.step()

In [25]:
class T5Dataset(torch.utils.data.Dataset):

    def __init__(self, src_file, tgt_file, tokenizer, max_len):

        with open(src_file, 'r') as f1, open(tgt_file,'r') as f2:
            self.src = f1.readlines()
            self.tgt = f2.readlines()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.src)

    def __getitem__(self, index):
        ctext = self.src[index].strip()
        ctext = ' '.join(ctext.split())

        text = self.tgt[index].strip()
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.max_len, padding='max_length',truncation=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.max_len, padding='max_length',truncation=True,return_tensors='pt')
        
        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [48]:
class T5UnsupDataset(torch.utils.data.Dataset):

    def __init__(self, src_file, aug_file, tokenizer, max_len):

        with open(src_file, 'r') as f1, open(aug_file,'r') as f2:
            self.src = f1.readlines()
            self.aug = f2.readlines()
        self.max_len = max_len
        self.tokenizer = tokenizer


    def __len__(self):
        return len(self.src)

    def __getitem__(self, index):
        ctext = self.src[index].strip()
        ctext = ' '.join(ctext.split())

        text = self.aug[index].strip()
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.max_len, padding='max_length',truncation=True,return_tensors='pt')
        augment = self.tokenizer.batch_encode_plus([text], max_length= self.max_len, padding='max_length',truncation=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        augment_ids = augment['input_ids'].squeeze()
        augment_mask = augment['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'augment_ids': augment_ids.to(dtype=torch.long),
            'augment_mask': augment_mask.to(dtype=torch.long)
        }

In [27]:
class T5AugDataset(torch.utils.data.Dataset):

    def __init__(self, src_file, augmentor, tokenizer, max_len, aug_p=0.1, dataset='em'):

        with open(src_file, 'r') as f1:
            self.src = f1.readlines()
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.augmentor = augment_choice(augmentor, aug_p, dataset=dataset)
        if self.augmentor is None:
            self.aug = False
        else:
            self.aug = True
        if isinstance(self.augmentor, list):
            self.mix = True
        else:
            self.mix = False


    def __len__(self):
        return len(self.src)

    def __getitem__(self, index):
        ctext = self.src[index].strip()
        ctext = ' '.join(ctext.split())

        if self.aug:
            if self.mix:
                augmentor = random.choice(self.augmentor)
            else:
                augmentor = self.augmentor

            text = augmentor.augment(ctext)
        else:
            text = ctext

        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.max_len, padding='max_length',truncation=True,return_tensors='pt')
        augment = self.tokenizer.batch_encode_plus([text], max_length= self.max_len, padding='max_length',truncation=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        augment_ids = augment['input_ids'].squeeze()
        augment_mask = augment['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'augment_ids': augment_ids.to(dtype=torch.long),
            'augment_mask': augment_mask.to(dtype=torch.long)
        }

In [28]:
from nltk.translate.bleu_score import corpus_bleu,sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
import nltk
def bleu(reference_files_src_list,gen_file_src,ngrams=4,ignore_case=False):
    all_reference=[]
    for src in reference_files_src_list:
        with open(src,'r',encoding='utf-8') as f:
            one_reference=[]
            for line in f:
                if not ignore_case:
                    one_reference.append(nltk.word_tokenize(line.strip()))
                else:
                    one_reference.append(nltk.word_tokenize(line.strip().lower()))
            all_reference.append(one_reference)
    all_reference=[[all_reference[i][j] for i in range(0,len(all_reference))] for j in range(0,len(all_reference[0]))]
    gen=[]
    with open(gen_file_src,'r',encoding='utf-8') as f:
        for line in f:
            if not ignore_case:
                gen.append(nltk.word_tokenize(line.strip()))
            else:
                gen.append(nltk.word_tokenize(line.strip().lower()))
    weight=[1.0/ngrams]*ngrams
    # print(len(gen))
    b=corpus_bleu(all_reference,gen,weights=weight)
    return b


def get_ref_src_list(path_prefix,ref_num=4):
    src_list=[]
    for i in range(0,ref_num):
        src_list.append(path_prefix+str(i))
    return src_list


def evaluate_bleu(ref_path, pred_path):
    # def eval_factory(log_dict,re):
    re = [ref_path + ".ref{}".format(t) for t in range(4)]

    return bleu(re, pred_path)


In [29]:
# -*- coding: utf-8 -*-

import os
import time
import argparse
import numpy as np
import nlpaug.augmenter.word as naw
import nltk
import random


import torch
from torch import cuda

# from utils.T5_dataset import T5Dataset
from torch.utils.data import DataLoader
# from utils.dataset import  SCIterator
# from utils.nltk_bleu import evaluate_bleu


device = 'cuda' if cuda.is_available() else 'cpu'

filter_sizes = [1, 2, 3, 4, 5]
num_filters = [128, 128, 128, 128, 128]

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    torch.cuda.manual_seed_all(seed)


def test(model, tokenizer, cls, cls_tokenizer, opt):

    styles = ['informal', 'formal']

    test_src_file = '/kaggle/input/dataproject/data/{}/{}/{}'.format(opt.dataset, 'test', styles[opt.style])

    test_tgt_file = '/kaggle/input/dataproject/data/{}/{}/{}.ref0'.format(opt.dataset, 'test', styles[1 - opt.style])

    test_label_files = f'/kaggle/input/dataproject/data/{opt.dataset}/test/{styles[1 - opt.style]}'

    test_dataset = T5Dataset(test_src_file, test_tgt_file, tokenizer, opt.max_len)
    test_loader = DataLoader(test_dataset,
                             num_workers=2,
                             batch_size=opt.val_batch_size,
                             shuffle=False)

    print('[Info] {} instances from test set'.format(len(test_dataset)))

    print("Test starts...")

    model.eval()


    start = time.time()
    pred_list = []

    if not os.path.exists(f'./data/{opt.dataset}/outputs/{opt.model}/'):
        os.mkdir(f'./data/{opt.dataset}/outputs/{opt.model}/')
    with open('./data/{}/outputs/{}/{}_{}_{}.{}_best_test.txt'.format(opt.dataset, opt.model,
                                                                      opt.model, opt.dataset, opt.order, opt.style),
              'w') as fout:
        for idx, data in enumerate(test_loader):
            if idx % 10 == 0:
                print('[Info] processing {} batches | seconds {:.4f}'.format(
                    idx, time.time() - start))
                start = time.time()

            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(ids,
                                           attention_mask=mask,
                                           num_beams=5,
                                           max_length=30)

            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in
                     generated_ids]

            pred_list.extend(preds)
        for text in pred_list:
            fout.write(text.strip() + '\n')

    model.train()

    pred_file = './data/{}/outputs/{}/{}_{}_{}.{}_best_test.txt'.format(opt.dataset, opt.model,
                                                                        opt.model, opt.dataset, opt.order, opt.style)

    bleu = evaluate_bleu(test_label_files, pred_file)
    print(bleu)

    test_tgt = []
    test_src = []
    with open(pred_file, 'r') as f:
        for line in f.readlines():
            if opt.style == 0:
                test_tgt.append(cls_tokenizer.encode(line.strip())[:opt.max_len])
            else:
                test_src.append(cls_tokenizer.encode(line.strip())[:opt.max_len])
    cls_loader = SCIterator(test_src, test_tgt, opt, cls_tokenizer.pad_token_id)
    cls_loss_fn = torch.nn.CrossEntropyLoss()

    total_num = 0.
    total_acc = 0.
    total_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(cls_loader):
            x_batch, y_batch = map(lambda x: x.to(device), batch)
            logits = cls(x_batch)
            # print(F.softmax(logits, dim=-1))
            total_loss += cls_loss_fn(logits, y_batch)
            _, y_hat = torch.max(logits, dim=-1)
            same = [float(p == q) for p, q in zip(y_batch, y_hat)]
            total_acc += sum(same)
            total_num += len(y_batch)

    print('Test: {}'.format('acc {:.4f}% | loss {:.4f}').format(
        total_acc / total_num * 100, total_loss / total_num))

    with open('./data/{}/outputs/{}/{}_{}_{}_bleu_test.txt'.format(opt.dataset, opt.model,
                                                                   opt.model, opt.dataset, opt.order, opt.style),
              'a') as fbl:
        fbl.write(
            'Test Bleu score for model {}: {:.4f};  Acc: {:.4f}\n'.format(opt.order, bleu, total_acc / total_num * 100))

    return bleu, total_acc / total_num * 100, total_loss / total_num

In [30]:
import torch
torch.cuda.empty_cache()
torch.cuda.empty_cache()

In [31]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer
import os
import time
import argparse
import random, re, math



import torch
from torch import cuda
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torch.utils.data import DataLoader

from nltk.translate.bleu_score import sentence_bleu
from pyskiplist import SkipList
import fitlog
import kenlm
import statistics
from torch.nn import CrossEntropyLoss

In [32]:


device = 'cuda' if cuda.is_available() else 'cpu'

# parameters for textCNN
filter_sizes = [1, 2, 3, 4, 5]
num_filters = [128, 128, 128, 128, 128]

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def prepare_batch_cls(insts, pad_token_id=1):
    ''' Pad the instance to the max seq length in batch for CNN classifier '''

    max_len = max(len(inst) for inst in insts)
    max_len = max_len if max_len > 4 else 5

    batch_seq = np.array([
        inst + [pad_token_id] * (max_len - len(inst))
        for inst in insts])
    batch_seq = torch.LongTensor(batch_seq)

    return batch_seq

def score_generated_sentences(generated_text_file_path, model):
    #get perplexity scores for evaluation
    log_probs = list()
    perplexity_scores = list()

    with open(generated_text_file_path) as generated_text_file:
        for sentence in generated_text_file:
            cleaned_sentence = clean_text(sentence)
            log_probs.append(model.score(cleaned_sentence))
            perplexity_scores.append(model.perplexity(cleaned_sentence))

    return statistics.mean(log_probs), statistics.mean(perplexity_scores)


def clean_text(string):
    string = string.replace(".", "")
    string = string.replace(".", "")
    string = string.replace("\n", " ")
    string = string.replace(" 's", " is")
    string = string.replace("'m", " am")
    string = string.replace("'ve", " have")
    string = string.replace("n't", " not")
    string = string.replace("'re", " are")
    string = string.replace("'d", " would")
    string = string.replace("'ll", " will")
    string = string.replace("\r", " ")
    string = string.replace("\n", " ")
    string = re.sub(r'\d+', "number", string)
    string = ''.join(x for x in string if x.isalnum() or x == " ")
    string = re.sub(r'\s{2,}', " ", string)
    string = string.strip().lower()

    return string


def score_sentence(sentences, model):
    # log_probs = list()
    perplexity_scores = list()

    for sentence in sentences:
        cleaned_sentence = clean_text(sentence)
        # log_probs.append(model.score(cleaned_sentence))
        perplexity_scores.append(model.perplexity(cleaned_sentence))

    return perplexity_scores

def sentence_bleu_score(sents, refs, ngrams=3):
    sents = [nltk.word_tokenize(sent) for sent in sents]
    refs = [nltk.word_tokenize(ref) for ref in refs]
    weight=[1.0 / ngrams] * ngrams
    scores = []
    for sent, ref in zip(sents, refs):
        scores.append(sentence_bleu([ref], sent, weights=weight))
    return scores
    


In [33]:
# -style 0 \
#     -dataset em \
#     -order em.sup \
#     -batch_size 8 \
#     -val_batch_size 16 \
#     -lr 2e-5
# -unsup \
#     -style 0 \
#     -ratio 1.0 \
#     -dataset em \
#     -order em-semi \
#     -pre_step 2000 \
#     -batch_size 8 \
#     -unsup_batch_size 56 \
#     -val_batch_size 16 \
#     -lr 2e-5  \
#     -aug_choice spell \
#     -aug_p 0.1  \
#     -filter cls \
#     -phi 0.4  \
#     -n_step 1000 \
#     -sigma 0.8
parser = argparse.ArgumentParser('Fine-Tuned T5 for style transfer')
parser.add_argument('-order', default="em-semi", type=str, help='the order of traing')
parser.add_argument('-style', default=0, type=int, help='transfer inf. to for.')
parser.add_argument('-lr', default=2e-5, type=float, help='the learning rate')
parser.add_argument('-ratio', default=1., type=float, help='proportion of data')
parser.add_argument('-model', default='t5', type=str, help='the name of model')
parser.add_argument('-model_name', default='t5-large', type=str, help='the name of model')
parser.add_argument('-dataset', default='em', type=str, help='the name of dataset')
parser.add_argument('-steps', default=30000, type=int, help='force stop at x steps')
parser.add_argument('-batch_size', default=8, type=int, help='the size in a batch')
parser.add_argument('-val_batch_size', default=16, type=int, help='the size in a batch')
parser.add_argument('-max_len', default=50, type=int, help='maximum tokens a batch')
parser.add_argument('-dropout', default=0.5, type=float, help='Keep prob in dropout')
parser.add_argument('-patience', default=10, type=int, help='early stopping fine-tune')
parser.add_argument('-seed', default=42, type=int, help='pseudo random generator seed')
parser.add_argument('-log_step', default=100, type=int, help='print logs every x step')
parser.add_argument('-eval_step', default=1000, type=int, help='evaluate every x step')
parser.add_argument('-unsup', action='store_true', help='use unsupervised loss')
parser.add_argument('-unsup_batch_size', default=56, type=int, help='batch size for unlabeled data')
parser.add_argument('-weight', default=1.0, type=float, help='balance weight of unsup loss')
parser.add_argument('-pre_step', default=2000, type=int, help='pretrain steps')
parser.add_argument('-aug_type', default='real-time',type=str, help="whether to augment sentences while training")
parser.add_argument('-aug_choice', default='spell', type=str)
parser.add_argument('-aug_p', default=0.1, type=float, help='augmentation probability')
parser.add_argument('-filter', default='cls', type=str, help='metric used for data filtering')
parser.add_argument('-phi', default=0.4, type=float, help='threshold for formality evaluation (cls filter)')
parser.add_argument('-n_step', default=1000, type=int, help='number of steps for computing initial score list')
parser.add_argument('-sigma', default=0.8, type=float, help='dynamic threshold for lm/bleu filtering')
parser.add_argument('-ngrams', default=4, type=int)
parser.add_argument('-log_dir', default='./logs', type=str, help='directory of logs')


opt = parser.parse_args()
print('[Info]', opt)
opt.unsup=False
set_seed(opt.seed)
# fitlog.debug() disenables fitlog, comment it if you want to use fitlog
fitlog.debug()
fitlog.set_log_dir(opt.log_dir)
fitlog.add_hyper(opt)
fitlog.add_other({"notebook_name": "Nlpproject_123.ipynb"})  # Manually specify notebook name
fitlog.add_other({"experiment_description": "Description of your experiment"})
path_now='./data/{}/outputs/{}/{}_{}_{}.{}_bleu.txt'.format(opt.dataset, opt.model,
                                                             opt.model, opt.dataset, opt.order, opt.style)
output_dir = os.path.dirname(path_now)
os.makedirs(output_dir, exist_ok=True)
with open(path_now, 'a') as file:
    pass
with open('./data/{}/outputs/{}/{}_{}_{}.{}_bleu.txt'.format(opt.dataset, opt.model,
                                                             opt.model, opt.dataset, opt.order, opt.style),
          'a') as fbl:
    fbl.write(str(opt) + '\n')

set_seed(opt.seed)
fitlog.debug()
fitlog.set_log_dir(opt.log_dir)
fitlog.add_hyper(opt)

[Info] Namespace(order='em-semi', style=0, lr=2e-05, ratio=1.0, model='t5', model_name='t5-large', dataset='em', steps=30000, batch_size=8, val_batch_size=16, max_len=50, dropout=0.5, patience=10, seed=42, log_step=100, eval_step=1000, unsup=False, unsup_batch_size=56, weight=1.0, pre_step=2000, aug_type='real-time', aug_choice='spell', aug_p=0.1, filter='/root/.local/share/jupyter/runtime/kernel-a8a35ee9-a7cc-4b9b-9b2a-6916491ed99a.json', phi=0.4, n_step=1000, sigma=0.8, ngrams=4, log_dir='./logs')


In [34]:
# to train the model
tokenizer = T5Tokenizer.from_pretrained(opt.model_name)
cls_tokenizer = tokenizer

model = T5ForConditionalGeneration.from_pretrained(opt.model_name)

model.to(device).train()

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [35]:
opt.unsup=True

In [36]:
# CNN classifier for evaluation
cls = TextCNN(300, len(cls_tokenizer), filter_sizes,num_filters, None, dropout=opt.dropout)
cls.to(device).eval()
cls.load_state_dict(torch.load('./checkpoints/t5_textcnn_{}.chkpt'.format(opt.dataset)))


styles = ['informal', 'formal']
if opt.style == 0:
    unsup_file = f"data/unlabeled/{opt.dataset.upper()}_200k_inf.txt"
    unsup_file = f"/kaggle/input/dataproject/data/unlabeled/informal-e.txt"
else:
    raise ValueError("Invalid style.")

train_src_file = '/kaggle/input/dataproject/data/{}/{}/{}'.format(opt.dataset, 'train', styles[opt.style])
train_tgt_file = '/kaggle/input/dataproject/data/{}/{}/{}'.format(opt.dataset, 'train', styles[1 - opt.style])
train_dataset = T5Dataset(train_src_file, train_tgt_file, tokenizer, opt.max_len)
train_loader = DataLoader(train_dataset,num_workers=2,batch_size=opt.batch_size,shuffle=True)

val_src_file = '/kaggle/input/dataproject/data/{}/{}/{}'.format(opt.dataset, 'tune', styles[opt.style])
val_tgt_file = '/kaggle/input/dataproject/data/{}/{}/{}.ref0'.format(opt.dataset, 'tune', styles[1 - opt.style])
val_label_files = f'/kaggle/input/dataproject/data/{opt.dataset}/tune/{styles[1 - opt.style]}'
val_dataset = T5Dataset(val_src_file, val_tgt_file, tokenizer, opt.max_len)
val_loader = DataLoader(val_dataset,
                        num_workers=2,
                        batch_size=opt.val_batch_size,
                        shuffle=False)

# test_src_file = 'data/{}/{}/{}'.format(opt.dataset, 'test', styles[opt.style])
# test_tgt_file = 'data/{}/{}/{}.ref0'.format(opt.dataset, 'test', styles[1 - opt.style])
# test_label_files = f'data/{opt.dataset}/test/{styles[1 - opt.style]}'


print('[Info] {} instances from train set'.format(len(train_dataset)))
print('[Info] {} instances from validation set'.format(len(val_dataset)))

[Info] 52595 instances from train set
[Info] 2877 instances from validation set


In [37]:
for idx,data in enumerate(train_dataset):
    if idx<1:
        print(data)
    else:
        break

{'source_ids': tensor([   8, 1974,   37,   86,   18, 3612,  210,    7,   59, 1776,    3,    9,
        2297, 1974,   68, 6613,   11,  207,   55,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0]), 'source_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), 'target_ids': tensor([  37,   86,   18, 3612,  210,    7, 1974,   19,   29,   31,   17,    3,
           9, 2297, 1974,    6,   68,   34,   31,    7, 8957,    5,    1,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0]), 'target_ids_y': tensor([  37,   86,   18, 3612,  210,    7, 1974,   19,   29,   31,   17,    3,


In [38]:
for idx,data in enumerate(train_loader):
    if idx<1:
        print(data)
    else:
        break

{'source_ids': tensor([[    3,    10,    61,   446,    87,   157,    27,    31,    26,   987,
            34,   541,    11,   278,    31,    17,   752,    34,   129,    48,
           625,     5,     2,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  216,   429,    36,     6,    27,  2483,   214,     5,     1,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [   66,     8,  3605,     3,    88,   143,     3,    52,   836,  6417,
             5,     1,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0

In [39]:
# pretrained lm model for lm filter
language_model_path = f'/kaggle/input/checkpoint/checkpoints/{opt.dataset}_{styles[1-opt.style]}.arpa'
lm_model = kenlm.Model(language_model_path)

Loading the LM will be faster if you build a binary file.
Reading /kaggle/input/checkpoint/checkpoints/em_formal.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [40]:
print(opt.unsup)

True


In [41]:
def augment_choice(arg, aug_p, dataset='em'):
    if arg == "drop":
        return naw.random.RandomWordAug(action='delete', aug_p=aug_p)
    elif arg == 'swap':
        return naw.random.RandomWordAug(action='swap', aug_p=aug_p)
    elif arg == 'unk':
        return naw.random.RandomWordAug(action='substitute', aug_p=aug_p, target_words=['_'])
    elif arg == 'synonym':
        return naw.SynonymAug(aug_src='wordnet', aug_p=aug_p)
    elif arg == 'spell':
        # If using official Nlpaug, randomness will happen even with fixed random seeds.
        return naw.SpellingAug(aug_p=aug_p)
    elif arg == 'keyboard':
        return nac.KeyboardAug(aug_word_p=aug_p)
    elif arg == 'tfidf':
        return naw.TfIdfAug(model_path=f'./data/unlabeled/{dataset}tfidf/', tokenizer=_tokenizer, aug_p=aug_p)
    elif arg == 'capital':
        return CapitalizeAug(aug_p=aug_p)
    elif arg == 'abbr':
        return AbbrAug(aug_p=aug_p)
    elif arg == 'repeatchar':
        return RepeatCharAug(aug_word_p=aug_p)
    elif arg == 'none':
        return None
    else:
        raise ValueError("Unsupported augmentor!")


In [42]:
#for unsupervised data aug 
if opt.unsup:
        #'real-time' means augmenting the texts on-the-fly
        if opt.aug_type == 'real-time':
            aug = opt.aug_choice
            unlabeled_dataset = T5AugDataset(
                src_file=unsup_file,
                augmentor=aug,
                max_len=opt.max_len,
                tokenizer=tokenizer,
                aug_p=opt.aug_p,
                dataset=opt.dataset
            )
        else:
            # Otherwise, augment all the texts beforehand
            unlabeled_dataset = T5UnsupDataset(
                src_file=f"data/unlabeled/{opt.dataset.upper()}_200k_inf.txt",
                aug_file=f"data/unlabeled/{opt.dataset.upper()}_200k_inf_{opt.aug_choice}.txt",
                max_len=opt.max_len,
                tokenizer=tokenizer)

        unsup_loader = DataLoader(unlabeled_dataset,
                              num_workers=10,
                              batch_size=opt.unsup_batch_size,
                              shuffle=True)
        print('[Info] {} instances from unlabeled set'.format(len(unlabeled_dataset)))

[Info] 200000 instances from unlabeled set




In [43]:
loss_fn = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = ScheduledOptim(torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),betas=(0.9, 0.98), eps=1e-09), opt.lr, 10000)

In [2]:
def main():
    tab = 0
    eval_loss = 1e8
    total_loss_ce = []
    total_loss_unsup = []
    best_bleu = 0.0
    best_acc = 0.0

    perp_list_all = SkipList()
    bleu_list_all = SkipList()

    #count filtered samples
    num_all = 0
    num_chosen = 0

    start = time.time()
    train_iter = iter(train_loader)
    if opt.unsup:
        unsup_iter = iter(unsup_loader)
    for step in range(1, opt.steps):
        try:
            data = next(train_iter)
        except:
            train_iter = iter(train_loader)
            data = next(train_iter)

        if opt.unsup and step > opt.pre_step:
            try:
                unsup_batch = next(unsup_iter)
            except:
                unsup_iter = iter(unsup_loader)
                unsup_batch = next(unsup_iter)

        # supervised loss
        lm_labels = data['target_ids'].to(device, dtype=torch.long)
        lm_labels[lm_labels[:, :] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=lm_labels)
        loss_ce = outputs[0]
        total_loss_ce.append(loss_ce.item())

        # unsupervised loss
        if opt.unsup and step > opt.pre_step:
            unsup_ids = unsup_batch['source_ids'].to(device, dtype=torch.long)
            unsup_mask = unsup_batch['source_mask'].to(device, dtype=torch.long)
            aug_ids = unsup_batch['augment_ids'].to(device, dtype=torch.long)
            aug_mask = unsup_batch['augment_mask'].to(device, dtype=torch.long)
            model.eval()
            pseudo_labels = model.generate(unsup_ids,
                                           attention_mask=unsup_mask,
                                           num_beams=5,
                                           max_length=30)
            model.train()

            if opt.filter == 'lm':
                pseudo_targets = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for
                                  g in
                                  pseudo_labels]
                perplexities = score_sentence(pseudo_targets, lm_model)
                if num_all < len(unlabeled_dataset):
                    for score in perplexities:
                        perp_list_all.insert(score, None)
                    if step > opt.n_step + opt.pre_step:
                        idx = math.floor((1 - opt.sigma) * len(perp_list_all))
                        perp_threshold = perp_list_all[idx][0]
                    else:
                        perp_threshold = 10000
                else:
                    perp_threshold = perp_threshold


                y_mask = (torch.Tensor(perplexities) < perp_threshold).to(device, dtype=torch.float)
                num_all += opt.unsup_batch_size
                num_chosen += sum(y_mask)
                fitlog.add_metric(num_chosen, name="num_chosen", step=step)
                fitlog.add_metric(num_all, name="num_all", step=step)
                fitlog.add_metric(num_chosen / num_all, name="filter_ratio", step=step)
                fitlog.add_metric(perp_threshold, name="perp_threshold", step=step)

                if num_all >= len(unlabeled_dataset):
                    num_all = 0
                    num_chosen = 0
                    perp_list_all = SkipList()

                pseudo_labels[pseudo_labels[:, :] == tokenizer.pad_token_id] = -100
                unsup_output = model(aug_ids, attention_mask=aug_mask,
                                     labels=pseudo_labels)

                unsup_logits = unsup_output[1]
                pseudo_labels[pseudo_labels[:, :] == -100] = tokenizer.pad_token_id
                unsup_loss_ce = loss_fn(unsup_logits.view(-1, unsup_logits.size(-1)), pseudo_labels.view(-1))
                y_mask = y_mask.unsqueeze(1).repeat(1, pseudo_labels.size(-1)).view(-1)

                unsup_loss_ce = unsup_loss_ce * y_mask
                unsup_loss_ce = unsup_loss_ce.sum()
                if unsup_loss_ce > 0:
                    unsup_loss_ce /= y_mask.sum()
                total_loss_unsup.append(unsup_loss_ce.item())

            if opt.filter == 'bleu':
                pseudo_targets = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for
                                  g in
                                  pseudo_labels]
                sources = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for
                                  g in
                                  unsup_ids]
                sent_bleus = sentence_bleu_score(pseudo_targets, sources, opt.ngrams)

                if num_all < len(unlabeled_dataset):
                    for score in sent_bleus:
                        bleu_list_all.insert(score, None)
                    if step > opt.n_step + opt.pre_step:

                        idx = math.floor(opt.sigma * len(bleu_list_all))
                        bleu_threshold = bleu_list_all[idx][0]

                    else:
                        bleu_threshold = -1
                else:
                    #meaningless, just for notes
                    bleu_threshold = bleu_threshold

                y_mask = (torch.Tensor(sent_bleus) > bleu_threshold).to(device, dtype=torch.float)
                num_all += opt.unsup_batch_size
                num_chosen += sum(y_mask)
                fitlog.add_metric(num_chosen, name="num_chosen", step=step)
                fitlog.add_metric(num_all, name="num_all", step=step)
                fitlog.add_metric(num_chosen / num_all, name="filter_ratio", step=step)
                fitlog.add_metric(bleu_threshold, name="bleu_threshold",step=step)

                if num_all >= len(unlabeled_dataset):
                    num_all = 0
                    num_chosen = 0
                    bleu_list_all = SkipList()

                pseudo_labels[pseudo_labels[:, :] == tokenizer.pad_token_id] = -100
                unsup_output = model(aug_ids, attention_mask=aug_mask,
                                     labels=pseudo_labels)
                # unsup_loss_ce = unsup_output[0]
                unsup_logits = unsup_output[1]
                pseudo_labels[pseudo_labels[:, :] == -100] = tokenizer.pad_token_id
                unsup_loss_ce = loss_fn(unsup_logits.view(-1, unsup_logits.size(-1)), pseudo_labels.view(-1))
                y_mask = y_mask.unsqueeze(1).repeat(1, pseudo_labels.size(-1)).view(-1)

                unsup_loss_ce = unsup_loss_ce * y_mask
                unsup_loss_ce = unsup_loss_ce.sum()
                if unsup_loss_ce > 0:
                    unsup_loss_ce /= y_mask.sum()
                total_loss_unsup.append(unsup_loss_ce.item())

            if opt.filter == "none":
                pseudo_labels[pseudo_labels[:, :] == tokenizer.pad_token_id] = -100
                unsup_output = model(aug_ids, attention_mask=aug_mask,
                                     labels=pseudo_labels)
                unsup_loss_ce = unsup_output[0]
                total_loss_unsup.append(unsup_loss_ce.item())

            if opt.filter == "cls":
                pseudo_targets = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for
                                  g in
                                  pseudo_labels]
                pseudo_tgt = [cls_tokenizer.encode(line.strip())[:opt.max_len] for line in pseudo_targets]
                pseudo_tgt = prepare_batch_cls(pseudo_tgt, pad_token_id=cls_tokenizer.pad_token_id).to(device)
                logits_tgt = F.softmax(cls(pseudo_tgt), dim=-1)
                pseudo_source = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g
                                 in
                                 unsup_ids]
                pseudo_src = [cls_tokenizer.encode(line.strip())[:opt.max_len] for line in pseudo_source]
                pseudo_src = prepare_batch_cls(pseudo_src, pad_token_id=cls_tokenizer.pad_token_id).to(device)
                logits_src = F.softmax(cls(pseudo_src), dim=-1)
                y_mask = (logits_tgt[:, 1-opt.style] - logits_src[:, 1-opt.style] > opt.phi).float()

                num_all += opt.unsup_batch_size
                num_chosen += sum(y_mask)
                fitlog.add_metric(num_chosen, name="num_chosen", step=step)
                fitlog.add_metric(num_all, name="num_all", step=step)
                fitlog.add_metric(num_chosen / num_all, name="filter_ratio", step=step)

                if num_all >= len(unlabeled_dataset):
                    num_all = 0
                    num_chosen = 0

                pseudo_labels[pseudo_labels[:, :] == tokenizer.pad_token_id] = -100
                unsup_output = model(aug_ids, attention_mask=aug_mask,
                                     labels=pseudo_labels)
                unsup_logits = unsup_output[1]
                pseudo_labels[pseudo_labels[:, :] == -100] = tokenizer.pad_token_id
                unsup_loss_ce = loss_fn(unsup_logits.view(-1, unsup_logits.size(-1)), pseudo_labels.view(-1))
                y_mask = y_mask.unsqueeze(1).repeat(1, pseudo_labels.size(-1)).view(-1)

                unsup_loss_ce = unsup_loss_ce * y_mask
                unsup_loss_ce = unsup_loss_ce.sum()
                if unsup_loss_ce > 0:
                    unsup_loss_ce /= y_mask.sum()

                total_loss_unsup.append(unsup_loss_ce.item())

        if opt.unsup and step > opt.pre_step:
            optimize(optimizer, loss_ce + opt.weight * unsup_loss_ce)
        else:
            optimize(optimizer, loss_ce)

        if step % opt.log_step == 0:
            lr = optimizer._optimizer.param_groups[0]['lr']
            if opt.unsup and step > opt.pre_step:
                print('[Info] steps {:05d} | loss_sup {:.4f} | '
                      'loss_unsup {:.4f} | lr {:.6f} | second {:.2f}'.format(
                    step, np.mean(total_loss_ce), np.mean(total_loss_unsup)
                    , lr, time.time() - start))
                fitlog.add_loss(np.mean(total_loss_ce), name="Sup-loss", step=step)
                fitlog.add_loss(np.mean(total_loss_unsup), name="Unsup-loss", step=step)
            else:
                print('[Info] steps {:05d} | loss_ce {:.4f} | lr {:.6f} | second {:.2f}'.format(
                    step, np.mean(total_loss_ce), lr, time.time() - start))
                fitlog.add_loss(np.mean(total_loss_ce), name="Sup-loss", step=step)

            total_loss_ce = []
            total_loss_unsup = []
            start = time.time()

        if ((len(train_loader) > opt.eval_step
             and step % opt.eval_step == 0)
                or (len(train_loader) < opt.eval_step
                    and step % len(train_loader) == 0)):

            print("validation starts...")
            # if eval_loss >= valid_loss:
            model.eval()

            start = time.time()
            pred_list = []

            if not os.path.exists(f'./data/{opt.dataset}/outputs/{opt.model}/'):
                os.mkdir(f'./data/{opt.dataset}/outputs/{opt.model}/')
            with open('./data/{}/outputs/{}/{}_{}_{}.{}_step{}.txt'.format(opt.dataset, opt.model,
                    opt.model, opt.dataset, opt.order, opt.style, step), 'w') as fout:
                for idx, data in enumerate(val_loader):
                    if idx % 10 == 0:
                        print('[Info] processing {} batches | seconds {:.4f}'.format(
                            idx, time.time() - start))
                        start = time.time()

                    ids = data['source_ids'].to(device, dtype=torch.long)
                    mask = data['source_mask'].to(device, dtype=torch.long)
                    generated_ids = model.generate(ids,
                                                   attention_mask=mask,
                                                   num_beams=5,
                                                   max_length=30)
                    preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in
                             generated_ids]
                    pred_list.extend(preds)

                for text in pred_list:
                    fout.write(text.strip() + '\n')

            model.train()

            pred_file = './data/{}/outputs/{}/{}_{}_{}.{}_step{}.txt'.format(opt.dataset, opt.model,
                    opt.model, opt.dataset, opt.order, opt.style, step)

            bleu = evaluate_bleu(val_label_files, pred_file)
            print(bleu)
            fitlog.add_metric(bleu, name="BLEU", step=step)
            _, perplexity = score_generated_sentences(pred_file, lm_model)
            fitlog.add_metric(perplexity, name="perplexity", step=step)

            if bleu > best_bleu:
                tab = 0
                best_bleu = bleu
                torch.save(model.state_dict(), 'checkpoints/{}_{}_{}_{}.chkpt'.format(
                    opt.model, opt.dataset, opt.order, opt.style))
                print('[Info] The checkpoint file has been updated.')
                fitlog.add_best_metric({"dev":{"BLEU":best_bleu}})
                test_bleu, test_acc, test_loss = test(model, tokenizer, cls, cls_tokenizer, opt)
                test_hm = 2.0 / (1.0 / test_bleu + 100.0 / test_acc)
                fitlog.add_loss({"test":{"Loss":test_loss}}, step=step)

                test_file = './data/{}/outputs/{}/{}_{}_{}.{}_best_test.txt'.format(opt.dataset, opt.model,
                                                                        opt.model, opt.dataset, opt.order, opt.style)
                _, test_perplexity = score_generated_sentences(test_file, lm_model)
                fitlog.add_best_metric({"test": {"BLEU": test_bleu, "Acc": test_acc, "HM": test_hm, "perplexity": test_perplexity}})
            else:
                tab += 1
            if tab == opt.patience:
                #early stopping
                exit()

            # Evaluate style accuracy
            test_tgt = []
            test_src = []
            with open(pred_file, 'r') as f:
                for line in f.readlines():
                    if opt.style == 0:
                        test_tgt.append(cls_tokenizer.encode(line.strip())[:opt.max_len])
                    else:
                        test_src.append(cls_tokenizer.encode(line.strip())[:opt.max_len])
            cls_loader = SCIterator(test_src, test_tgt, opt, cls_tokenizer.pad_token_id)
            cls_loss_fn = torch.nn.CrossEntropyLoss()

            total_num = 0.
            total_acc = 0.
            total_loss = 0.
            with torch.no_grad():
                for i, batch in enumerate(cls_loader):
                    x_batch, y_batch = map(lambda x: x.to(device), batch)
                    logits = cls(x_batch)
                    # print(F.softmax(logits, dim=-1))
                    total_loss += cls_loss_fn(logits, y_batch)
                    _, y_hat = torch.max(logits, dim=-1)
                    same = [float(p == q) for p, q in zip(y_batch, y_hat)]
                    total_acc += sum(same)
                    total_num += len(y_batch)

            print('Test: {}'.format('acc {:.4f}% | loss {:.4f}').format(
                total_acc / total_num * 100, total_loss / total_num))

            with open('./data/{}/outputs/{}/{}_{}_{}.{}_bleu.txt'.format(opt.dataset, opt.model,
                    opt.model, opt.dataset, opt.order, opt.style), 'a') as fbl:

                fbl.write('Bleu score at step {}: {:.4f};  Acc: {:.4f}\n'.format(step, bleu, total_acc / total_num * 100))
            acc = total_acc / total_num * 100
            fitlog.add_metric(acc, name="Acc", step=step)
            if bleu == best_bleu:
                best_acc = acc
                fitlog.add_best_metric({"dev": {"Acc": best_acc}})

fitlog.finish()

if __name__ == "__main__":
    main()

NameError: name 'fitlog' is not defined