In [50]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0 pytorch-lightning==1.2.10 daaja



In [73]:
import random
import pandas as pd
import torch
from torch.utils.data import DataLoader
from daaja.eda import EasyDataAugmentor
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl

In [23]:
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz 
!tar -zxf ldcc-20140209.tar.gz 

--2022-02-13 04:46:31--  https://www.rondhuit.com/download/ldcc-20140209.tar.gz
Resolving www.rondhuit.com (www.rondhuit.com)... 59.106.19.174
Connecting to www.rondhuit.com (www.rondhuit.com)|59.106.19.174|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8855190 (8.4M) [application/x-gzip]
Saving to: ‘ldcc-20140209.tar.gz.1’


2022-02-13 04:46:35 (2.41 MB/s) - ‘ldcc-20140209.tar.gz.1’ saved [8855190/8855190]



In [26]:
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

In [27]:
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

In [20]:
augmentor = EasyDataAugmentor(alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=4)

['日本語でデータ拡張を行う', '伸す日本語でデータ拡張を行う', '日本語で資料拡張を行う', '日本語でデータ拡張を行う', '日本語でデータ拡張を行う']


In [48]:
from collections import defaultdict
category_list = [
    'dokujo-tsushin',
    'it-life-hack',
    'kaden-channel',
    'livedoor-homme',
    'movie-enter',
    'peachy',
    'smax',
    'sports-watch',
    'topic-news'
]

datasets = []
for label, category in enumerate(tqdm(category_list)):
    for file in glob.glob(f'./text/{category}/{category}*'):
        lines = open(file).read().splitlines()
        text = '\n'.join(lines[3:])
        datasets.append([label, text])


100%|██████████| 9/9 [00:00<00:00, 10.61it/s]


In [51]:
random.shuffle(datasets)
n = len(datasets)
n_train = int(0.6*n)
n_val = int(0.2*n)
dataset_train = datasets[:n_train]
dataset_val = datasets[n_train:n_train+n_val]
dataset_test = datasets[n_train+n_val:]

In [57]:
class BertForSequenceClassification_pl(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters() 

        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        
    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss
        
    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        labels = batch.pop('labels')
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = ( labels_predicted == labels ).sum().item()
        accuracy = num_correct/labels.size(0)
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [None]:
# Data Augment
dataset_subset = dataset_train[:500]
alpha, n = 0.1, 16
augmentor = EasyDataAugmentor(alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=n)

augment_dataset = []
for label, text in tqdm(dataset_subset):
    for aug_text in augmentor.augments(text):
        augment_dataset.append([label, aug_text])

In [88]:
print("All train dataset: {}".format(len(dataset_train)))
print("train dataset: {}".format(len(dataset_subset)))
print("augment train dataset: {}".format(len(augment_dataset)))

All train dataset: 4420
train dataset: 500
augment train dataset: 8500


In [89]:
def create_dataset_for_loader(dataset):
    dataset_for_loader = []
    for dataset_i in dataset:
        label, text = dataset_i
        encoding = tokenizer(
            text,
            max_length=max_length, 
            padding='max_length',
            truncation=True
        )
        encoding['labels'] = label
        encoding = { k: torch.tensor(v) for k, v in encoding.items() }
        dataset_for_loader.append(encoding)
    return dataset_for_loader

In [90]:
all_train_dataset_for_loader = create_dataset_for_loader(dataset_train)
subset_train_dataset_for_loader = create_dataset_for_loader(dataset_subset)
augment_train_dataset_for_loader = create_dataset_for_loader(augment_dataset)
val_dataset_for_loader = create_dataset_for_loader(dataset_val)
test_dataset_for_loader = create_dataset_for_loader(dataset_test)

In [91]:
dataloader_val = DataLoader(val_dataset_for_loader, batch_size=256)
dataloader_test = DataLoader(test_dataset_for_loader, batch_size=256)

In [92]:
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

trainer = pl.Trainer(
    gpus=1, 
    max_epochs=10,
    callbacks = [checkpoint]
)

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores


In [93]:
# Subset
subset_dataloader_train = DataLoader(
    subset_train_dataset_for_loader, batch_size=32, shuffle=True
) 


model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=9, lr=1e-5
)

# Subset training
trainer.fit(model, subset_dataloader_train, dataloader_val) 

# Subset accuracy
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'accuracy': 0.7618724703788757}
--------------------------------------------------------------------------------
Accuracy: 0.76


In [94]:
# Data Augmentation
augment_dataloader_train = DataLoader(
    augment_train_dataset_for_loader, batch_size=32, shuffle=True
) 


model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=9, lr=1e-5
)

# Data Augmentation training
trainer.fit(model, augment_dataloader_train, dataloader_val) 

# Data Augmentation accuracy
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

Validation sanity check: 0it [00:00, ?it/s]

Training: 15it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'accuracy': 0.7788330912590027}
--------------------------------------------------------------------------------
Accuracy: 0.78
