In [1]:
# !pip install transformers==4.18.0 fugashi==1.1.0 ipadic==1.0.0 pytorch-lightning==1.6.1

import random
import glob
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl

# 日本語の事前学習モデル
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

submit = pd.read_csv("data/sample_submission.csv")

In [3]:
def make_dataset(df, max_length, tokenizer):
  dataset = []
  for index, row in df.iterrows():
    encoding = tokenizer(
        row['text'],
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    encoding['labels']=row['isFake']
    encoding = { k: torch.tensor(v) for k, v in encoding.items() }
    dataset.append(encoding)
  return dataset

In [4]:
max_length = 128
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

dataset_for_loader = make_dataset(train, max_length, tokenizer)
random.shuffle(dataset_for_loader)
n = len(dataset_for_loader)
n_train = int(0.7*n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:]
# dataset_test = make_dataset(test, max_length, tokenizer)

dataloader_train = DataLoader(
    dataset_train, batch_size=1, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
# dataloader_test = DataLoader(dataset_test, batch_size=256)

In [5]:
class BertForSequenceClassification_pl(pl.LightningModule):
        
    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters() 
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        self.outputs = []
        
    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss
        
    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        self.outputs.extend(labels_predicted.tolist())

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [6]:
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

trainer = pl.Trainer(
    gpus=1, 
    max_epochs=10,
    callbacks = [checkpoint]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [7]:
model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=2, lr=1e-5
)

trainer.fit(model, dataloader_train, dataloader_val) 

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0: 100%|█████████▉| 2646/2651 [08:03<00:00,  5.48it/s, loss=0.437, v_num=1] 

RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 3.80 GiB total capacity; 2.03 GiB already allocated; 137.94 MiB free; 2.18 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
best_model_path = checkpoint.best_model_path
print('ベストモデルのファイル: ', checkpoint.best_model_path)
print('ベストモデルの検証データに対する損失: ', checkpoint.best_model_score)

ベストモデルのファイル:  /content/drive/MyDrive/FakeNewsDetection/model/epoch=7-step=664.ckpt
ベストモデルの検証データに対する損失:  tensor(0.0718, device='cuda:0')


In [None]:
def make_dataset_test(df, max_length, tokenizer):
  dataset = []
  for index, row in df.iterrows():
    encoding = tokenizer(
        row['text'],
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    encoding = { k: torch.tensor(v) for k, v in encoding.items() }
    dataset.append(encoding)
  return dataset

In [None]:
dataset_test = make_dataset_test(test, max_length, tokenizer)
dataloader_test = DataLoader(dataset_test, batch_size=256)
trainer.test(dataloaders=dataloader_test)

In [None]:
submit['isFake'] = model.outputs

In [None]:
submit.to_csv('/content/drive/MyDrive/FakeNewsDetection/submit.csv', index=False)