In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored on Drive can be read and written by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Make the project folder on the mounted Drive the working directory;
# relative paths used later (e.g. the 'model/' checkpoint directory) resolve against it.
%cd /content/drive/MyDrive/FakeNewsDetection

/content/drive/MyDrive/FakeNewsDetection


In [None]:
!pip install transformers==4.18.0 fugashi==1.1.0 ipadic==1.0.0 pytorch-lightning==1.6.1

import random
import glob
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl

# Pre-trained Japanese BERT model; this identifier is passed to
# `from_pretrained` for both the tokenizer and the classifier.
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'



In [None]:
import pandas as pd
import numpy as np

# Single source of truth for the project folder on the mounted Drive, so the
# location only has to be changed in one place.
DATA_DIR = "/content/drive/MyDrive/FakeNewsDetection"

# Training data; make_dataset reads its 'text' and 'isFake' columns.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
# Data to generate predictions for; only its 'text' column is read.
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Submission frame; its 'isFake' column is filled with the predictions later.
submit = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
def make_dataset(df, max_length, tokenizer, label_column='isFake'):
  """Tokenize every row of ``df`` into a dict of tensors.

  Args:
    df: DataFrame with a 'text' column and, unless ``label_column`` is None,
      a column named ``label_column``.
    max_length: Length every sequence is padded / truncated to.
    tokenizer: Callable invoked as
      ``tokenizer(text, max_length=..., padding='max_length', truncation=True)``
      returning a mapping from field name to a list of values.
    label_column: Column whose value is stored under the 'labels' key of each
      example. Pass None for unlabelled data to omit the 'labels' key.

  Returns:
    A list with one dict per row of ``df``; each dict maps a field name to a
    ``torch.Tensor``.

  Raises:
    KeyError: If a row lacks the 'text' column or the requested label column.
  """
  dataset = []
  for _, row in df.iterrows():
    encoding = tokenizer(
        row['text'],
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    if label_column is not None:
      # Stored under 'labels' because the examples are later unpacked as
      # keyword arguments of the model call (self.bert_sc(**batch)).
      encoding['labels'] = row[label_column]
    dataset.append({k: torch.tensor(v) for k, v in encoding.items()})
  return dataset

In [None]:
# Seed Python's `random`, NumPy and PyTorch so that the train/validation split
# and the order of the training batches are reproducible between runs.
pl.seed_everything(42, workers=True)

max_length = 128  # every text is padded / truncated to this many tokens
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

# Tokenize the labelled data once, then shuffle it before splitting.
dataset_for_loader = make_dataset(train, max_length, tokenizer)
random.shuffle(dataset_for_loader)

# First 70 % of the shuffled examples for training, the rest for validation.
n = len(dataset_for_loader)
n_train = int(0.7*n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:]

# Training batches are reshuffled; validation batches are served in order.
dataloader_train = DataLoader(
    dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)

In [None]:
class BertForSequenceClassification_pl(pl.LightningModule):
    """LightningModule wrapping ``BertForSequenceClassification``.

    Batches are dicts that are unpacked as keyword arguments of the wrapped
    model. Training and validation log the loss returned by the model; testing
    collects the arg-max class index of every example in ``self.outputs``.
    """

    def __init__(self, model_name, num_labels, lr):
        """Load the pre-trained classifier.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            num_labels: Number of target classes.
            lr: Learning rate used by the Adam optimizer.
        """
        super().__init__()
        # Makes the constructor arguments available as self.hparams
        # (configure_optimizers reads self.hparams.lr).
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        # Predicted class indices gathered by test_step.
        self.outputs = []

    def training_step(self, batch, batch_idx):
        """Run one training batch, log 'train_loss' and return the loss."""
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Run one validation batch and log 'val_loss'."""
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def on_test_start(self):
        """Discard predictions left over from a previous test run.

        test_step only ever appends, so without this reset a second
        ``trainer.test`` call would return the old predictions followed by the
        new ones. A fresh list is bound (instead of clearing in place) so a
        list a caller already holds is not mutated.
        """
        self.outputs = []

    def test_step(self, batch, batch_idx):
        """Append the predicted class index of every example in the batch."""
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        self.outputs.extend(labels_predicted.tolist())

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with the configured lr."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [None]:
# Checkpointing: watch 'val_loss' and keep only the single checkpoint with the
# lowest value, saving the weights alone, under model/.
checkpoint = pl.callbacks.ModelCheckpoint(
    dirpath='model/',
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
)

# One GPU, at most 10 epochs, with the checkpoint callback attached.
trainer = pl.Trainer(
    callbacks=[checkpoint],
    max_epochs=10,
    gpus=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Binary classifier on top of the pre-trained model, fine-tuned with lr = 1e-5.
model = BertForSequenceClassification_pl(
    model_name=MODEL_NAME,
    num_labels=2,
    lr=1e-5,
)

# Fine-tune on the training loader, evaluating on the validation loader.
trainer.fit(
    model,
    train_dataloaders=dataloader_train,
    val_dataloaders=dataloader_val,
)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# Report which checkpoint the callback kept and the validation loss it scored.
best_model_path = checkpoint.best_model_path
best_model_score = checkpoint.best_model_score
print('ベストモデルのファイル: ', best_model_path)
print('ベストモデルの検証データに対する損失: ', best_model_score)

ベストモデルのファイル:  /content/drive/MyDrive/FakeNewsDetection/model/epoch=7-step=664.ckpt
ベストモデルの検証データに対する損失:  tensor(0.0718, device='cuda:0')


In [None]:
def make_dataset_test(df, max_length, tokenizer):
  """Tokenize the 'text' column of ``df`` into unlabelled examples.

  Args:
    df: DataFrame with a 'text' column.
    max_length: Length every sequence is padded / truncated to.
    tokenizer: Callable invoked as
      ``tokenizer(text, max_length=..., padding='max_length', truncation=True)``
      returning a mapping from field name to a list of values.

  Returns:
    A list with one dict per row, mapping each field name to a ``torch.Tensor``.
  """
  def encode(text):
    # Tokenize a single text and convert every returned field to a tensor.
    fields = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    return {name: torch.tensor(values) for name, values in fields.items()}

  return [encode(record['text']) for _, record in df.iterrows()]

In [None]:
dataset_test = make_dataset_test(test, max_length, tokenizer)
dataloader_test = DataLoader(dataset_test, batch_size=256)

# test_step appends to model.outputs, so start from an empty list; otherwise
# re-running this cell leaves the previous predictions in front of the new ones.
model.outputs = []

# Name the model and the checkpoint explicitly instead of relying on the
# trainer's implicit fallback when `test` is called without a model: the
# weights of the best checkpoint (lowest 'val_loss') are loaded before predicting.
trainer.test(model, dataloaders=dataloader_test, ckpt_path='best')

In [None]:
# Store the class indices that test_step collected in model.outputs as the
# 'isFake' column of the submission frame.
submit['isFake'] = model.outputs

In [None]:
# Write the submission frame to the project folder on Drive, without the index column.
submit.to_csv('/content/drive/MyDrive/FakeNewsDetection/submit.csv', index=False)

In [None]:
# Show the GPU attached to this runtime and its current memory usage.
!nvidia-smi

Sat May  7 08:15:00 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    71W / 149W |    590MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
# Create a git repository in the current working directory (the project folder).
!git init

Initialized empty Git repository in /content/drive/MyDrive/FakeNewsDetection/.git/


In [19]:
!git add .

!git config --global user.email "ryoyanagi0813@gmail.com"
!git config --global user.name "r-yanagimoto"

!git commit -m "BERTで実装"

error: open("train.gsheet"): Operation not supported
error: unable to index file train.gsheet
fatal: adding files failed
On branch master

Initial commit

Untracked files:
	[31mFakeNewsDetection.ipynb[m
	[31mdata_explanation.xlsx[m
	[31mlightning_logs/[m
	[31mmodel/[m
	[31msample_submission.csv[m
	[31msubmit.csv[m
	[31mtest.csv[m
	[31mtrain.csv[m
	[31mtrain.gsheet[m

nothing added to commit but untracked files present


In [None]:
!git remote set-url origin https://ryo3568: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@github.com/ryo3568/BERT_introduction.git
!git push origin r-yanagimoto