<a href="https://colab.research.google.com/github/haruki-N/FakeNewsDetection/blob/main/make_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **参加コンペ**：[Nishika Fake News Detection](https://www.nishika.com/competitions/27/summary)

# 諸々のinstall

In [None]:
# Pinned dependencies for this notebook.
# torchmetrics is imported directly further down (`Accuracy`), so it is pinned
# here too instead of being left to whatever version pytorch-lightning pulls in.
! pip install \
  cytoolz==0.11.2 \
  fugashi==1.1.1 \
  ipadic==1.0.0 \
  mecab-python3==0.996.5 \
  torchtyping==0.1.4 \
  transformers==4.12.2 \
  pytorch-lightning==1.6.3 \
  torchmetrics==0.8.2 \
  scikit-learn==1.0.2



# Preparation

In [None]:
# Mount Google Drive so the notebook can read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import random
import os
import numpy as np
import pandas as pd
import MeCab

import torch

In [None]:
# Reproducibility: seed every RNG used in this notebook with the same value.
seed = 42

for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

# Ask cuDNN for deterministic algorithms and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [None]:
# Project root on the mounted Drive.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because later cells read it.
dir = "/content/drive/MyDrive/FakeNewsDetection"
data_dir = os.path.join(dir, 'data')

# Competition data: labelled training set and unlabelled test set.
train_data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test_data = pd.read_csv(os.path.join(data_dir, 'test.csv'))

In [None]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,id,isFake,text
0,d19828eb64,1,Cによると、アメリカの元大統領で、最長寿だったジョージ・ウォーカー・ブッシュ氏が27日(C-...
1,dfaab096bd,0,中日新聞によると、コナミカップ・プロ野球アジアシリーズ2007の決勝戦・日本の中日ドラゴンズ...
2,163504bf95,1,愛媛Cは、11月12日にリーグ準加盟の承認を受けて、来期リーグ加盟を目指す愛媛Cに対して、鈴...
3,ed3c9dc579,0,国民日報によると3日、7時50分（UTC+9、日本時間と同じ）大韓民国京畿道平沢市の西海岸（...
4,e06f88267f,1,共同通信によると、5日午後2時過ぎから東京都、神奈川県、千葉県の3都県の広い範囲の地域で停電...


In [None]:
from transformers import BertJapaneseTokenizer

# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)

## tokenizeの例

In [None]:
# Encode the first training text to inspect what the tokenizer returns.
text = train_data['text'].iloc[0]
tokenizer(text)

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


{'input_ids': [2, 184, 7, 1517, 13, 6, 286, 5, 281, 1579, 12, 6, 113, 14853, 308, 10, 2707, 35, 18597, 28472, 26583, 643, 14, 971, 32, 23, 184, 61, 76, 222, 4441, 2140, 5, 4574, 12, 1589, 15, 10, 8, 11289, 1579, 13, 2707, 35, 472, 35, 11289, 174, 1014, 1367, 1579, 9, 6, 171, 11, 15438, 7, 1878, 19602, 28458, 20, 10, 8, 18597, 643, 9, 1241, 19, 6, 1609, 5, 26596, 756, 2375, 14, 8723, 51, 7, 1589, 15, 10, 13, 5443, 84, 16, 21, 10, 8, 5741, 10, 83, 6, 11289, 1579, 13, 11289, 174, 2556, 1579, 146, 1395, 174, 12, 3171, 29281, 28468, 10, 5, 14, 6, 2707, 35, 15385, 643, 13, 2707, 35, 13411, 643, 5, 25, 53, 8, 15385, 643, 9, 6330, 5569, 643, 23, 115, 35, 4900, 28472, 764, 1150, 28564, 281, 2847, 24, 40, 36, 306, 9, 2340, 19693, 6307, 38, 13, 1653, 20, 16, 33, 8, 8763, 9, 324, 12, 2193, 6330, 5569, 643, 5, 125, 14, 1075, 40, 8508, 10, 451, 12, 130, 6, 6330, 5569, 643, 901, 12, 6155, 28, 6330, 5569, 643, 5, 446, 7, 15388, 16, 21, 80, 8, 8763, 9, 6330, 5569, 643, 5, 4231, 6, 59, 6396, 11, 12136, 

In [None]:
# Tokenize every text once, then store the tokens and their count as new columns.
token_lists = train_data['text'].map(tokenizer.tokenize)
train_data['Tokens'] = token_lists
train_data['numTokens'] = token_lists.map(len)
train_data.head()

Unnamed: 0,id,isFake,text,Tokens,numTokens
0,d19828eb64,1,Cによると、アメリカの元大統領で、最長寿だったジョージ・ウォーカー・ブッシュ氏が27日(C-...,"[C, に, よる, と, 、, アメリカ, の, 元, 大統領, で, 、, 最, 長寿,...",718
1,dfaab096bd,0,中日新聞によると、コナミカップ・プロ野球アジアシリーズ2007の決勝戦・日本の中日ドラゴンズ...,"[中日, 新聞, に, よる, と, 、, コナミ, カップ, ・, プロ, 野球, アジア...",426
2,163504bf95,1,愛媛Cは、11月12日にリーグ準加盟の承認を受けて、来期リーグ加盟を目指す愛媛Cに対して、鈴...,"[愛媛, C, は, 、, 11, 月, 12, 日, に, リーグ, 準, 加盟, の, ...",294
3,ed3c9dc579,0,国民日報によると3日、7時50分（UTC+9、日本時間と同じ）大韓民国京畿道平沢市の西海岸（...,"[国民, 日報, に, よる, と, 3, 日, 、, 7, 時, 50, 分, (, UT...",383
4,e06f88267f,1,共同通信によると、5日午後2時過ぎから東京都、神奈川県、千葉県の3都県の広い範囲の地域で停電...,"[共同, ##通信, に, よる, と, 、, 5, 日, 午後, 2, 時, 過ぎ, から...",314


## Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
  """Dataset pairing tokenizer encodings with their labels.

  Args:
    encoded_texts: Mapping from field name to a tensor indexed by sample
      position along its first dimension.
    labels: Sequence of labels, one per sample.
  """

  def __init__(self, encoded_texts, labels):
    self.encodings = encoded_texts
    self.labels = labels

  def __len__(self):
    """Return the number of samples, taken from the number of labels."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return a dict with a detached copy of each encoding field plus 'labels'."""
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = values[idx].clone().detach()
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

In [None]:
from sklearn.model_selection import train_test_split
org_train_texts = train_data['text'].to_list()
org_train_labels = train_data['isFake'].to_list()

# Hold out 20% of the labelled data for validation; the split is seeded.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    org_train_texts,
    org_train_labels,
    test_size=0.2,
    random_state=seed,
)

# Both splits are encoded with the same tokenizer options.
encode_kwargs = dict(add_special_tokens=True, padding=True, truncation=True, return_tensors='pt')
train_encodings = tokenizer.batch_encode_plus(train_texts, **encode_kwargs)
valid_encodings = tokenizer.batch_encode_plus(valid_texts, **encode_kwargs)

train_dataset = MyDataset(train_encodings, train_labels)
valid_dataset = MyDataset(valid_encodings, valid_labels)

## Model

In [None]:
from transformers import BertConfig, BertForSequenceClassification

# Classification head with two output labels.
config = BertConfig.from_pretrained(model_name, num_labels=2)

# Load the pretrained weights into a sequence-classification model built from `config`.
model = BertForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torchmetrics.classification import Accuracy
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AdamW

class LitModel(pl.LightningModule):
  """Lightning wrapper that fine-tunes a sequence-classification model.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
      whose output exposes ``.logits`` and, when labels are passed, ``.loss``.
    lr: Learning rate handed to the AdamW optimizer.
  """

  def __init__(self, model, lr=1e-5):
    super().__init__()
    self.lm = model
    # Not used inside this class; kept so the attribute stays available.
    self.judge = lambda x: 1 if x >0.5 else 0
    self.acc_metrics = Accuracy()
    self.lr = lr

  def forward(self, batch):   # for inference
    """Return the predicted class index (argmax over logits) for each sample."""
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']

    output = self.lm(input_ids=input_ids, attention_mask=attention_mask).logits.argmax(dim=1)
    return output

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss for one batch."""
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']

    output = self.lm(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = output.loss

    self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Return the accuracy and loss of one validation batch."""
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']

    # One forward pass yields both the loss and the logits used for accuracy.
    output = self.lm(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    preds = output.logits.argmax(dim=1)
    acc = self.acc_metrics(preds, labels)
    return {"step_val_acc": acc, "step_val_loss": output.loss}

  def validation_epoch_end(self, outputs):
    """Average the per-batch results and log them as ``val_acc`` / ``val_loss``.

    The averages are unweighted means over batches. ``val_acc`` must be logged
    because the LR scheduler in ``configure_optimizers`` monitors it.
    """
    # Keys must match the ones returned by validation_step.
    avg_acc = torch.stack([step_output["step_val_acc"] for step_output in outputs]).mean()
    avg_loss = torch.stack([step_output["step_val_loss"] for step_output in outputs]).mean()

    self.log("val_acc", avg_acc, prog_bar=True)
    self.log("val_loss", avg_loss, prog_bar=True)
    # The metric object accumulates state on every call; clear it for the next epoch.
    self.acc_metrics.reset()

    return {"val_acc": avg_acc, "val_loss": avg_loss}

  def test_step(self, batch, batch_idx):
    """Return the raw model output for one batch; no labels are read."""
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    return self.lm(input_ids=input_ids, attention_mask=attention_mask)

  def configure_optimizers(self):
    """Create AdamW plus a ReduceLROnPlateau scheduler driven by ``val_acc``."""
    optimizer = AdamW(self.parameters(), lr=self.lr)
    # mode="max": the learning rate is halved when val_acc stops improving.
    scheduler = {'scheduler': ReduceLROnPlateau(optimizer, mode="max", patience=2, factor=0.5, verbose=True), "monitor": "val_acc"}
    print("CF;Ir = ", optimizer.param_groups[0]['lr'])
    return [optimizer], [scheduler]

# Train

## DataLoader

In [None]:
def seed_worker(worker_id):
    """Seed Python's and NumPy's RNGs from torch's initial seed.

    Intended as a DataLoader ``worker_init_fn``; ``worker_id`` is accepted but
    not used. The seed is reduced modulo 2**32 before being applied.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(derived_seed)

# Dedicated generator, seeded so the shuffling order is reproducible.
g = torch.Generator()
g.manual_seed(seed)

# Options shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=16, worker_init_fn=seed_worker, generator=g)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)

In [None]:
# NOTE(review): early stopping watches the training loss here, not a validation metric.
early_stopping = EarlyStopping(monitor="train_loss", mode="min")
trainer = pl.Trainer(gpus=1, max_epochs=8, callbacks=[early_stopping])

pl_model = LitModel(model)
trainer.fit(pl_model, train_dataloader, valid_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                          | Params
--------------------------------------------------------------
0 | lm          | BertForSequenceClassification | 110 M 
1 | acc_metrics | Accuracy                      | 0     
--------------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.476   Total estimated model params size (MB)


CF;Ir =  1e-05


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

# Prediction

In [None]:
# Report whether the weights are on a CUDA device; if not, move the Lightning module to `device`.
model_on_gpu = next(model.parameters()).is_cuda
print(model_on_gpu)
if not model_on_gpu:
  pl_model.to(device)

False


In [None]:
def model_pred(row):
  """Predict the class index for a single row.

  Args:
    row: Mapping-like object with a 'text' entry (e.g. a DataFrame row).

  Returns:
    The predicted class index as a Python int.

  Reads the module-level `tokenizer`, `pl_model` and `device`; none of them is
  reassigned here, so no `global` declaration is needed.
  """
  text = row['text']
  encodings = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
  pl_model.eval()
  with torch.no_grad():
    # Call the module rather than .forward() so nn.Module's call machinery (hooks) runs.
    pred = pl_model(encodings).cpu().item()

  return pred


In [None]:
# Start from the sample submission and overwrite its 'isFake' column.
sample_submission_path = os.path.join(dir, 'data/sample_submission.csv')
df_prediction = pd.read_csv(sample_submission_path)

# Row-by-row inference over the test set. The assignment aligns on the index;
# assumes test.csv and sample_submission.csv list rows in the same order — TODO confirm.
predictions = test_data.apply(model_pred, axis=1)
df_prediction['isFake'] = predictions

In [None]:
# Preview the first rows of the submission frame.
df_prediction.head()

Unnamed: 0,id,isFake
0,d253d7b7ac,0
1,fcfe44d0a0,0
2,213caf5cf5,1
3,15aefc8374,1
4,aded40e220,0


## Output model predictions to csv file

In [None]:
# Ask for the output file name interactively, then write the CSV under the project directory `dir`.
file_name = input("csv file name?: ")
output_path = os.path.join(dir, file_name)
df_prediction.to_csv(output_path, index=False)

csv file name?: my_sub_2.csv
