In [None]:
!nvidia-smi -L

In [None]:
!pip install -q --upgrade wandb GPUtil transformers==4.12.2 fugashi mecab-python3 ipadic colorama pytorch-lightning python-box 

In [None]:
import os
import gc
import copy
import time
import random
import string
from typing import List,Dict,Tuple

import datetime
from datetime import datetime, timedelta, timezone

import math

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AdamW, AutoConfig, get_cosine_schedule_with_warmup

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import pickle
import re
import unicodedata
from box import Box

import GPUtil
import regex
import scipy as sp
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, AdamW

import pytorch_lightning as pl
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning import callbacks
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping,LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import LightningDataModule, LightningModule

  
warnings.filterwarnings('ignore')

In [None]:
# Log in to Weights & Biases; the training loop below logs each fold through WandbLogger.
import wandb
wandb.login()

In [None]:
# Specify the experiment number and the output folder
from pathlib import Path
exp_num='001'
output_dir=Path('')
os.makedirs(output_dir, exist_ok=True)

# Build the config
# Top-level keys are read throughout the notebook:
#   seed / n_splits   -> set_seed() and StratifiedKFold
#   root              -> prefix of the training CSV path
#   epoch / max_len   -> Trainer max_epochs and tokenizer max_length
#   model             -> name passed to AutoTokenizer / AutoModel .from_pretrained
config = {'seed': 2022,
          'root': '',
          'n_splits': 10,
          'epoch': 5,
          'max_len': 512,
          'gradient_checkpointing_enable': False,
          'num_labels': 2,
          #'stride': 128,
          'model': r'cl-tohoku/bert-base-japanese-whole-word-masking',
          # Forwarded as keyword arguments to pl.Trainer(**config.trainer).
          'trainer': {
              'gpus': 1,
              # Also used by the model to size the LR schedule.
              'accumulate_grad_batches':2,
            #   'progress_bar_refresh_rate': 1,
              'fast_dev_run': False,
              'num_sanity_val_steps': 0,
              'resume_from_checkpoint': None,
              'deterministic':True,
              'val_check_interval': 1.0,
              #'precision': 16
          },

          # Forwarded as keyword arguments to DataLoader for the training split.
          'train_loader':{
              'batch_size': 16,
              'shuffle': True,
              'num_workers': 4,
              'pin_memory': False,
              'drop_last': True,
          },

          # Forwarded as keyword arguments to DataLoader for the validation split.
          'val_loader': {
              'batch_size': 64,
              'shuffle': False,
              'num_workers': 4,
              'pin_memory': False,
              'drop_last': False
         },

          # 'name' is a dotted expression that the model resolves with eval().
          'optimizer':{
              'name': 'optim.AdamW',
              'params':{
                  'lr': 2.5e-5
              },
          },
        #   'scheduler':{
        #       'name': 'get_cosine_schedule_with_warmup',
        #   },
          # Takes logits as input and computes CE
          # The BCE-with-logits loss also takes logits as input
          # (this string is likewise resolved with eval() inside the model)
          'loss': 'nn.BCEWithLogitsLoss',
          'logger':{
              'project':'FakeNews',
              'group':f'exp{exp_num}'
          },
          # Directory handed to ModelCheckpoint(dirpath=...) in the training loop.
          'params_dir':f'{output_dir}'
}

# Wrap in Box so nested keys can be read with attribute access (config.trainer.gpus, ...).
config = Box(config)

In [None]:
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value used for every generator; also exported (as a string)
            through the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for each generator, in the order: stdlib, NumPy, torch CPU, torch CUDA.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs with the configured seed before any data loading or model construction.
set_seed(config.seed)

In [None]:
# Load the training table; the cells below read its 'text' and 'isFake' columns.
df = pd.read_csv(config.root+"/data/input/train.csv")
df

In [None]:
# Number of rows per value of the 'isFake' target.
df["isFake"].value_counts()

In [None]:
#簡単な統計データ
df['textlen']=df['text'].apply(lambda x:len(x))
df.groupby('isFake')['textlen'].agg(['mean', 'std', 'max', 'min'])

In [None]:
# Stratified split on the target so every fold keeps the same isFake ratio.
skf = StratifiedKFold(n_splits=config.n_splits, shuffle=True, random_state=config.seed)

# split() yields *positional* row indices. Writing them into a positional integer
# array (instead of label-based df.loc) keeps the assignment correct even when
# df does not carry a default 0..n-1 index, and avoids a float/NaN column.
fold_ids = np.full(len(df), -1, dtype=int)
for fold, (_, val_idx) in enumerate(skf.split(X=df, y=df["isFake"])):
    fold_ids[val_idx] = fold

# Every row must land in exactly one validation fold.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"

df["kfold"] = fold_ids
df.head()

In [None]:
# Dataset with label smoothing
class FakenewsDataset(Dataset):
    """Tokenises one text per item and returns a label-smoothed target vector.

    Args:
        df: DataFrame providing a ``'text'`` column and an integer ``'isFake'`` column.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_length: Sequences are truncated and padded to exactly this many tokens.
        num_labels: Length of the target vector. ``None`` (default) falls back to
            the notebook-level ``config.num_labels``, as the original code did.
        alpha: Label-smoothing amount. Non-target entries are ``alpha`` and the
            target entry is ``1 - 2 * alpha``.
    """

    def __init__(self, df, tokenizer, max_length, num_labels=None, alpha=0.0025):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.text = df['text'].values
        self.target = df['isFake'].values
        self.alpha = alpha
        # Resolve once so __getitem__ does not depend on a module-level global.
        self.num_labels = config.num_labels if num_labels is None else num_labels

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'ids', 'mask', 'target'}`` tensors for row ``index``."""
        text = self.text[index]
        inputs_text = self.tokenizer.encode_plus(
                                text,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )

        target = self.target[index]

        # Label smoothing: every class starts at alpha, the true class gets 1 - 2*alpha.
        onehot_t = np.full(self.num_labels, self.alpha, dtype=np.float32)
        onehot_t[target] = 1 - 2*self.alpha

        ids = inputs_text['input_ids']
        mask = inputs_text['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'target': torch.tensor(onehot_t, dtype=torch.float)
        }

In [None]:
class FakeNewsModel(pl.LightningModule):
    """Pretrained transformer encoder with a linear classification head.

    The encoder named by ``config.model`` is loaded through ``AutoModel``.
    Element ``[1]`` of its output (the pooled output for BERT models) is fed
    through dropout (training only) and a linear layer that produces
    ``config.num_labels`` logits. Training minimises the loss named by
    ``config.loss``; validation logs F1 and accuracy of the arg-max class.

    Args:
        config: Box-style config. Attributes read here: ``model``,
            ``gradient_checkpointing_enable``, ``num_labels``, ``optimizer``,
            ``loss``, ``epoch`` and ``trainer.accumulate_grad_batches``.
        t_dataloader: Loader returned by ``train_dataloader()``; its length
            is also used to size the learning-rate schedule.
        v_dataloader: Loader returned by ``val_dataloader()``.
    """

    def __init__(self, config, t_dataloader, v_dataloader):
        super().__init__()
        self.cfg=config
        self.save_hyperparameters(config)

        #dataloader
        self._train_dataloader=t_dataloader
        self._valid_dataloader=v_dataloader

        # hidden_dropout_prob: float = 0.1
        # layer_norm_eps: float = 1e-7

        transformer_config = AutoConfig.from_pretrained(self.cfg.model)

        self.transformer = AutoModel.from_pretrained(self.cfg.model, config=transformer_config)

        if self.cfg.gradient_checkpointing_enable:
            self.transformer.gradient_checkpointing_enable()

        # #reinit2
        # for layer in self.transformer.encoder.layer[-2:]:
        #     for module in layer.modules():
        #         self._init_weights(module)

        # Head: dropout rate and input width both come from the encoder's own config.
        self.dropout = nn.Dropout(transformer_config.hidden_dropout_prob)
        self.output = nn.Linear(transformer_config.hidden_size, self.cfg.num_labels)

    def _init_weights(self, module):
        """Re-initialise ``module`` using the encoder config's ``initializer_range``.

        Only referenced by the commented-out re-init block in ``__init__``.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.transformer.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.transformer.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def configure_optimizers(self):
        """Build the optimizer (two weight-decay groups) and a per-step OneCycleLR.

        Returns:
            Lightning optimizer/scheduler dict with the scheduler stepped every
            optimizer step.
        """
        param_optimizer = list(self.named_parameters())

        # Biases and LayerNorm parameters are excluded from weight decay.
        # Previously both groups used 1e-6 (so the split had no effect) and
        # LayerNorm.weight was not listed.
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 1e-6,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        # NOTE: eval() of a config string. Acceptable only because the config is
        # defined inside this notebook; never feed it externally supplied values.
        optimizer = eval(self.cfg.optimizer.name)(optimizer_parameters, lr=self.cfg.optimizer.params.lr)

        # Optimizer steps per epoch = batches / accumulation (rounded up), times epochs.
        num_training_steps=math.ceil(len(self._train_dataloader)/self.cfg.trainer.accumulate_grad_batches)*self.cfg.epoch

        scheduler = lr_scheduler.OneCycleLR(optimizer,max_lr=self.cfg.optimizer.params.lr, total_steps=num_training_steps)

        return {
            "optimizer": optimizer,
            "lr_scheduler": {
            "scheduler": scheduler,
            'interval': 'step'
            },
        }

    def forward(self, batch: Dict[str, torch.Tensor]):
        """Run the encoder and return element ``[1]`` of its output.

        This is the encoder's pooled representation, not class logits: the
        ``output`` head is applied by ``training_step`` / ``validation_step``.

        Args:
            batch: Mapping with ``'ids'`` (input ids) and ``'mask'`` (attention mask).
        """
        encoded = self.transformer(input_ids=batch['ids'], attention_mask=batch['mask'])
        return encoded[1]

    def _loss(self, out, label):
        """Instantiate the loss named in ``cfg.loss`` and apply it to ``(out, label)``."""
        # Compute the loss. NOTE: eval() of a config string, see configure_optimizers.
        loss_fct = eval(self.cfg.loss)()
        loss = loss_fct(out, label)
        return loss

    def training_step(self, batch, batch_idx):
        """Encoder -> dropout -> linear head -> loss; logs ``train_loss`` every step."""
        out=self.forward(batch)
        out = self.dropout(out)

        label=batch['target']

        out=self.output(out)

        loss=self._loss(out, label)
        self.log_dict({"train_loss":loss}, on_step=True, prog_bar=True)

        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Return arg-max predictions and arg-max of the (smoothed) target vectors."""
        out=self.forward(batch)
        out=self.output(out)
        out=torch.argmax(out, dim=1)

        # Targets are smoothed one-hot vectors; arg-max recovers the class index.
        label=batch['target']
        label=torch.argmax(label, dim=1)
        return {"preds": out, 'labels': label}

    def validation_epoch_end(self, outputs):
        """Concatenate all step outputs and log ``valid_f1`` and ``valid_acc``."""
        preds = [out['preds'] for out in outputs]
        labels =  [out['labels'] for out in outputs]

        preds = torch.hstack(preds).to('cpu').detach().numpy()
        labels = torch.hstack(labels).to('cpu').detach().numpy()

        f1 = f1_score(labels, preds, average="binary")
        acc = accuracy_score(labels, preds)
        self.log('valid_f1', f1)
        self.log('valid_acc', acc)

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._valid_dataloader



# train

In [None]:
# Tokenizer matching the pretrained encoder named in config.model.
tokenizer=AutoTokenizer.from_pretrained(config.model)

In [None]:
# Train one model per fold: rows with kfold == fold are validation, the rest training.
for fold in range(config.n_splits):

    print(fold)

    # Re-seed so every fold starts from the same RNG state.
    set_seed(config.seed)

    train_df = df[df["kfold"] != fold].reset_index(drop=True)
    valid_df = df[df["kfold"] == fold].reset_index(drop=True)

    train_dataset = FakenewsDataset(train_df, tokenizer, config.max_len)
    val_dataset = FakenewsDataset(valid_df, tokenizer, config.max_len)

    train_dataloader=DataLoader(train_dataset, **config.train_loader)
    val_dataloader=DataLoader(val_dataset, **config.val_loader)

    model = FakeNewsModel(config, train_dataloader, val_dataloader)

    lr_monitor = LearningRateMonitor()
    # Keep only the checkpoint with the best validation accuracy for this fold.
    loss_checkpoint = ModelCheckpoint(
        dirpath=config.params_dir,
        filename=f"{fold+1}fold_best_metrics",
        monitor="valid_acc",
        #save_weights_only=True,
        save_top_k=1,
        mode="max",
        save_last=False,
    )

    wandb_logger = WandbLogger(
                            project=config.logger.project,
                            group='exp' + exp_num,
                            name=f'{fold+1}fold'
                        )

    trainer = pl.Trainer(
        logger=wandb_logger,
        max_epochs=config.epoch,
        callbacks=[
                   lr_monitor,
                   loss_checkpoint,
                   ],
        **config.trainer,
    )

    trainer.fit(model)

    # Close this fold's W&B run so the next fold starts a fresh one.
    wandb.finish()

    # Free memory. The trainer keeps a reference to the module, so deleting
    # `model` alone would leave its weights alive and make gc.collect() /
    # empty_cache() ineffective until `trainer` is rebound in the next fold.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()


## 予測値保存

In [None]:
class FakenewsInferenceModel(nn.Module):
    """Plain ``nn.Module`` version of the classifier, used to score saved checkpoints.

    Layout: pretrained encoder -> dropout(0.1) -> linear(hidden_size, 2) -> sigmoid.

    Args:
        model_name: Name or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size
        self.drop = nn.Dropout(p=0.1)
        self.output = nn.Linear(hidden_size, 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, ids, mask):
        """Return per-class sigmoid scores for the given input ids and attention mask."""
        encoded = self.transformer(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Element [1] of the encoder output feeds the classification head.
        pooled = self.drop(encoded[1])
        return self.sigmoid(self.output(pooled))

In [None]:
# Reload the tokenizer for config.model (same call as in the training section).
tokenizer=AutoTokenizer.from_pretrained(config.model)

In [None]:
# Out-of-fold prediction: score each fold's validation rows with that fold's checkpoint.
valid_dfs=[]
for fold in range(config.n_splits):

    valid_df = df[df["kfold"] == fold].reset_index(drop=True)
    val_dataset = FakenewsDataset(valid_df, tokenizer, config.max_len)
    val_dataloader=DataLoader(val_dataset, **config.val_loader)

    model=FakenewsInferenceModel(config.model)
    weight=output_dir / f'{fold+1}fold_best_metrics.ckpt'
    # Load onto the CPU first: without map_location every tensor in the
    # checkpoint is restored to the device it was saved from. The checkpoint
    # was written with save_weights_only left at its default, so it presumably
    # holds more than the weights; only 'state_dict' is needed here.
    checkpoint = torch.load(weight, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    del checkpoint

    preds=[]
    model.to('cuda')
    model.eval()
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            ids=batch['ids'].to('cuda', dtype = torch.long)
            mask=batch['mask'].to('cuda', dtype = torch.long)
            output=model(ids, mask)
            output=output.cpu().detach().numpy()

            preds.append(output)
    preds=np.concatenate(preds)

    # Column 0 of the model output is stored as negative_pred, column 1 as positive_pred.
    valid_df[['negative_pred', 'positive_pred']]=preds

    valid_dfs.append(valid_df)


    

In [None]:
 # Stack the per-fold frames into one out-of-fold table and summarise positive_pred per label.
 # NOTE(review): the one-space indent on this cell is tolerated by IPython but would be an
 # IndentationError in a plain .py export.
 valid_df=pd.concat(valid_dfs)
 valid_df.groupby(['isFake']).agg({'positive_pred':['min', 'max', 'median', 'mean']})

In [None]:
# Predicted class = index of the larger of the two score columns (0 = negative, 1 = positive).
pred=np.argmax(valid_df[['negative_pred', 'positive_pred']].values, axis=1)

In [None]:
# Overall out-of-fold accuracy, rounded to 4 decimals.
# Builtin round() is used instead of the scalar's .round() method so this works
# whether accuracy_score returns a NumPy scalar or a plain Python float.
valid_acc=round(float(accuracy_score(valid_df.isFake.values, pred)), 4)
valid_acc

In [None]:
# Save the out-of-fold predictions; the accuracy is embedded in the file name.
valid_df.to_csv(f'{output_dir}/acc{valid_acc}_valid_pred.csv', index=False)