In [None]:
!pip install -q --upgrade wandb GPUtil transformers==4.12.2 fugashi mecab-python3 ipadic colorama pytorch-lightning python-box 

In [None]:
import os
import gc
import copy
import time
import random
import string
from typing import List,Dict,Tuple

import datetime
from datetime import datetime, timedelta, timezone

import math

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AdamW, AutoConfig, get_cosine_schedule_with_warmup

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import pickle
import re
import unicodedata
from box import Box

import GPUtil
import regex
import scipy as sp
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, AdamW

import pytorch_lightning as pl
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning import callbacks
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping,LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import LightningDataModule, LightningModule

  
warnings.filterwarnings('ignore')

In [None]:
from pathlib import Path

# Experiment identifier, and the directory that fold checkpoints are read
# from and submission files are written to.
exp_num = '001'
output_dir = Path('')

# Make sure the directory exists; an already-existing directory is not an error.
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Inference settings, wrapped in a Box so that entries are read as attributes
# (e.g. config.seed, config.test_loader).
config = Box(
    dict(
        seed=2022,
        root='',
        n_splits=10,
        max_len=512,
        num_labels=2,
        model=r'cl-tohoku/bert-base-japanese-whole-word-masking',
        # Keyword arguments that are unpacked into the test DataLoader.
        test_loader=dict(
            batch_size=64,
            shuffle=False,
            num_workers=4,
            pin_memory=False,
            drop_last=False,
        ),
    )
)

In [None]:
def set_seed(seed: int = 42):
    """Seed the random number generators used by this notebook.

    Applies ``seed`` to Python's ``random``, NumPy, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed``, exports it as ``PYTHONHASHSEED`` and sets the
    cuDNN flags to deterministic / non-benchmark.

    Args:
        seed: Value passed to every seeding call. Defaults to 42.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every library-level generator receives the same seed value.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Apply the seed from the config before any data or model work happens.
set_seed(seed=config.seed)

In [None]:
# Load the test set that predictions are generated for, then display it.
df = pd.read_csv(f"{config.root}/data/input/test.csv")
df

In [None]:
# Inference-time dataset: items contain tokenized text only (no label field).
class FakenewsDataset(Dataset):
    """Tokenizes the ``text`` column of a DataFrame one row at a time.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.text = df['text'].values

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'ids', 'mask'}`` long tensors for the row at ``index``."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        # Map the tokenizer's field names onto the names the inference loop reads.
        field_map = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in field_map
        }

In [None]:
# Tokenizer for the configured pretrained model, the test dataset built on it,
# and the DataLoader that batches that dataset with the configured options.
tokenizer = AutoTokenizer.from_pretrained(config.model)
test_dataset = FakenewsDataset(df=df, tokenizer=tokenizer, max_length=config.max_len)
test_loader = DataLoader(dataset=test_dataset, **config.test_loader)

In [None]:
class FakenewsModel(nn.Module):
    """Pretrained transformer encoder with a dropout + linear + sigmoid head.

    Submodule names (``transformer``, ``drop``, ``output``, ``sigmoid``) are kept
    stable so that previously saved state dicts continue to load.

    Args:
        model_name: Name or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output units of the head. Defaults to 2.
    """

    def __init__(self, model_name, num_labels=2):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        backbone_config = self.transformer.config
        self.drop = nn.Dropout(p=backbone_config.hidden_dropout_prob)
        # Take the head's input width from the backbone config instead of
        # hard-coding 768, so backbones with a different hidden size also work.
        self.output = nn.Linear(backbone_config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, ids, mask):
        """Return per-unit sigmoid scores for a batch.

        Args:
            ids: Token id tensor passed to the backbone as ``input_ids``.
            mask: Tensor passed to the backbone as ``attention_mask``.

        Returns:
            Tensor with ``num_labels`` values per example, each in [0, 1].
            A sigmoid is applied per unit (not a softmax), so the values of
            one example need not sum to 1.
        """
        out = self.transformer(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Second element of the backbone output; presumably the pooled
        # representation for BERT-style models — confirm if the backbone changes.
        pooled = self.drop(out[1])
        return self.sigmoid(self.output(pooled))

In [None]:
# Run each fold's checkpoint over the test loader and collect one prediction
# array per fold in `preds_list`.

# Use the GPU when one is available, otherwise fall back to CPU so the cell
# does not crash on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

preds_list = []
for fold in range(config.n_splits):
    model = FakenewsModel(config.model)
    weight = output_dir / f'{fold+1}fold_best_metrics.ckpt'

    # map_location='cpu' keeps the checkpoint tensors off the GPU (the model
    # is still on CPU at this point) and lets GPU-saved checkpoints load on
    # CPU-only machines.
    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    checkpoint = torch.load(weight, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])

    model.to(device)
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            output = model(ids, mask)
            # No gradients are tracked under no_grad, so .cpu().numpy() suffices.
            preds.append(output.cpu().numpy())

    # Stack this fold's batch outputs into a single array.
    preds = np.concatenate(preds)
    preds_list.append(preds)


In [None]:
# Average the per-fold prediction arrays element-wise (folds lie along axis 0).
final_pred = np.asarray(preds_list).mean(axis=0)

In [None]:
# Display the test DataFrame again for reference.
df

In [None]:
# Load the sample submission file that is used as the template for the output.
submission = pd.read_csv(f"{config.root}/data/input/sample_submission.csv")

In [None]:
# Predicted label = index of the larger averaged score in each row.
submission['isFake'] = final_pred.argmax(axis=1)

In [None]:
# Write the final submission file without the index column.
submission.to_csv(path_or_buf=f'{output_dir}/submission.csv', index=False)

In [None]:
# Show how many rows were assigned to each predicted label.
submission['isFake'].value_counts()

In [None]:
# Also save the raw (fold-averaged) scores alongside the predicted label.
submission_c = submission.assign(
    negative_pred=final_pred[:, 0],
    positive_pred=final_pred[:, 1],
)
submission_c.to_csv(path_or_buf=f'{output_dir}/raw_submission.csv', index=False)