In [1]:
!pip install -q wandb transformers fugashi unidic_lite pytorch-lightning==1.4.0 python-box 

[K     |████████████████████████████████| 1.7 MB 15.9 MB/s 
[K     |████████████████████████████████| 3.4 MB 59.1 MB/s 
[K     |████████████████████████████████| 490 kB 53.5 MB/s 
[K     |████████████████████████████████| 47.4 MB 1.9 MB/s 
[K     |████████████████████████████████| 913 kB 46.9 MB/s 
[K     |████████████████████████████████| 596 kB 48.1 MB/s 
[K     |████████████████████████████████| 132 kB 20.4 MB/s 
[K     |████████████████████████████████| 829 kB 15.7 MB/s 
[K     |████████████████████████████████| 332 kB 48.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 54.4 MB/s 
[K     |████████████████████████████████| 97 kB 3.7 MB/s 
[K     |████████████████████████████████| 140 kB 49.7 MB/s 
[K     |████████████████████████████████| 180 kB 15.6 MB/s 
[K     |████████████████████████████████| 63 kB 1.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 42.3 MB/s 
[K     |████████████████████████████████| 895 kB 11.9 MB/s 
[K     |███████████████████

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import gc
import warnings
from pprint import pprint
from glob import glob
from tqdm import tqdm
import random
import math

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from box import Box
from sklearn.model_selection import StratifiedKFold, train_test_split
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning import callbacks
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping,LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import LightningDataModule, LightningModule

import transformers
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup

import wandb

warnings.filterwarnings("ignore")

In [4]:
#gpuの確認
!nvidia-smi -L

#もしもA100を引いた場合は以下を実行
#参考：https://github.com/googlecolab/colabtools/issues/2452
!pip install https://storage.googleapis.com/jax-releases/cuda111/jaxlib-0.1.71+cuda111-cp37-none-manylinux2010_x86_64.whl

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-aea07922-b94a-c010-8956-589ca2632096)
Collecting jaxlib==0.1.71+cuda111
  Downloading https://storage.googleapis.com/jax-releases/cuda111/jaxlib-0.1.71+cuda111-cp37-none-manylinux2010_x86_64.whl (197.3 MB)
[K     |████████████████████████████████| 197.3 MB 70 kB/s 


In [5]:
#wandbにログイン
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: [32m[41mERROR[0m No API key specified.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(2021)

In [7]:
#filenameとfilepathを取得
from requests import get
import subprocess
from pathlib import Path

filename = get('http://172.28.0.2:9000/api/sessions').json()[0]['name'].split('.')[0]
filepath=subprocess.run(f"<フォルダ名> -name *{filename}*", capture_output=True,shell=True, text=True)
params_dir=Path(str(Path(filepath.stdout.strip()).parent)+'/params')

# Config

In [8]:
config = {'seed': 2021,
          'epoch': 10,
          'trainer': {
              'gpus': 1,
            #   'progress_bar_refresh_rate': 1,
              'fast_dev_run': False,
              'num_sanity_val_steps': 0,
              'resume_from_checkpoint': None,
              'deterministic':True,
              'val_check_interval': 0.25
          },
          'train_loader':{
              'batch_size': 8,
              'shuffle': True,
              'num_workers': 4,
              'pin_memory': False,
              'drop_last': True,
          },
          'val_loader': {
              'batch_size': 32,
              'shuffle': False,
              'num_workers': 4,
              'pin_memory': False,
              'drop_last': False
         },
          'model':{
              'model_path': 'cl-tohoku/bert-base-japanese-v2',
              'weight_decay': 1e-2,
              'learning_rate': 1e-5,
              'warmup_ratio': 0.1,
              'gradient_accumulation_steps': 8
          },
          'logger':{
              'project':'いいね予測君',
              'group':f'{filename}'              
          },
          'params_dir':f'{params_dir}'
}

config = Box(config)

# Prepare Data

In [9]:
#data
df=pd.read_csv('./preprocess_tweet_data.csv')

In [10]:
#アンダーサンプリング
SAMPLE_NUM=50000

nolike_index=list(df[df.like_count == 0].sample(n=SAMPLE_NUM//2, random_state=2021).index)
like_index=list(df[df.like_count != 0].sample(n=SAMPLE_NUM-len(nolike_index), random_state=2021).index)

df=df.loc[nolike_index + like_index].reset_index()

In [None]:
class TweetLikePredDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.data = df
        self.tokenizer = tokenizer

        self.encode_text=tokenizer(
            text=self.data.text.tolist(),
            return_attention_mask=True,
            truncation=True,
            max_length=192,
            padding='max_length'
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        items={
            'input_ids' : torch.tensor(self.encode_text['input_ids'][idx]),
            'attention_mask' : torch.tensor(self.encode_text['attention_mask'][idx]),
            'tweet_hour' : torch.tensor(self.data['tweet_hour'][idx], dtype=torch.float32),
            'followers_count' : torch.tensor(self.data['followers_count'][idx], dtype=torch.float32),
            'like_count' : torch.tensor(self.data['like_count'][idx], dtype=torch.float32)
        }
    
        return items


# Model

In [None]:
class TweetLikePredModel(pl.LightningModule):
    def __init__(
        self,
        tokenizer,
        cfg,
        t_dataloader,
        v_dataloader
    ):
        #superで親クラスのメソッドを使用。
        super().__init__()
        #gradient_accumulateのため、マニュアル
        self.automatic_optimization = False

        #config
        self.weight_decay=cfg.model.weight_decay
        self.learning_rate=cfg.model.learning_rate
        self.epoch=cfg.epoch
        self.warmup_ratio=cfg.model.warmup_ratio
        self.gradient_accumulation_steps=cfg.model.gradient_accumulation_steps

        #tokenizer
        self.tokenizer=tokenizer

        #model
        self.model_config=AutoConfig.from_pretrained(cfg.model.model_path)
        self.model_config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.1
            }
        )
        self.model=AutoModel.from_pretrained(cfg.model.model_path,config=self.model_config)

        
        self.regressor = nn.Sequential(
            nn.Linear(self.model_config.hidden_size+2, 128),
            nn.LeakyReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

        #dataloader
        self._train_dataloader=t_dataloader
        self._valid_dataloader=v_dataloader

        #save_hyperparameter
        self.save_hyperparameters(cfg)

    #AdamWとlinearスケジューラを基本使用
    def configure_optimizers(self):
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            ]
        optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=self.learning_rate,
            )
        
        num_training_steps=math.ceil(len(self._train_dataloader)/self.gradient_accumulation_steps)*self.epoch
        num_warmup_steps=num_training_steps*self.warmup_ratio

        scheduler=get_linear_schedule_with_warmup(
            optimizer,
            num_training_steps=num_training_steps,
            num_warmup_steps=num_warmup_steps,
        )

        return {'optimizer':optimizer,'lr_scheduler':scheduler}

    #推論の時も使う処理を記載
    def forward(self, x):

        input_ids=x['input_ids']
        attention_mask=x['attention_mask']

        #その他特徴量
        tweet_hour=x['tweet_hour'].reshape(-1,1)
        followers_count=x['followers_count'].reshape(-1,1)
        
        out=self.model(input_ids,attention_mask)
        #pooler output = CLSトークンのemb層を抽出
        out = out[1]
        out =  torch.cat([out, tweet_hour, followers_count], dim=1)
        #batch*1で予測
        qa_logits=self.regressor(out)

        return qa_logits

    #gradient_accumulateを加味しているため、マニュアルbackward
    def training_step(self,batch, batch_idx):
    
        opt = self.optimizers()
        sch = self.lr_schedulers()

        logits = self.forward(batch)
        labels = batch['like_count']
        loss = nn.MSELoss()(logits, labels)

        self.log("train_step_loss", loss, prog_bar=True)

        #if average 
        loss = loss / self.gradient_accumulation_steps

        #backward
        self.manual_backward(loss)

         # accumulate gradients of `n` batches
        if (batch_idx + 1) % self.gradient_accumulation_steps == 0:
            opt.step()
            sch.step()
            opt.zero_grad()
        
        return {'logits': logits, 'labels': labels}

    def validation_step(self, batch, batch_idx):
        
        logits = self.forward(batch)
        labels = batch['like_count']

        loss = nn.MSELoss()(logits, labels)

        self.log("val_step_loss", loss, prog_bar=True)

        return {'logits': logits, 'labels': labels}

    #epoch終わりのloss計算
    def training_epoch_end(self, training_step_outputs):
        self._share_epoch_end(training_step_outputs,'train')

    def validation_epoch_end(self,val_step_outputs):
        self._share_epoch_end(val_step_outputs,'val')

    def _share_epoch_end(self, outputs, mode):
        all_logits = []
        all_labels = []
        for out in outputs:
            logits, labels = out['logits'], out['labels']
            all_logits.append(logits)
            all_labels.append(labels)
        all_logits = torch.cat(all_logits)
        all_labels = torch.cat(all_labels)
        loss = nn.MSELoss()(all_logits, all_labels)

        self.log(f'{mode}_epoch_loss', loss, prog_bar=True)

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._valid_dataloader

    

# Train

In [None]:
#Bertになってる。
tokenizer = AutoTokenizer.from_pretrained(config.model.model_path)

In [None]:
skf = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=2021
)

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df["like_count"])):

    train_df = df.loc[train_idx].reset_index(drop=True)
    val_df = df.loc[val_idx].reset_index(drop=True)

    train_dataset=TweetLikePredDataset(train_df, tokenizer)
    valid_dataset=TweetLikePredDataset(val_df, tokenizer)

    train_dataloader=DataLoader(train_dataset, **config.train_loader)
    val_dataloader=DataLoader(train_dataset, **config.val_loader)

    model=TweetLikePredModel(
        tokenizer,
        config,
        train_dataloader,
        val_dataloader
    )

    checkpoint_callback = ModelCheckpoint(monitor='val_epoch_loss',
                                save_top_k=1,
                                save_weights_only=True,
                                dirpath=config.params_dir,
                                filename=f'{fold+1}fold_best_loss',
                                verbose=False,
                                mode='min',
                                save_last=False)
    
    #LRmonitor
    learning_rate_monitor=LearningRateMonitor(logging_interval='step')

    #early stopping
    early_stopping = EarlyStopping(monitor='val_epoch_loss',mode='min',patience=6)
    
    #kfoldはgroup内で一元管理。
    wandb_logger = WandbLogger(project=config.logger.project,
                            name=f'{fold+1}fold',
                            group=config.logger.group
                            )

    trainer = pl.Trainer(logger=wandb_logger,
                        max_epochs=config.epoch,
                        checkpoint_callback=True, 
                        callbacks=[checkpoint_callback,learning_rate_monitor,early_stopping],
                        **config.trainer
                        )
    
    print('training start')
    trainer.fit(model)
    #trainer.validate(model)

    #これを実行しないと、新しいものが立ち上がらない
    wandb.finish()

    #RAM確保
    del model
    gc.collect()
    torch.cuda.empty_cache()
    
    #モデルは一つだけを保存
    break

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


training start



  | Name      | Type       | Params
-----------------------------------------
0 | model     | BertModel  | 111 M 
1 | regressor | Sequential | 98.8 K
-----------------------------------------
111 M     Trainable params
0         Non-trainable params
111 M     Total params
445.224   Total estimated model params size (MB)


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
lr-AdamW/pg1,▁▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
lr-AdamW/pg2,▁▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_epoch_loss,▁▂▁▂▁▃▁▁█▂
train_step_loss,▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_epoch_loss,█▅▄▃▂▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_step_loss,█▅▄▃▂▁▁▂▁▂▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,9.0
lr-AdamW/pg1,0.0
lr-AdamW/pg2,0.0
train_epoch_loss,51.04524
train_step_loss,51.04523
trainer/global_step,49999.0
val_epoch_loss,30076.05078
val_step_loss,30074.69141
