In [1]:
from typing import Iterable, List, Optional

import torch
import torch.nn as nn

from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import pytorch_lightning as pl 
from pytorch_lightning import seed_everything, Trainer
from sklearn.model_selection import StratifiedKFold, train_test_split
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pandas as pd
from torch.utils.data import Dataset,DataLoader



In [2]:
class cfg:
    """Static configuration shared by every cell of the notebook."""

    # --- data locations (all derived from the dataset root) ---
    DIR_NAME = '/Data/POI_classification/'
    TRAIN_CSV = DIR_NAME + 'train.csv'
    TEST_CSV = DIR_NAME + 'test.csv'
    TRAIN_IMAGE_DIR = DIR_NAME + 'image/train/'
    TEST_IMAGE_DIR = DIR_NAME + 'image/test/'
    SAMPLE_SUBMISSION = DIR_NAME + 'sample_submission.csv'

    # --- model / tokenization ---
    PRETRAINED_PATH = 'bert-base-multilingual-cased'
    MAX_SEQ_LENGTH = 152
    MAX_TOKEN_LEN = 200
    NUM_CLASSES = 3
    HIDDEN_SIZE = 512
    DROPOUT = 0.2

    # --- reproducibility ---
    SEED = 42

    # --- training schedule ---
    # N_TRAINING_DATA is a placeholder; a later cell overwrites it with the
    # size of the training dataset.
    N_TRAINING_DATA = -1
    ACCUMULATE_GRAD_BATCHES = 1
    NUM_TRAIN_EPOCHS = 1
    N_WARMUP_STEP = 0

    # --- optimizer ---
    LEARNING_RATE = 5e-5
    WEIGHT_DECAY = 0.01
    ADAM_EPSILON = 1e-8

In [3]:
# Seed the run from the config so results are repeatable. The call is the
# cell's last expression, so the seed it returns is shown as the cell output.
seed_everything(cfg.SEED, workers=True)

Global seed set to 42


42

In [4]:
# Read the raw training table from the CSV path configured in `cfg`.
train_df = pd.read_csv(cfg.TRAIN_CSV)

In [5]:
# Keep only the three target categories, then draw a small fixed-size subset
# for a quick experiment. The sample is seeded from cfg.SEED (instead of a
# duplicated literal) so it stays in sync with the other seeded steps.
train_df = (
    train_df[train_df.cat3.isin(['한식', '야영장,오토캠핑장', '바/까페'])]
    .sample(30, random_state=cfg.SEED)
)

In [6]:
# Category name -> integer class id; the position in the list is the id.
mapping = {
    category: class_id
    for class_id, category in enumerate(['바/까페', '한식', '야영장,오토캠핑장'])
}

In [7]:
# Encode the category names as integer class ids. `assign` builds a new frame
# instead of mutating the existing one, so re-running the cell is harmless.
train_df = train_df.assign(cat3_encode=train_df['cat3'].map(mapping))
# A category missing from `mapping` would silently become a NaN label and only
# fail much later inside the loss; surface the problem here instead.
assert train_df['cat3_encode'].notna().all(), "found cat3 values missing from `mapping`"

In [8]:
# Load the tokenizer for the pretrained checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(cfg.PRETRAINED_PATH)

In [9]:
# Hold out part of the sampled rows for validation (80% stays in `train`);
# the shuffle is seeded from the config.
train, valid = train_test_split(
    train_df,
    random_state=cfg.SEED,
    train_size=0.8,
)

In [10]:
class OverviewData(Dataset):
    """Tokenized ``overview`` texts paired with their ``cat3_encode`` labels.

    All texts are tokenized once, up front, and padded/truncated to a fixed
    length, so ``__getitem__`` is a plain index into pre-built tensors.

    Args:
        data: Table with an ``overview`` text column and a ``cat3_encode``
            label column (accessed as attributes, e.g. a pandas DataFrame).
        tokenizer: Callable tokenizer instance, e.g. the object returned by
            ``AutoTokenizer.from_pretrained``. (``AutoTokenizer`` itself is a
            factory, not the instance type, so it is not used as annotation.)
        max_length: Fixed token length every example is padded/truncated to.
            Defaults to ``cfg.MAX_TOKEN_LEN`` when omitted.
    """

    def __init__(self, data, tokenizer, max_length: Optional[int] = None):
        self.examples = None
        self.data = data
        self.tokenizer = tokenizer
        # Resolve the default lazily so the config value at construction time wins.
        self.max_length = cfg.MAX_TOKEN_LEN if max_length is None else max_length
        self.processing()

    def processing(self):
        """Tokenize every overview and cache encodings, labels and raw texts."""
        # Each overview is wrapped in a one-element list because the tokenizer
        # is called with is_split_into_words=True (one "word" per example).
        texts = [[text] for text in self.data.overview.values]
        self.examples = self.tokenizer(texts,
                                       truncation=True,
                                       padding='max_length',
                                       max_length=self.max_length,
                                       is_split_into_words=True,
                                       return_tensors='pt')
        self.examples['labels'] = self.data.cat3_encode.values
        self.examples['texts'] = self.data.overview.values

    def __len__(self):
        """Number of rows in the underlying table."""
        return self.data.shape[0]

    def __getitem__(self, index):
        """Return the model inputs and label for ``index`` (int or slice)."""
        return {
            'input_ids': self.examples['input_ids'][index],
            'attention_mask': self.examples['attention_mask'][index],
            'label': self.examples['labels'][index],
        }

In [11]:
# Tokenize each split once, up front, with the shared tokenizer.
train_dataset = OverviewData(data=train, tokenizer=tokenizer)
valid_dataset = OverviewData(data=valid, tokenizer=tokenizer)

In [12]:
# Mini-batches of 8 for both splits; only the training loader shuffles.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=1,
)
val_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=1,
)

In [13]:
# Replace the placeholder on the shared config with the number of training
# examples; CLSModel.configure_optimizers reads this value.
cfg.N_TRAINING_DATA = len(train_dataset)

In [14]:
class CLSModel(pl.LightningModule):
    """Sequence classifier: a pretrained transformer encoder plus a linear head.

    The encoder is loaded from ``cfg.PRETRAINED_PATH`` and its pooled output is
    projected to ``cfg.NUM_CLASSES`` logits. The loss is logged on every
    training/validation step and a weighted F1 score at the end of every epoch
    (under the keys ``'train f1-score'`` and ``'valid f1-score'``).
    """

    def __init__(self, cfg):
        """Build the encoder and the classification head.

        Args:
            cfg: Configuration object providing ``PRETRAINED_PATH``,
                ``NUM_CLASSES`` and the optimizer/schedule settings read in
                ``configure_optimizers``.
        """
        super().__init__()

        self.cfg = cfg
        self.num_classes = cfg.NUM_CLASSES
        self.model = AutoModel.from_pretrained(cfg.PRETRAINED_PATH)
        self.hidden_size = self.model.config.hidden_size
        self.dropout_prob = self.model.config.hidden_dropout_prob
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, cfg.NUM_CLASSES))
        print('--Init--')

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the head.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            attention_mask: Attention-mask tensor passed to the encoder.
            labels: Optional tensor of class ids; when given, the loss is computed.

        Returns:
            ``(loss, logits)``. ``loss`` is the cross-entropy loss when
            ``labels`` is provided and the integer ``0`` otherwise.
        """
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(output.pooler_output)

        loss = 0
        # `if labels:` would raise for any multi-element tensor (ambiguous
        # truth value), so test for presence explicitly.
        if labels is not None:
            loss = self.calc_loss(logits, labels)
        return loss, logits

    def calc_loss(self, logits, labels):
        """Cross-entropy between ``logits`` and integer class ``labels``."""
        loss_fct = nn.CrossEntropyLoss()
        return loss_fct(logits, labels)

    def _shared_step(self, batch):
        """Forward one batch; return ``(loss, predicted class ids, labels)``."""
        labels = batch["label"]
        loss, logits = self(batch["input_ids"], batch["attention_mask"], labels)
        return loss, logits.argmax(1), labels

    def training_step(self, train_batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss, predictions, labels = self._shared_step(train_batch)
        self.log('train_loss', loss)

        return {"loss": loss, "predictions": predictions, "labels": labels}

    def validation_step(self, val_batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss, predictions, labels = self._shared_step(val_batch)
        self.log('val_loss', loss)

        return {"loss": loss, "predictions": predictions, "labels": labels}

    @staticmethod
    def _epoch_f1(outputs):
        """Weighted F1 over all step outputs collected during one epoch."""
        labels = torch.cat([out['labels'].detach().cpu() for out in outputs])
        predictions = torch.cat([out['predictions'].detach().cpu() for out in outputs])
        return float(f1_score(labels.numpy(), predictions.numpy(), average='weighted'))

    def training_epoch_end(self, outputs):
        """Log the weighted F1 of the epoch's training predictions.

        Returns nothing: Lightning expects ``None`` from this hook.
        """
        self.log('train f1-score', self._epoch_f1(outputs))

    def validation_epoch_end(self, outputs):
        """Log the weighted F1 of the epoch's validation predictions."""
        self.log('valid f1-score', self._epoch_f1(outputs))

    def _num_training_steps(self):
        """Number of optimizer steps the linear schedule should span.

        Prefers the trainer's own estimate (which accounts for batch size,
        gradient accumulation and epochs). Falls back to the config-based
        figure when no trainer is attached or the estimate is unbounded; that
        fallback counts examples rather than batches, so it is an upper bound.
        """
        try:
            steps = self.trainer.estimated_stepping_batches
        except (AttributeError, RuntimeError):
            steps = None
        if steps is None or steps == float('inf'):
            steps = (self.cfg.N_TRAINING_DATA
                     // self.cfg.ACCUMULATE_GRAD_BATCHES
                     * self.cfg.NUM_TRAIN_EPOCHS)
        return int(steps)

    def configure_optimizers(self):
        """AdamW with selective weight decay and a linear warmup/decay schedule.

        Returns:
            ``([optimizer], [scheduler_config])`` where the scheduler is
            stepped after every optimizer step.
        """
        total_step = self._num_training_steps()

        # Biases and LayerNorm weights are conventionally excluded from decay.
        no_decay = ("bias", "LayerNorm.weight")
        decay_params, no_decay_params = [], []
        # Iterate over the whole module (encoder AND classifier head); using
        # only self.model here would leave the head out of the optimizer.
        for name, param in self.named_parameters():
            if any(marker in name for marker in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [
            {"params": decay_params, "weight_decay": self.cfg.WEIGHT_DECAY},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
        # torch's AdamW replaces the deprecated transformers.AdamW.
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                      lr=self.cfg.LEARNING_RATE,
                                      eps=self.cfg.ADAM_EPSILON)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.cfg.N_WARMUP_STEP,
                                                    num_training_steps=total_step)

        # The schedule is defined in optimizer steps, so step it per step
        # rather than with Lightning's default per-epoch interval.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [15]:
# Sanity check: push the whole (small) training set through the bare encoder.
# Note that `model` is rebound to the Lightning module in a later cell.
model = AutoModel.from_pretrained(cfg.PRETRAINED_PATH)
_sanity_batch = train_dataset[:]  # slice once instead of re-indexing per argument
with torch.no_grad():  # inference only, so skip building the autograd graph
    output = model(_sanity_batch['input_ids'], _sanity_batch['attention_mask'])

In [16]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
# TensorBoard event files are written under the local `logs/` directory.
tb_logger = TensorBoardLogger(save_dir='logs/')

In [17]:
# Keep only the single best checkpoint, ranked by the highest value logged
# under 'valid f1-score'; weights only, one file named after the epoch.
checkpoint_callback = ModelCheckpoint(
    monitor='valid f1-score',
    mode='max',
    save_top_k=1,
    save_weights_only=True,
    dirpath='models',
    filename='{epoch}',
)

In [18]:
# Instantiate the Lightning module from the shared config. This rebinds
# `model`, which an earlier cell used for the bare encoder.
model = CLSModel(cfg)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


--Init--


In [None]:
# CPU run with TensorBoard logging and best-F1 checkpointing.
trainer = Trainer(
    logger=tb_logger,
    accelerator='cpu',
    callbacks=[checkpoint_callback],
    gradient_clip_val=1,
    accumulate_grad_batches=cfg.ACCUMULATE_GRAD_BATCHES,
    max_epochs=cfg.NUM_TRAIN_EPOCHS,
    # 16-bit AMP is not available on CPU; Lightning was silently substituting
    # bfloat16 (see the run log), so request it explicitly.
    precision='bf16',
    log_every_n_steps=1,
    num_sanity_val_steps=0,
)

# A single validation loader does not need to be wrapped in a list.
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

  rank_zero_warn(
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: logs/lightning_logs

  | Name       | Type       | Params
------------------------------------------
0 | model      | BertModel  | 177 M 
1 | classifier | Sequential | 2.3 K 
------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.423   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]