In [None]:
# download files for sentiment classification
from requests import get

def download(url, filename, timeout=30):
    """Download ``url`` and save the response body to ``filename``.

    The request is completed and its status checked *before* the destination
    file is opened, so a failed download neither truncates an existing file
    nor stores an HTTP error page under the data file's name.

    Args:
        url: Address of the resource to fetch.
        filename: Local path the raw response bytes are written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = get(url, timeout=timeout)
    response.raise_for_status()
    with open(filename, "wb") as file:
        file.write(response.content)

download("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt")
download("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt")

# print first 5 lines of the file
# The corpus is UTF-8 encoded Korean text, so the encoding is passed explicitly
# instead of relying on the platform default.
with open("ratings_train.txt", "r", encoding="utf-8") as file:
    for i in range(5):
        # readline() keeps the trailing "\n"; end="" avoids a blank line after each row.
        print(file.readline(), end="")

# load the train / test rows (the vocabulary is built from train_data afterwards)
def load_ratings(path):
    """Read a tab-separated ratings file into a list of column lists.

    The first line (the ``id / document / label`` header) is dropped and empty
    lines are skipped. Every remaining line is split on tabs.

    Args:
        path: Path of the UTF-8 encoded ratings file.

    Returns:
        A list with one ``list[str]`` of columns per non-empty data line.
    """
    with open(path, "r", encoding="utf-8") as file:
        lines = file.read().split("\n")[1:]  # [1:] drops the header row
    return [line.split("\t") for line in lines if len(line) > 0]


train_data = load_ratings("ratings_train.txt")
test_data = load_ratings("ratings_test.txt")

# Character-level vocabulary. Ids 0 and 1 are reserved for the padding and
# unknown tokens; every other character receives the next free id in order of
# first appearance in the training documents (column 1 of each row).
vocab = {"[PAD]": 0, "[UNK]": 1}
for data in train_data:
    line = data[1]
    for char in line:
        # Ids are handed out consecutively, so len(vocab) is always the next
        # unused id; setdefault leaves already-known characters untouched.
        vocab.setdefault(char, len(vocab))
vocab_idx = len(vocab)  # next id that would be assigned

id	document	label

9976970	아 더빙.. 진짜 짜증나네요 목소리	0

3819312	흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나	1

10265843	너무재밓었다그래서보는것을추천한다	0

9045019	교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정	0



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentClassifier(nn.Module):
    """Feed-forward sentiment classifier over fixed-length id sequences.

    Every input id is embedded, the embeddings of the whole sequence are
    concatenated into a single vector, and that vector is passed through a
    ReLU hidden layer followed by a linear layer that produces class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        max_len: Sequence length the flattening step is sized for.
        hidden_dim: Width of the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim=32, max_len=100, hidden_dim=100, num_classes=2):
        super().__init__()
        # Kept on the instance so forward() flattens with the same sizes the
        # layers were built with.
        self.embed_dim = embed_dim
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim * max_len, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return unnormalised class logits for a batch of id sequences.

        Args:
            x: Integer tensor of token ids; the embedded result is reshaped to
               rows of ``max_len * embed_dim`` values.

        Returns:
            Tensor with ``num_classes`` logits per row.
        """
        x = self.embedding(x)
        # Concatenate the per-position embeddings into one vector per example.
        x = x.view(-1, self.max_len * self.embed_dim)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [None]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.3.1-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.2.1-py3-none-any.whl (801 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.6/801.6 kB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning, lightning
Successfully installed lightning-2.2.1 lightning-utilities-0.10.1 pytorch-lightning-2.2.1 torchmetrics-1.3.1


## LightningModule 클래스

### 1) 모델의 기본적인 구조 정의
- 기존 모델 초기화하듯이 그대로 사용 가능 (실제로 아래에서 기존에 사용하던 모델을 그대로 파라미터로 받음)
- forward 정의 부분도 그대로 사용 가능
- 손실함수도 클래스 내부에 정의해서 사용하는 것이 구조화되어 좋다

### 2) 모델의 학습 루프

- 복잡하게 작성하던 내용을 추상화한 부분
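- 패턴이 있음 -> 3가지 종류의 메소드 (`*`는 `training` / `validation` / `test`)
- `*_step`: 스텝(배치)마다 호출
- `*_step_end`: 스텝 종료 시 호출 (Lightning 2.0부터 제거됨)
- `*_epoch_end`: 1 epoch 종료 시 호출 (Lightning 2.0부터는 `on_train_epoch_end` / `on_validation_epoch_end` / `on_test_epoch_end` 사용)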

In [None]:
import lightning as pl

class SentimentClassifierPL(pl.LightningModule):
    """Lightning wrapper that trains and evaluates a sentiment classifier.

    The wrapped module produces class logits, which are scored with
    cross-entropy. Validation and test share one code path: every step logs
    the batch loss and buffers ``(loss, logits, labels)``, and the matching
    epoch-end hook turns the buffer into an average loss and an accuracy.

    Args:
        sentiment_classifier: ``nn.Module`` mapping a batch of inputs to
            logits. It is deliberately not stored as a hyperparameter, so it
            must be passed again when calling ``load_from_checkpoint``.
    """

    def __init__(self, sentiment_classifier):
        super().__init__()
        self.model = sentiment_classifier
        self.loss = nn.CrossEntropyLoss()

        # Per-batch (loss, logits, labels) tuples gathered during an
        # evaluation epoch and consumed by the matching on_*_epoch_end hook.
        self.validation_step_outputs = []
        self.test_step_outputs = []
        # The wrapped module's weights are already part of the checkpoint;
        # ignoring it here avoids pickling the whole module a second time as
        # a hyperparameter (Lightning emits a warning otherwise).
        self.save_hyperparameters(ignore=["sentiment_classifier"])

    def forward(self, x):
        """Return the wrapped model's logits for ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss of one training batch."""
        inputs, labels = batch
        outputs = self.model(inputs)
        loss = self.loss(outputs, labels)
        self.log("train_loss", loss)
        return loss

    def _eval_step(self, batch, stage, store):
        """Score one evaluation batch, log ``{stage}_loss`` and buffer the result."""
        inputs, labels = batch
        outputs = self.model(inputs)
        loss = self.loss(outputs, labels)
        self.log(f"{stage}_loss", loss)
        store.append((loss, outputs, labels))
        return loss, outputs, labels

    def _eval_epoch_end(self, stage, store):
        """Log ``avg_{stage}_loss`` and ``{stage}_accuracy`` from ``store``, then empty it."""
        if not store:
            # Nothing was buffered (e.g. zero evaluation batches);
            # torch.stack / torch.cat would raise on an empty list.
            return
        # Plain mean over per-batch losses: every batch counts equally, even a
        # smaller final batch.
        avg_loss = torch.stack([item[0] for item in store]).mean()
        self.log(f"avg_{stage}_loss", avg_loss)

        all_outputs = torch.cat([item[1] for item in store])
        all_labels = torch.cat([item[2] for item in store])
        all_preds = all_outputs.argmax(dim=1)
        accuracy = (all_preds == all_labels).float().mean()
        self.log(f"{stage}_accuracy", accuracy)
        # Reset so the next epoch starts with an empty buffer.
        store.clear()

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` for the batch and return ``(loss, logits, labels)``."""
        return self._eval_step(batch, "val", self.validation_step_outputs)

    def on_validation_epoch_end(self):
        """Log ``avg_val_loss`` and ``val_accuracy`` over the buffered batches."""
        self._eval_epoch_end("val", self.validation_step_outputs)

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` for the batch and return ``(loss, logits, labels)``."""
        return self._eval_step(batch, "test", self.test_step_outputs)

    def on_test_epoch_end(self):
        """Log ``avg_test_loss`` and ``test_accuracy`` over the buffered batches."""
        self._eval_epoch_end("test", self.test_step_outputs)

    def configure_optimizers(self):
        """Optimise the wrapped model's parameters with Adam (lr=1e-3)."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        return optimizer

In [None]:
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    """Map-style dataset turning rating rows into fixed-length id tensors.

    Args:
        data: Sequence of rows where ``row[1]`` is the document text and
            ``row[2]`` is the label (anything ``int()`` accepts).
        vocab: Mapping from character to integer id.
        max_len: Length every encoded document is truncated or padded to.
        pad_id: Id appended to documents shorter than ``max_len``.
        unk_id: Id used for characters missing from ``vocab``.
    """

    def __init__(self, data, vocab, max_len=100, pad_id=0, unk_id=1):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len
        self.pad_id = pad_id
        self.unk_id = unk_id

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(ids, label)`` tensors for the row at ``index``.

        ``ids`` holds exactly ``max_len`` entries: the document's characters
        mapped through ``vocab`` (unknown characters become ``unk_id``),
        truncated or right-padded with ``pad_id``.
        """
        row = self.data[index]
        label = int(row[2])
        # Truncate before encoding so over-long documents are not fully mapped.
        ids = [self.vocab.get(char, self.unk_id) for char in row[1][: self.max_len]]
        # The multiplier is 0 for documents that already fill max_len.
        ids += [self.pad_id] * (self.max_len - len(ids))

        return torch.tensor(ids), torch.tensor(label)

from torch.utils.data import random_split

# Validation data is held out from the *training* rows. Validating on the test
# rows would let early stopping and checkpoint selection tune on the very data
# used for the final score, making that score optimistic.
VAL_FRACTION = 0.1
full_train_dataset = SentimentDataset(train_data, vocab)
val_size = int(len(full_train_dataset) * VAL_FRACTION)
train_dataset, val_dataset = random_split(
    full_train_dataset,
    [len(full_train_dataset) - val_size, val_size],
    generator=torch.Generator().manual_seed(42),  # fixed seed -> same split on every run
)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4)

# The test rows are used only for the final evaluation.
test_dataset = SentimentDataset(test_data, vocab)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)



In [None]:
# Size the embedding table to the vocabulary, then hand the plain PyTorch
# model to the Lightning wrapper that drives training and evaluation.
sentcls = SentimentClassifier(vocab_size=len(vocab))
PLSentimentClassifier = SentimentClassifierPL(sentiment_classifier=sentcls)

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'sentiment_classifier' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['sentiment_classifier'])`.


In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.2 MB[0m [31m7.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.2/2.2 MB[0m [31m38.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.41.0-py2.py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.8/258.8 kB[0m [31m32.9 MB/s[0

In [None]:
import wandb
from lightning.pytorch.loggers import WandbLogger

# Authenticate with Weights & Biases first, then create the logger that the
# trainer will send its metrics to.
wandb.login()
wandb_logger = WandbLogger(
    name="Lec01_sentiment_classification_w_pl",
    project="NLP",
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
# Both callbacks watch the "val_loss" metric logged during validation.
early_stopping = EarlyStopping(patience=3, monitor="val_loss")
checkpoint = ModelCheckpoint(
    dirpath="checkpoints",
    filename="sentiment-classifier-{epoch:02d}-{val_loss:.2f}",
    monitor="val_loss",
    verbose=True,
)

In [None]:
# accelerator="auto" lets Lightning pick the available hardware, so the
# notebook still runs on a machine without a GPU instead of raising at
# construction time.
trainer = pl.Trainer(max_epochs=3,
                     accelerator="auto",
                     callbacks=[early_stopping, checkpoint],
                     logger=wandb_logger
                     ) # see https://lightning.ai/docs/pytorch/stable/common/trainer.html#trainer-class-api

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train on train_loader; each validation pass over val_loader produces the
# "val_loss" metric that the early-stopping and checkpoint callbacks monitor.
trainer.fit(
    PLSentimentClassifier,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

[34m[1mwandb[0m: Currently logged in as: [33mjaewoo010207[0m ([33mjaewoogwak[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                | Params
----------------------------------------------
0 | model | SentimentClassifier | 416 K 
1 | loss  | CrossEntropyLoss    | 0     
----------------------------------------------
416 K     Trainable params
0         Non-trainable params
416 K     Total params
1.666     Total estimated model params size (MB)
INFO:lightning.pytorch.callbacks.model_summary:
  | Name  | Type                | Params
----------------------------------------------
0 | model | SentimentClassifier | 416 K 
1 | loss  | CrossEntropyLoss    | 0     
----------------------------------------------
416 K     Trainable params
0         Non-trainable params
416 K     Total params
1.666     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 0, global step 2344: 'val_loss' reached 0.49325 (best 0.49325), saving model to '/content/checkpoints/sentiment-classifier-epoch=00-val_loss=0.49.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 0, global step 2344: 'val_loss' reached 0.49325 (best 0.49325), saving model to '/content/checkpoints/sentiment-classifier-epoch=00-val_loss=0.49.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 1, global step 4688: 'val_loss' reached 0.47783 (best 0.47783), saving model to '/content/checkpoints/sentiment-classifier-epoch=01-val_loss=0.48.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 1, global step 4688: 'val_loss' reached 0.47783 (best 0.47783), saving model to '/content/checkpoints/sentiment-classifier-epoch=01-val_loss=0.48.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 2, global step 7032: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 2, global step 7032: 'val_loss' was not in top 1
INFO: `Trainer.fit` stopped: `max_epochs=3` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
# No model is passed, so the weights come from a checkpoint; ckpt_path="best"
# states explicitly that the best checkpoint of the preceding fit is evaluated
# instead of relying on Lightning's implicit fallback.
trainer.test(dataloaders=test_loader, ckpt_path="best")

INFO: Restoring states from the checkpoint path at /content/checkpoints/sentiment-classifier-epoch=01-val_loss=0.48.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Restoring states from the checkpoint path at /content/checkpoints/sentiment-classifier-epoch=01-val_loss=0.48.ckpt
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loaded model weights from the checkpoint at /content/checkpoints/sentiment-classifier-epoch=01-val_loss=0.48.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Loaded model weights from the checkpoint at /content/checkpoints/sentiment-classifier-epoch=01-val_loss=0.48.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.477825790643692,
  'avg_test_loss': 0.47783493995666504,
  'test_accuracy': 0.7739599943161011}]

In [None]:
# Ask the checkpoint callback which file it kept rather than hard-coding the
# name: the filename embeds the epoch and the rounded val_loss, both of which
# change from run to run.
best_model = SentimentClassifierPL.load_from_checkpoint(checkpoint.best_model_path,
                                                        sentiment_classifier=SentimentClassifier(len(vocab)))
trainer.test(best_model, test_loader)

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'sentiment_classifier' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['sentiment_classifier'])`.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.47782576084136963,
  'avg_test_loss': 0.47783493995666504,
  'test_accuracy': 0.7739599943161011}]