In [6]:
# Name of the architecture being evaluated; only used to build the output folder.
model = "cnn"

# Folder that receives one prediction CSV per scenario (note the trailing slash).
out_path = f"pred/{model}/"

# Scenario id -> checkpoint to restore ("model") and CSV to run it on ("dataset").
data = {
    1: dict(
        model="../tensorboard_checkpoint/cnn/2023-04-25_09-07-51_scenario_1/checkpoints/val_epoch=5-validation_loss=0.2260.ckpt",
        dataset="original/1.csv",
    ),
    3: dict(
        model="../tensorboard_checkpoint/cnn/2023-04-25_09-07-51_scenario_3/checkpoints/val_epoch=2-validation_loss=0.2587.ckpt",
        dataset="original/3.csv",
    ),
    5: dict(
        model="../tensorboard_checkpoint/cnn/2023-04-25_09-07-51_scenario_5/checkpoints/val_epoch=2-validation_loss=0.2576.ckpt",
        dataset="original/5.csv",
    ),
}

In [7]:
# Read each scenario's CSV and keep the resulting DataFrame next to its
# configuration entry under the "df" key.
import pandas as pd

for scenario in data.values():
    scenario['df'] = pd.read_csv(scenario['dataset'])

In [8]:
# Hyperparameters.
# seq_length / out_feature / conv_* are forwarded to the model constructor by
# create_model(); learning_rate is read by CNNClassifier.configure_optimizers().
# batch_size is not referenced by the code visible in this notebook.
hyper_params = dict(
    seq_length=256,
    out_feature=2,
    learning_rate=1e-3,
    batch_size=16,
    conv_num_filters=100,
    conv_kernels=(1, 2, 3),
)

# https://ieeexplore.ieee.org/document/8577620
# https://ieeexplore.ieee.org/abstract/document/8589431
# https://ieeexplore.ieee.org/document/8691381
# https://aclanthology.org/D14-1181.pdf

import math
import torch
import torch.nn as nn
import torchnlp.nn as nnlp
import lightning.pytorch as pl
import torch.nn.functional as F

from torch.optim.lr_scheduler import ExponentialLR
from torchnlp.word_to_vector import FastText
from torchmetrics.classification import F1Score, Accuracy, Recall, Precision

class CNNClassifier(pl.LightningModule):
    """Sentence classifier: FastText word vectors -> CNN encoder -> linear head.

    Raw strings are split on single spaces, embedded token by token with
    pretrained FastText vectors, encoded by ``torchnlp.nn.CNNEncoder`` and
    projected to ``out_feature`` logits by a dropout + linear head.

    Args:
        seq_length: Maximum number of tokens kept per sentence. Longer
            sentences are truncated; shorter ones are filled with ``"<PAD>"``
            tokens when ``fast_text_pad_sequence`` is true.
        out_feature: Number of logits produced by the head; also used as
            ``num_classes`` for every metric.
        fast_text_lang: Language code handed to ``FastText``.
        fast_text_pad_sequence: Whether short sentences are padded up to
            ``seq_length``.
        conv_num_filters: Second argument of ``CNNEncoder`` (presumably the
            number of filters per kernel size -- confirm against torchnlp).
        conv_kernels: Third argument of ``CNNEncoder`` (presumably the kernel
            widths -- confirm against torchnlp).
        feed_forward_dropout: Dropout probability applied before the linear
            layer of the head.
    """

    # Metric names logged per stage, in logging order. Each name ``m`` maps to
    # the scorer attribute ``self.<m>_scorer`` and the log key ``<stage>_<m>``.
    _TRAINING_METRICS = ("f1", "accuracy")
    _EVALUATION_METRICS = ("f1", "accuracy", "precision", "recall")

    def __init__(
        self,
        seq_length: "int",
        out_feature: "int",
        fast_text_lang: 'str' = 'id',
        fast_text_pad_sequence: 'bool' = True,
        conv_num_filters: int = 100,
        conv_kernels: "tuple[int]" = (3,4,5),
        feed_forward_dropout: 'float' = 0.5,
    ) -> None:
        super().__init__()

        # Config
        self.seq_length = seq_length
        self.fast_text_pad_sequence = fast_text_pad_sequence

        # Layer
        self.fasttext = FastText(fast_text_lang)

        # 300 is the encoder's input size; it has to match the width of the
        # FastText vectors (presumably 300-dimensional -- confirm).
        self.conv_layer = nnlp.CNNEncoder(300, conv_num_filters, conv_kernels)

        # The head expects len(conv_kernels) * conv_num_filters input features.
        self.classification_head = nn.Sequential(
            nn.Dropout(feed_forward_dropout),
            nn.Linear(len(conv_kernels) * conv_num_filters, out_feature),
        )

        # Scorer (the same instances are used by the train/validation/test steps)
        self.f1_scorer = F1Score(task="multiclass", num_classes=out_feature)
        self.accuracy_scorer = Accuracy(task="multiclass", num_classes=out_feature)
        self.precision_scorer = Precision(task="multiclass", num_classes=out_feature)
        self.recall_scorer = Recall(task="multiclass", num_classes=out_feature)

    def _forward_fasttext(self, x: "list[str]"):
        """Embed a batch of sentences with FastText.

        Each sentence is split on single spaces, truncated to ``seq_length``
        tokens and, when padding is enabled, filled up with ``"<PAD>"`` tokens.

        Args:
            x: Sentences to embed.

        Returns:
            The per-sentence embeddings concatenated along a new leading batch
            dimension, on ``self.device``. An empty batch yields an empty
            1-D tensor. Without padding, sentences of different token counts
            cannot be concatenated and ``torch.cat`` will raise.
        """
        sentence_embeddings = []
        for sentence in x:
            tokens = sentence.split(" ")[: self.seq_length]
            if self.fast_text_pad_sequence:
                # A non-positive repeat count yields an empty list, so full
                # sentences are left untouched.
                tokens.extend(["<PAD>"] * (self.seq_length - len(tokens)))
            sentence_embeddings.append(
                self.fasttext[tokens].unsqueeze(0).to(self.device)
            )

        if not sentence_embeddings:
            return torch.tensor([]).to(self.device)
        # Concatenate once instead of growing the batch tensor inside the loop,
        # which re-copied every previously embedded sentence on each iteration.
        return torch.cat(sentence_embeddings)

    def forward(self, x: "list[str]"):
        """Return the classification logits for a batch of raw sentences."""
        x = self._forward_fasttext(x)
        x = self.conv_layer(x)
        x = self.classification_head(x)

        return x

    def _run_step(self, batch, stage, metric_names, loss_on_prog_bar=True):
        """Shared body of the train/validation/test steps.

        Runs the forward pass, computes the cross-entropy loss and logs the
        loss as ``<stage>_loss`` plus every metric in ``metric_names`` as
        ``<stage>_<metric>``.

        Args:
            batch: ``(x, y)`` pair; the sentences are read from ``x[0]``
                (presumably the dataloader wraps the text list in an outer
                sequence -- verify against the data module).
            stage: Prefix of the log keys.
            metric_names: Names resolved to ``self.<name>_scorer``.
            loss_on_prog_bar: Whether the loss is shown on the progress bar.

        Returns:
            The cross-entropy loss tensor.
        """
        x, y = batch
        pred = self.forward(x[0])
        loss = F.cross_entropy(pred, y)
        self.log(f"{stage}_loss", loss, prog_bar=loss_on_prog_bar)
        for metric_name in metric_names:
            scorer = getattr(self, f"{metric_name}_scorer")
            self.log(f"{stage}_{metric_name}", scorer(pred, y), prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Run one training batch; logs loss, F1 and accuracy, returns the loss."""
        return self._run_step(batch, "training", self._TRAINING_METRICS)

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; logs loss, F1, accuracy, precision, recall."""
        self._run_step(batch, "validation", self._EVALUATION_METRICS)

    def test_step(self, batch, batch_idx):
        """Run one test batch; logs loss, F1, accuracy, precision, recall."""
        # The test loss is logged but, unlike the metrics, not shown on the
        # progress bar.
        self._run_step(
            batch, "test", self._EVALUATION_METRICS, loss_on_prog_bar=False
        )

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters.

        The learning rate is read from the module-level ``hyper_params`` dict,
        so that dict must be defined before training starts.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=hyper_params['learning_rate'])
        return optimizer

# Class instantiated by create_model(); rebind it to evaluate another architecture.
model_module = CNNClassifier


def create_model():
    """Instantiate ``model_module`` with the settings stored in ``hyper_params``.

    ``seq_length`` and ``out_feature`` are passed positionally, the two
    convolution settings as keyword arguments.
    """
    positional_args = (hyper_params['seq_length'], hyper_params['out_feature'])
    conv_kwargs = {
        key: hyper_params[key] for key in ('conv_num_filters', 'conv_kernels')
    }
    return model_module(*positional_args, **conv_kwargs)


In [9]:
import os
from tqdm import tqdm

# Create the output folder once, including missing parents such as "pred/".
os.makedirs(out_path, exist_ok=True)

for scenario_id, scenario in data.items():
    print(f"Processing scenario {scenario_id}")

    # NOTE: torch.load unpickles the checkpoint file; only load trusted files.
    state_dict = torch.load(scenario['model'], map_location="cpu")['state_dict']
    # Named `classifier` so the `model` config string is not overwritten.
    classifier = create_model()
    classifier.load_state_dict(state_dict)
    classifier.eval()

    text_data = scenario['df']['tweet'].to_list()
    predictions = []

    with torch.no_grad():
        for text in tqdm(text_data):
            # One sentence per forward pass; the f-string turns non-string
            # cells into their string form before tokenization.
            logits = classifier([f"{text}"])
            predictions.append(logits)

    if predictions:
        all_logits = torch.cat(predictions, 0)
        # Explicit class dimension: avoids the implicit-dim deprecation warning
        # and is the dimension the implicit rule selects for 2-D input.
        softmax_tensor = F.softmax(all_logits, dim=1)
        argmax_tensor = torch.argmax(softmax_tensor, 1)
        np_pred = argmax_tensor.cpu().numpy()
    else:
        # Empty dataset: torch.cat([]) would raise, so emit no predictions.
        np_pred = torch.empty(0, dtype=torch.long).numpy()

    df_pred = scenario['df'].copy()
    df_pred['prediction'] = np_pred
    # os.path.join copes with the trailing slash of out_path (no "//").
    df_pred.to_csv(os.path.join(out_path, f"{scenario_id}.csv"), index=False)

Processing scenario 1


100%|██████████| 381/381 [00:01<00:00, 285.18it/s]
  softmax_tensor = F.softmax(predictions)


Processing scenario 3


100%|██████████| 381/381 [00:01<00:00, 281.60it/s]
  softmax_tensor = F.softmax(predictions)


Processing scenario 5


100%|██████████| 381/381 [00:01<00:00, 282.43it/s]
  softmax_tensor = F.softmax(predictions)
