In [5]:
model = "fasttext"

out_path = f"pred/{model}/"

data = {
    1 : {
        "model": "../tensorboard_checkpoint/fasttext/2023-04-25_08-31-02_scenario_1/checkpoints/val_epoch=53-validation_loss=0.2726.ckpt",
        "dataset":"original/1.csv",
    },
    3 : {
        "model": "../tensorboard_checkpoint/fasttext/2023-04-25_08-31-02_scenario_3/checkpoints/val_epoch=50-validation_loss=0.2775.ckpt",
        "dataset":"original/3.csv",
    },
    5 : {
        "model": "../tensorboard_checkpoint/fasttext/2023-04-25_08-31-02_scenario_5/checkpoints/val_epoch=48-validation_loss=0.2798.ckpt",
        "dataset":"original/5.csv",
    },
}

In [6]:
# Load dataset to df
import pandas as pd
for i in data:
    data[i]['df'] = pd.read_csv(data[i]['dataset'])

In [7]:
# Prepare model

# Hyperparameters
hyper_params = {
    'seq_length': 256,
    'out_feature': 2,
    'learning_rate': 8e-5,
    'batch_size': 64
}

# https://arxiv.org/pdf/1607.01759.pdf
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchnlp.word_to_vector import FastText
import lightning.pytorch as pl
from torchmetrics.classification import F1Score, Accuracy, Recall, Precision


class FastTextClassifier(pl.LightningModule):
    def __init__(
        self,
        seq_length: "int" = 256,
        out_feature: "int" = 2,
        pad_sequence: "bool" = False,
    ):
        super().__init__()
        self.seq_length = seq_length
        self.pad_sequence = pad_sequence
        self.fasttext = FastText("id")
        self.feed_forward = nn.Sequential(
            nn.Linear(300, 150),
            nn.ReLU(),
            nn.Linear(150, 64),
            nn.ReLU(),
            nn.Linear(64, out_feature),
        )
        self.f1_scorer = F1Score(task="multiclass", num_classes=out_feature)
        self.accuracy_scorer = Accuracy(task="multiclass", num_classes=out_feature)
        self.precision_scorer = Precision(task="multiclass", num_classes=out_feature)
        self.recall_scorer = Recall(task="multiclass", num_classes=out_feature)

    def _forward_fasttext(self, x: "list[str]"):
        batch_text_embedding = torch.tensor([]).to(self.device)
        for sentence in x:
            sentence_seq = sentence.split(" ")
            if len(sentence_seq) > self.seq_length:
                sentence_seq = sentence_seq[: self.seq_length]
            if self.pad_sequence:
                while len(sentence_seq) < 256:
                    sentence_seq.append("<pad>")
            word_embedding = (
                self.fasttext[sentence_seq].mean(dim=0).unsqueeze(0).to(self.device)
            )
            batch_text_embedding = torch.cat((batch_text_embedding, word_embedding))
        return batch_text_embedding

    def forward(self, x: "list[str]") -> "torch.Tensor":
        # Prepare str
        logits: torch.Tensor = self._forward_fasttext(x)
        logits = self.feed_forward(logits)
        return logits

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        x, y = batch
        pred = self.forward(x[0])
        loss = F.cross_entropy(pred, y)
        self.log("training_loss", loss)
        f1_score = self.f1_scorer(pred, y)
        self.log("training_f1", f1_score, prog_bar=True)
        accuracy = self.accuracy_scorer(pred, y)
        self.log("training_accuracy", accuracy, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        # training_step defines the train loop.
        x, y = batch
        pred = self.forward(x[0])
        loss = F.cross_entropy(pred, y)
        self.log("validation_loss", loss, prog_bar=True)
        f1_score = self.f1_scorer(pred, y)
        self.log("validation_f1", f1_score, prog_bar=True)
        accuracy = self.accuracy_scorer(pred, y)
        self.log("validation_accuracy", accuracy, prog_bar=True)
        precision = self.precision_scorer(pred, y)
        self.log("validation_precision", precision, prog_bar=True)
        recall = self.recall_scorer(pred, y)
        self.log("validation_recall", recall, prog_bar=True)

    def test_step(self, batch, batch_idx):
        # training_step defines the train loop.
        x, y = batch
        pred = self.forward(x[0])
        loss = F.cross_entropy(pred, y)
        self.log("test_loss", loss)
        f1_score = self.f1_scorer(pred, y)
        self.log("test_f1", f1_score, prog_bar=True)
        accuracy = self.accuracy_scorer(pred, y)
        self.log("test_accuracy", accuracy, prog_bar=True)
        precision = self.precision_scorer(pred, y)
        self.log("test_precision", precision, prog_bar=True)
        recall = self.recall_scorer(pred, y)
        self.log("test_recall", recall, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=hyper_params['learning_rate'])
        return optimizer

model_module = FastTextClassifier
def create_model():
    return model_module(hyper_params['seq_length'], hyper_params['out_feature'])

In [8]:
import os
from tqdm import tqdm

for i in data:
    print(f"Processing scenario {i}")

    if not os.path.exists(out_path):
        os.mkdir(out_path)

    state_dict = torch.load(data[i]['model'], map_location="cpu")['state_dict']
    model = create_model()
    model.load_state_dict(state_dict)
    model.eval()
    text_data = data[i]['df']['tweet'].to_list()
    predictions = []
    
    with torch.no_grad():
        for text in tqdm(text_data):
            logits = model([f"{text}"])
            predictions.append(logits)

    predictions = torch.cat(predictions, 0)
    softmax_tensor = F.softmax(predictions)
    argmax_tensor = torch.argmax(softmax_tensor, 1)
    np_pred = argmax_tensor.cpu().numpy()

    df_pred = data[i]['df'].copy()
    df_pred['prediction'] = np_pred
    df_pred.to_csv(f"{out_path}/{i}.csv", index=False)

Processing scenario 1


100%|██████████| 381/381 [00:00<00:00, 2048.41it/s]
  softmax_tensor = F.softmax(predictions)


Processing scenario 3


100%|██████████| 381/381 [00:00<00:00, 2822.27it/s]
  softmax_tensor = F.softmax(predictions)


Processing scenario 5


100%|██████████| 381/381 [00:00<00:00, 2591.76it/s]
  softmax_tensor = F.softmax(predictions)
