In [None]:
# # Prepare the Google Colab environment and build the handmade library
# !git clone https://github.com/kaenova/Headline_Detection.git
# %cd "/content/Headline_Detection"

# !make lib

# %cd "/content/"

# print("Please upload '4. Processed.zip'")
# from google.colab import files
# files.upload()

# !unzip "/content/4. Processed.zip"

# print("Please upload .env")
# from google.colab import files
# files.upload()

In [None]:
# # Reset Google Colab Environment
# %cd "/content/"
# !rm -fr Headline_Detection

# Load Config and Env

In [1]:
# Load env for S3 uploads
import os

from dotenv import load_dotenv
from kaelib.s3 import check_required_env_vars

# Read the .env file into the process environment first ...
load_dotenv()
# ... then run the kaelib check on it.
# NOTE(review): presumably this fails when a variable needed for the S3 upload
# is missing — confirm in kaelib.s3.
check_required_env_vars()

# Load Dataset

In [None]:
import os
import pandas as pd

# Where the processed scenario folders live depends on where the notebook runs.
environment = 'local'
if environment == 'local':
    data_path = "../../data/4. Processed/"
elif environment == 'colab':
    data_path = "/content/4. Processed/"
else:
    # A bad configuration value, not a failed import.
    raise ValueError(f"Unknown data environment: {environment!r}")


# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the order in which scenarios are processed later) reproducible.
folders = sorted(os.listdir(data_path))

# scenario_data maps {scenario id (int): {csv file name: DataFrame}}.
scenario_data = {}

for folder in folders:
    # os.path.join avoids the doubled "/" the f-string produced, since
    # data_path already ends with a separator.
    current_data_path = os.path.join(data_path, folder)
    if not os.path.isdir(current_data_path):
        continue
    data_dict = {}
    for file in sorted(os.listdir(current_data_path)):
        # Only CSV files are datasets; stray entries (OS metadata files,
        # checkpoint folders, ...) would make pd.read_csv fail.
        if not file.endswith(".csv"):
            continue
        data_dict[file] = pd.read_csv(os.path.join(current_data_path, file))
    # Scenario folders are named by their integer id.
    scenario_data[int(folder)] = data_dict

In [None]:
import torch
import pandas
import typing
from torch.utils.data import Dataset
from kaelib.processor import TextProcessingPipeline


class TextClassificationDataset(Dataset):
    """Map-style dataset pairing raw text with integer class labels.

    Args:
        df: Frame holding one example per row.
        x_column_name: Column with the text; values are cast to ``str``.
        y_column_name: Column with the labels; values are cast to ``int``.
        preprocessor: Optional pipeline whose ``process_corpus`` is applied to
            the selected text every time an item is fetched.
    """

    def __init__(
        self,
        df: "pandas.DataFrame",
        x_column_name: "str" = "tweet",
        y_column_name: "str" = "labels",
        preprocessor: "typing.Optional[TextProcessingPipeline]" = None,
    ):
        self.x = df[x_column_name].astype(str).to_list()
        self.y = torch.tensor(df[y_column_name].astype(int).to_list())
        assert len(self.x) == len(self.y)
        self.preprocessor = preprocessor

    def __len__(self):
        """Return the number of examples."""
        return len(self.x)

    def _process_idx_text(self, idx):
        """Return the text selected by ``idx`` as a list, preprocessed if configured.

        A slice already yields a list of strings; a single index is wrapped in
        a one-element list so the preprocessor always receives a list.
        """
        data = self.x[idx] if isinstance(idx, slice) else [self.x[idx]]
        if self.preprocessor is not None:
            data = self.preprocessor.process_corpus(data)
        return data

    def __getitem__(self, idx):
        """Return ``(texts, labels)`` for an integer index or a slice.

        ``texts`` is always a list (one element for an integer index);
        ``labels`` is the matching entry or slice of the label tensor.
        """
        processed_corpus = self._process_idx_text(idx)
        return processed_corpus, self.y[idx]

    def __repr__(self) -> str:
        """Preview up to the first five ``text : label`` pairs, one per line."""
        # Cap the preview at the dataset size so short datasets do not raise
        # IndexError.
        preview_size = min(5, len(self.x))
        return "\n".join(
            [f"{self.x[i]} : {self.y[i]}" for i in range(preview_size)]
        )


# Wrap every loaded DataFrame in a TextClassificationDataset, keeping the
# {scenario: {file name: ...}} layout of scenario_data.
scenario_datasets = {}
for scenario, frames in scenario_data.items():
    scenario_datasets[scenario] = {
        data_type: TextClassificationDataset(frame)
        for data_type, frame in frames.items()
    }

# Modelling

In [None]:
# Hyperparameters
# seq_length    -> tokens per sentence fed to the model
# learning_rate -> step size of the Adam optimizer
# batch_size    -> examples per DataLoader batch
hyper_params = dict(
    seq_length=256,
    learning_rate=1e-5,
    batch_size=32,
)

In [None]:
# https://ieeexplore.ieee.org/document/8577620
# https://ieeexplore.ieee.org/abstract/document/8589431
# https://ieeexplore.ieee.org/document/8691381
# https://aclanthology.org/D14-1181.pdf
# https://arxiv.org/abs/2304.03208

import math
import torch
import torch.nn as nn
import torchnlp.nn as nnlp
import lightning.pytorch as pl
import torch.nn.functional as F

from torch.optim.lr_scheduler import ExponentialLR
from torchnlp.word_to_vector import FastText
from torchmetrics.classification import F1Score, Accuracy, Recall, Precision

class CNNClassifier(pl.LightningModule):
    """Sentence classifier: fastText word vectors -> CNN encoder -> linear head.

    Args:
        seq_length: Number of tokens every sentence is truncated to (and padded
            to, when ``fast_text_pad_sequence`` is true).
        out_feature: Number of output classes.
        fast_text_lang: Language code passed to torchnlp's ``FastText`` loader.
        fast_text_pad_sequence: Pad short sentences with ``"<PAD>"`` tokens up
            to ``seq_length``. When false, all sentences in a batch must
            already have the same number of tokens to be batched together.
        conv_num_filters: Number of filters per kernel size in the CNN encoder.
        conv_kernels: Kernel sizes of the CNN encoder.
        feed_forward_dropout: Dropout probability applied before the linear layer.
    """

    def __init__(
        self,
        seq_length: "int",
        out_feature: "int",
        fast_text_lang: 'str' = 'id',
        fast_text_pad_sequence: 'bool' = True,
        conv_num_filters: int = 100,
        conv_kernels: "tuple[int, ...]" = (3, 4, 5),
        feed_forward_dropout: 'float' = 0.5,
    ) -> None:
        super().__init__()

        # Config
        self.seq_length = seq_length
        self.fast_text_pad_sequence = fast_text_pad_sequence

        # Layer
        self.fasttext = FastText(fast_text_lang)

        # 300 is the embedding size handed to the encoder.
        # NOTE(review): assumes the loaded fastText vectors are 300-dimensional — confirm.
        self.conv_layer = nnlp.CNNEncoder(300, conv_num_filters, conv_kernels)

        # The head is sized for one group of `conv_num_filters` features per
        # kernel size coming out of the encoder.
        self.classification_head = nn.Sequential(
            nn.Dropout(feed_forward_dropout),
            nn.Linear(len(conv_kernels) * conv_num_filters, out_feature),
        )

        # Scorer
        self.f1_scorer = F1Score(task="multiclass", num_classes=out_feature)
        self.accuracy_scorer = Accuracy(task="multiclass", num_classes=out_feature)
        self.precision_scorer = Precision(task="multiclass", num_classes=out_feature)
        self.recall_scorer = Recall(task="multiclass", num_classes=out_feature)

    def _forward_fasttext(self, x: "list[str]"):
        """Embed a batch of sentences with fastText.

        Each sentence is split on single spaces, truncated to ``seq_length``
        tokens and, when padding is enabled, filled up to ``seq_length`` with
        ``"<PAD>"`` tokens before the word vectors are looked up.

        Returns:
            One tensor with the per-sentence embeddings concatenated along the
            first dimension, on the module's device (an empty tensor for an
            empty batch).
        """
        sentence_embeddings = []
        for sentence in x:
            sentence_seq = sentence.split(" ")[: self.seq_length]
            if self.fast_text_pad_sequence:
                # Pad to the configured length (previously hard-coded to 256,
                # which was only right when seq_length == 256).
                missing = self.seq_length - len(sentence_seq)
                sentence_seq.extend(["<PAD>"] * missing)
            sentence_embeddings.append(
                self.fasttext[sentence_seq].unsqueeze(0).to(self.device)
            )
        if not sentence_embeddings:
            return torch.tensor([]).to(self.device)
        # A single concatenation instead of growing the batch tensor inside
        # the loop, which re-copied the whole batch for every sentence.
        return torch.cat(sentence_embeddings)

    def forward(self, x: "list[str]"):
        """Return one row of class logits per input sentence."""
        x = self._forward_fasttext(x)
        x = self.conv_layer(x)
        x = self.classification_head(x)

        return x

    def _shared_step(self, batch, stage: "str", loss_on_prog_bar: "bool", full_metrics: "bool"):
        """Run one batch and log ``{stage}_loss`` plus the stage's metrics.

        Args:
            batch: ``(x, y)`` as produced by the data loader.
            stage: Prefix of every logged key (``training``/``validation``/``test``).
            loss_on_prog_bar: Whether the loss is also shown on the progress bar.
            full_metrics: Log precision and recall in addition to F1 and accuracy.

        Returns:
            The cross-entropy loss of the batch.
        """
        x, y = batch
        # x[0] holds the batch of sentences: the dataset returns each text as
        # a one-element list, so the collated batch arrives wrapped in an
        # outer list (assumes the default DataLoader collate — TODO confirm).
        pred = self.forward(x[0])
        loss = F.cross_entropy(pred, y)
        self.log(f"{stage}_loss", loss, prog_bar=loss_on_prog_bar)

        scorers = [("f1", self.f1_scorer), ("accuracy", self.accuracy_scorer)]
        if full_metrics:
            scorers += [
                ("precision", self.precision_scorer),
                ("recall", self.recall_scorer),
            ]
        for metric_name, scorer in scorers:
            self.log(f"{stage}_{metric_name}", scorer(pred, y), prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute and return the training loss; logs loss, F1 and accuracy."""
        return self._shared_step(
            batch, "training", loss_on_prog_bar=True, full_metrics=False
        )

    def validation_step(self, batch, batch_idx):
        """Log validation loss, F1, accuracy, precision and recall for one batch."""
        self._shared_step(
            batch, "validation", loss_on_prog_bar=False, full_metrics=True
        )

    def test_step(self, batch, batch_idx):
        """Log test loss, F1, accuracy, precision and recall for one batch."""
        self._shared_step(
            batch, "test", loss_on_prog_bar=False, full_metrics=True
        )

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters.

        The learning rate is read from the notebook-level ``hyper_params`` dict.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=hyper_params['learning_rate'])
        return optimizer



In [None]:
# Class that create_model() instantiates.
model_module = CNNClassifier


def create_model():
    """Build a fresh two-class model with the configured sequence length."""
    num_classes = 2
    return model_module(hyper_params['seq_length'], num_classes)

In [None]:
# Smoke test: push two short sentences through a fresh model and display the
# shape of the output (should be one row per sentence, one column per class).
model = create_model()
sample_output = model(['hello my name', 'hello are you'])
sample_output.shape

# Train, Val, Test

In [2]:
# Name used for the TensorBoard log folder and the S3 upload prefix.
model_name = "cnn"
# Passed to every DataLoader as num_workers.
available_cpus = 0

In [None]:
import datetime
import random
import torch
import numpy as np

from torch.utils.data import DataLoader
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint

# A single timestamp for the whole cell, so every scenario trained in this run
# shares the same run-id prefix.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

for scenario, splits in scenario_datasets.items():
    # Reset all three RNGs for every scenario so each one starts from the same state.
    for seed_rng in (np.random.seed, random.seed, torch.manual_seed):
        seed_rng(2023)

    run_id = f"{timestamp}_scenario_{scenario}"

    # Scenario 0 is not trained.
    if scenario == 0:
        continue

    model = create_model()

    # Dataset
    train_dataset, test_dataset, validation_dataset = (
        splits[file_name] for file_name in ("train.csv", "test.csv", "validation.csv")
    )
    # NOTE(review): the training loader does not shuffle — confirm this is intended.
    loader_options = dict(batch_size=hyper_params['batch_size'], num_workers=available_cpus)
    train_loader = DataLoader(train_dataset, **loader_options)
    validation_loader = DataLoader(validation_dataset, **loader_options)
    test_loader = DataLoader(test_dataset, **loader_options)

    # Logs go to tensorboard/<model_name>/<run_id>.
    tb_log = TensorBoardLogger(save_dir="tensorboard", name=model_name, version=run_id)
    tb_log.log_hyperparams(hyper_params)

    # One checkpoint tracking the validation loss, one tracking the training
    # loss, and early stopping after 5 validation checks without a 1e-3 improvement.
    early_stop_callback = EarlyStopping(
        monitor="validation_loss", min_delta=1e-3, patience=5, mode="min"
    )
    checkpoint_callback_val = ModelCheckpoint(
        monitor="validation_loss", filename='val_{epoch}-{validation_loss:.2f}'
    )
    checkpoint_callback_train = ModelCheckpoint(
        monitor="training_loss", filename='train_{epoch}-{training_loss:.2f}'
    )

    trainer = pl.Trainer(
        logger=tb_log,
        callbacks=[early_stop_callback, checkpoint_callback_val, checkpoint_callback_train],
    )
    trainer.fit(
        model=model, train_dataloaders=train_loader, val_dataloaders=validation_loader
    )
    print(run_id)
    trainer.test(model=model, dataloaders=test_loader)


# Upload Logging and Checkpoint to S3

In [3]:
import os
import posixpath
from kaelib.s3_threading import upload_folder_to_s3_threads_workers

# Fail with a clear message instead of an opaque TypeError from the path join
# when the variable has not been loaded.
bucket_folder = os.getenv("BUCKET_FOLDER")
if bucket_folder is None:
    raise RuntimeError(
        "BUCKET_FOLDER environment variable is not set; load the .env file first"
    )

# S3 key prefixes always use "/" as separator, so join with posixpath rather
# than os.path (which would insert backslashes on Windows).
s3_folders = posixpath.join(bucket_folder, model_name)
upload_folder_to_s3_threads_workers(f"tensorboard/{model_name}", s3_folders)

Uploading: 100%|██████████| 63/63 [01:27<00:00,  1.40s/file, file=Log and Checkpoints/cnn/2023-04-17_21-18-37_scenario_6/checkpoints/val_epoch=0-validation_loss=0.68.ckpt] 

Upload completed successfully!



