CNN1D for text classification.

Inaya Rahmanisa


References:
- Tutorial Conv1D and LSTM from scratch by Pak Alfan Farizki Wicaksono, Fasilkom UI
- https://cezannec.github.io/CNN_Text_Classification/

In [104]:
import torch
import pandas as pd
import numpy as np

from collections import Counter
from torch import nn, optim
from torch.utils.data import DataLoader
class Dataset(torch.utils.data.Dataset):
    def __init__(
        self, sequence_length, documents, labels,
    ):
        self.sequence_length = sequence_length
        self.words = self.load_words(documents)
        self.uniq_words = self.get_uniq_words()

        # id vocab mulai dari 1, bukan 0; 0 untuk [PAD]
        # di-pad supaya length doc-nya sama semua
        self.index_to_word = {(index + 1): word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: (index + 1) for index, word in enumerate(self.uniq_words)}
        self.index_to_word[0] = "[PAD]"
        self.word_to_index["[PAD]"] = 0

        self.labels = labels
        self.docs = []
        for doc in documents:
            self.docs.append(self.to_ids(doc))

    def to_ids(self, doc):
        doc = [self.word_to_index[w] for w in self.tokenize(doc)]
        if len(doc) >= self.sequence_length:
            doc = doc[:self.sequence_length]
        else:
            doc += [0] * (self.sequence_length - len(doc))
        return doc

    def tokenize(self, text):
        return text.split(' ')

    def load_words(self, documents):
        text = ""
        for doc in documents:
          text += doc + " "
        return self.tokenize(text)

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, index):
        return (
            torch.tensor(self.docs[index]),
            torch.tensor(self.labels[index]),
        )

In [105]:
import torch
import torch.nn as nn
class Linear(nn.Module):
    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.W = nn.Parameter(torch.Tensor(self.n_inputs, self.n_outputs))
        self.init_weights()
    def init_weights(self):
        for param in self.parameters():
            nn.init.uniform_(param, -0.1, 0.1)
    def forward(self, x):
        return x @ self.W

In [106]:
class Conv1D(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_width):
        """ kernel_width harus ganjil """
        super(Conv1D, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_width = kernel_width
        self.pad_size = (self.kernel_width - 1) // 2
        self.kernel = Linear(kernel_width * in_channels, out_channels) # ini yg menentukan size keluaran outputnya (karena size W tergantung sama output size)
    def forward(self, x):
        # padding untuk convolution
        x = nn.functional.pad(x, (self.pad_size, self.pad_size), "constant", 0)
        
        l = []
        # dilakukan sebanyak shape[2] - k + 1
        for i in range(self.pad_size, x.shape[2] - self.pad_size): # shape[2] ini shape dimensi paling dalam, contoh tadinya 5 setelah dipad jadi 7
            patch = x[:, :, i - self.pad_size: i + self.pad_size + 1] # kalau ini sliding patch nya dari awal (0) sampai kernel_width (3)
            # print(patch)
            patch = patch.reshape(x.shape[0], self.in_channels * self.kernel_width) # gabung (flatten patch) jadi ukurannya in_channel*kernel_width. x.shape[0] itu channel size, banyaknya matrix yang sama2 ukuran in_channel*kernel_width
            # print(patch)
            l.append(self.kernel(patch))
           
            
        return torch.stack(l, dim=2)


In [107]:
class CNNNet(nn.Module):

    def __init__(self, vocab_size, input_channel, embedding_dim, hidden_dim, output_dim):

        super(CNNNet,self).__init__()

        # Embedding layer
        # padding_idx (int, optional) – If specified, the entries at padding_idx do
        # not contribute to the gradient; therefore, the embedding vector at
        # padding_idx is not updated during training, i.e. it remains as a fixed “pad”.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # CNN layer process the vector sequences
        self.cnn = Conv1D(input_channel, output_dim, 3)

        # Dense layer to predict
        self.fc = nn.Linear(hidden_dim, 1) # ouput_dim is 1 for text classification

        # Prediction activation function
        self.sigmoid = nn.Sigmoid()


    def forward(self, text):
        embedded = self.embedding(text)

        
        output = self.cnn(embedded)
        output = torch.mean(output, dim=1) #mean pooling
        # output = torch.max(output, dim=1) #max pooling as used by cezannec ref
        # print(f"output shape after mean {output.shape}")
        output = self.fc(output)
        output = self.sigmoid(output)
        return output

In [108]:
def train(dataset, model, batch_size, max_epochs):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    print(len(dataset.index_to_word))
    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

In [109]:
documents = ["buku bagus rapih cerdas dan menarik",
             "rumah rapih cantik dan bersih",
             "hotel kotor berisik dan bau",
             "kantin jorok kotor mahal dan panas"]
labels = [[1.], [1.], [0.], [0.]]
seq_length = 8
dataset = Dataset(seq_length, documents, labels)
print(dataset.index_to_word)
model = CNNNet(len(dataset.index_to_word), seq_length, 16, 16, 3)
train(dataset, model, 2, max_epochs=200)

{1: 'dan', 2: 'rapih', 3: 'kotor', 4: 'buku', 5: 'bagus', 6: 'cerdas', 7: 'menarik', 8: 'rumah', 9: 'cantik', 10: 'bersih', 11: 'hotel', 12: 'berisik', 13: 'bau', 14: 'kantin', 15: 'jorok', 16: 'mahal', 17: 'panas', 18: '', 0: '[PAD]'}
19
{'epoch': 0, 'batch': 0, 'loss': 0.6965881586074829}
{'epoch': 0, 'batch': 1, 'loss': 0.6845444440841675}
{'epoch': 1, 'batch': 0, 'loss': 0.6910114288330078}
{'epoch': 1, 'batch': 1, 'loss': 0.6828407049179077}
{'epoch': 2, 'batch': 0, 'loss': 0.6869640350341797}
{'epoch': 2, 'batch': 1, 'loss': 0.6808914542198181}
{'epoch': 3, 'batch': 0, 'loss': 0.6832168698310852}
{'epoch': 3, 'batch': 1, 'loss': 0.6788530349731445}
{'epoch': 4, 'batch': 0, 'loss': 0.6795915365219116}
{'epoch': 4, 'batch': 1, 'loss': 0.6767730712890625}
{'epoch': 5, 'batch': 0, 'loss': 0.6760233640670776}
{'epoch': 5, 'batch': 1, 'loss': 0.6746698617935181}
{'epoch': 6, 'batch': 0, 'loss': 0.6724791526794434}
{'epoch': 6, 'batch': 1, 'loss': 0.6725500822067261}
{'epoch': 7, 'batch

In [110]:
# prediction
model.eval()

with torch.no_grad():
  sent = "hotel dan kantin rapih bersih menarik dan bagus"
  sent = torch.tensor([dataset.to_ids(sent)])
  print(model(sent))

tensor([[0.8647]])
