1. Tokenize the given text data with some off-the-shelf software.
2. Build the vocabulary (like the dictionary object in python) to map the token into some unique ID.
3. Select the pretrained embedding (Glove, Fasttext, Word2vec) as the initialization of your embedding layer. (Not necessary, but recommended)
4. Construct your transformer model and finally end up with some simple feed forward module.
5. Choose the suitable optimizer (Adam might be not suitable) and activation function (ReLU might be not suitable. Try Tanh or Swish?)
6. Try some tricks like learning rate scheduler.
7. Check some tutorial such as [Here](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html).

Given the headline and the content of the news, you need to train a model to correctly classify the news into 4 different category:
1. Sports
2. Business
3. Tech
4. Media

train.csv contains 4 columns:
id,category,headline,short_description

test.csv contains 3 columns:
id,headline,short_description

submission.csv contains 2 columns:
id,category

In [1]:
import torch
import csv
import math
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import warnings
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
import tqdm
from nltk.tokenize.treebank import TreebankWordDetokenizer
import nltk
import csv
from nltk.corpus import stopwords
from torch.nn.utils.rnn import pad_sequence
import numpy as np

warnings.filterwarnings("ignore")

# # read train.csv
# with open('train.csv', newline='') as csvfile:
#     rows = csv.reader(csvfile)
#     data = []
#     for row in rows:
#         data.append(row)
#     data = data[1:]

# # read test.csv
# with open('test.csv', newline='') as csvfile:
#     rows = csv.reader(csvfile)
#     test = []
#     for row in rows:
#         test.append(row)
#     test = test[1:]

# # split train.csv into train and valid
# train = data[:int(len(data)*0.8)]
# valid = data[int(len(data)*0.8):]

# # get the category of train, valid, and test
# train_category = [row[1] for row in train]
# valid_category = [row[1] for row in valid]

# # get the headline and short_description of train, valid, and test
# train_headline = [row[2] for row in train]
# valid_headline = [row[2] for row in valid]
# test_headline = [row[1] for row in test]
# train_short_description = [row[3] for row in train]
# valid_short_description = [row[3] for row in valid]
# test_short_description = [row[2] for row in test]

In [2]:
nltk.download('stopwords')

# remove stopwords
# stop_words = set(stopwords.words('english'))
# train_headline = [[word for word in headline.split() if word not in stop_words] for headline in train_headline]
# valid_headline = [[word for word in headline.split() if word not in stop_words] for headline in valid_headline]
# test_headline = [[word for word in headline.split() if word not in stop_words] for headline in test_headline]
# train_short_description = [[word for word in short_description.split() if word not in stop_words] for short_description in train_short_description]
# valid_short_description = [[word for word in short_description.split() if word not in stop_words] for short_description in valid_short_description]
# test_short_description = [[word for word in short_description.split() if word not in stop_words] for short_description in test_short_description]

[nltk_data] Downloading package stopwords to /home/mllab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# combine headline and short_description
# train_text = [train_headline[i] + train_short_description[i] for i in range(len(train_headline))]
# valid_text = [valid_headline[i] + valid_short_description[i] for i in range(len(valid_headline))]
# test_text = [test_headline[i] + test_short_description[i] for i in range(len(test_headline))]

In [4]:
detokenizer = TreebankWordDetokenizer()

# detokenize
# train_text = [detokenizer.detokenize(text) for text in train_text]
# valid_text = [detokenizer.detokenize(text) for text in valid_text]
# test_text = [detokenizer.detokenize(text) for text in test_text]

In [5]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, text, category):
        self.text = text
        self.category = category

    def __getitem__(self, idx):
        return self.text[idx], self.category[idx]

    def __len__(self):
        return len(self.text)
    
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, text):
        self.text = text

    def __getitem__(self, idx):
        return self.text[idx]

    def __len__(self):
        return len(self.text)

In [6]:
# read train.csv with pandas
import pandas as pd
df = pd.read_csv('train.csv')
# combine headline and short_description into text, and remove idx
df['text'] = df['headline'] + ' ' + df['short_description']
# build dataset with text and category
dataset = MyDataset(df['text'], df['category'])
# split dataset into train and valid with random_split
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), len(dataset)-int(len(dataset)*0.8)])

In [7]:
test_df = pd.read_csv('test.csv')
test_df['text'] = test_df['headline'] + ' ' + test_df['short_description']
test_dataset = TestDataset(test_df['text'])

In [8]:
text_vec = GloVe(name='6B', dim=100)

In [9]:
tokenizer = get_tokenizer('basic_english')

text_pipeline = lambda x: text_vec.get_vecs_by_tokens(tokenizer(x), lower_case_backup=False)

label_pipeline = lambda x: int(x) - 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, padding_value=0)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    return label_list.to(device), text_list.to(device)

def test_collate_batch(batch):
    text_list = []
    for (_text) in batch:
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, padding_value=0)
    return text_list.to(device)

class TransformerModel(nn.Module):

    def __init__(self, num_class: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5, activatioln: str = 'relu') -> None:
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout,)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding.from_pretrained(text_vec.vectors, freeze=False)
        self.decoder = nn.Linear(d_model, num_class)
        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output=torch.mean(output,dim=0)
        output = self.decoder(output)
        return output

def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of ``-inf``, with zeros on ``diag``."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
    
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [10]:
batch = 16

train_dataloader = DataLoader(train_dataset, batch_size=batch, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=test_collate_batch)

In [14]:
# show the content of train_dataloader
for i, (label, text) in enumerate(train_dataloader):
    print(label)
    print(text)
    break

ValueError: cannot convert float NaN to integer

In [12]:
num_class = 4
emsize = 100  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 4  # number of heads in ``nn.MultiheadAttention``
dropout = 0.25  # dropout probability
total_epoch = 200
act_funct = 'silu'

model = TransformerModel(num_class, emsize, nhead, d_hid, nlayers, dropout, activatioln=act_funct).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss().to(device)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_epoch, eta_min=1e-5)

In [13]:
best_acc, best_loss = 0, 1e9

for i in range(total_epoch):
    train_acc, train_loss = 0, 0
    valid_acc, valid_loss = 0, 0

    model.train()
    train_loop = tqdm.tqdm((train_dataloader), total=len(train_dataloader))
    for idx, (label, text) in enumerate(train_loop):
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        pred = torch.argmax(F.softmax(output, dim=1) ,dim=1)
        # print("train_pred", pred)
        # print("train_label", label)
        train_acc += (pred == label).sum().item()
        train_loop.set_description(f"Epoch [{i+1}/{total_epoch}]")
        train_loop.set_postfix(train_loss=train_loss/(len(train_dataloader) * batch), train_acc=train_acc/(len(train_dataloader) * batch))

    model.eval()
    total_loss = 0.
    valid_loop = tqdm.tqdm((valid_dataloader), total=len(valid_dataloader))
    with torch.no_grad():
        for idx, (label, text) in enumerate(valid_loop):
            output = model(text)
            loss = criterion(output, label)
            valid_loss += loss.item()
            pred = torch.argmax(F.softmax(output, dim=1), dim=1)
            # print("valid_pred", pred)
            # print("valid_label", label)
            valid_acc += (pred == label).sum().item()
            valid_loop.set_description(f"Epoch [{i+1}/{total_epoch}]")
            valid_loop.set_postfix(valid_loss=valid_loss/(len(valid_dataloader) * batch), valid_acc=valid_acc/(len(valid_dataloader) * batch))

    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), 'best_loss_model.pth')
    scheduler.step()

  0%|          | 0/100 [00:00<?, ?it/s]


ValueError: invalid literal for int() with base 10: "Uber, Reeling From Controversies, Loses Another Executive Uber's head of communications leaves as the bad publicity piles up."

In [None]:
# load best model
model.load_state_dict(torch.load('best_loss_model.pth'))

pred = np.zeros(len(test_dataloader))
index = 0

test_loop = tqdm.tqdm((test_dataloader), total=len(test_dataloader))
for idx, (text) in enumerate(test_loop):
    with torch.no_grad():
        output = model(text)
        pred[idx] = torch.argmax(F.softmax(output, dim=1), dim=1) + 1
    index += 1

with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'category'])
    for i in range(len(pred)):
        writer.writerow([i+1, int(pred[i]) + 1])