1. Tokenize the given text data with some off-the-shelf software.
2. Build a vocabulary (for example, a Python dictionary) that maps each token to a unique ID.
3. Select the pretrained embedding (Glove, Fasttext, Word2vec) as the initialization of your embedding layer. (Not necessary, but recommended)
4. Construct your transformer model and finish it with a simple feed-forward module that produces the class scores.
5. Choose a suitable optimizer (Adam might not be suitable) and activation function (ReLU might not be suitable; try Tanh or Swish).
6. Try some tricks, such as a learning-rate scheduler.
7. Check a tutorial, such as [this one](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html).

Given the headline and the content of a news item, you need to train a model that correctly classifies the item into one of 4 categories:
1. Sports
2. Business
3. Tech
4. Media

train.csv contains 4 columns:
id,category,headline,short_description

test.csv contains 3 columns:
id,headline,short_description

submission.csv contains 2 columns:
id,category

In [104]:
import torch
import csv
import numpy as np
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

def _read_csv_rows(path):
    """Return the data rows of the CSV file at *path*, without its header row.

    Each row is a list of strings as produced by ``csv.reader``. An empty
    file yields an empty list.
    """
    # newline='' lets the csv module handle newlines embedded in quoted fields.
    with open(path, newline='', encoding='utf-8') as csvfile:
        rows = csv.reader(csvfile)
        next(rows, None)  # skip the header row (no error on an empty file)
        return list(rows)


# read train.csv (columns: id, category, headline, short_description)
data = _read_csv_rows('train.csv')

# read test.csv (columns: id, headline, short_description)
test = _read_csv_rows('test.csv')

# split train.csv into train (first 80%) and valid (last 20%)
# NOTE(review): the split follows file order with no shuffling; if train.csv
# is sorted by category the validation set will not be representative.
split_index = int(len(data) * 0.8)
train = data[:split_index]
valid = data[split_index:]

# get the category of train and valid
train_category = [row[1] for row in train]
valid_category = [row[1] for row in valid]
# test.csv has no category column (row[1] is the headline there), so the test
# labels are unknown placeholders until the model predicts them.
test_category = [None] * len(test)

# get the headline and short_description of train, valid, and test
# (the test columns are shifted left by one because category is absent)
train_headline = [row[2] for row in train]
valid_headline = [row[2] for row in valid]
test_headline = [row[1] for row in test]
train_short_description = [row[3] for row in train]
valid_short_description = [row[3] for row in valid]
test_short_description = [row[2] for row in test]

In [105]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# remove stopwords
stop_words = set(stopwords.words('english'))


def _remove_stopwords(texts):
    """Split each text on whitespace and drop English stop words.

    Returns one list of remaining words per input text. The membership test
    lower-cases each word because the NLTK stop-word list is lowercase; a
    case-sensitive test would keep capitalised stop words such as "The",
    which then become ordinary tokens once the text is lower-cased later.
    The kept words retain their original casing.
    """
    return [
        [word for word in text.split() if word.lower() not in stop_words]
        for text in texts
    ]


train_headline = _remove_stopwords(train_headline)
valid_headline = _remove_stopwords(valid_headline)
test_headline = _remove_stopwords(test_headline)
train_short_description = _remove_stopwords(train_short_description)
valid_short_description = _remove_stopwords(valid_short_description)
test_short_description = _remove_stopwords(test_short_description)

[nltk_data] Downloading package stopwords to /home/mllab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [106]:
# Concatenate each sample's headline words with its short_description words
# so every sample becomes a single list of words.
train_data = [
    headline + description
    for headline, description in zip(train_headline, train_short_description)
]
valid_data = [
    headline + description
    for headline, description in zip(valid_headline, valid_short_description)
]
test_data = [
    headline + description
    for headline, description in zip(test_headline, test_short_description)
]

In [107]:
# build vocabulary
tokenizer = get_tokenizer('basic_english')


def _as_text(sample):
    """Return *sample* as one string; a list of words is joined with spaces."""
    return sample if isinstance(sample, str) else " ".join(sample)


def _tokenize(sample):
    """Tokenize one sample (a raw string or a list of words) with ``tokenizer``."""
    # The tokenizer works on strings, while the combined samples are lists of
    # words, so they are joined back into text first.
    return tokenizer(_as_text(sample))


# NOTE: `specials=` is only accepted by newer torchtext releases; an older one
# fails with "TypeError: build_vocab_from_iterator() got an unexpected keyword
# argument 'specials'". Upgrade torchtext if that error appears.
train_vocab = build_vocab_from_iterator(map(_tokenize, train_data), specials=["<unk>"])
# Tokens that never occurred in the training data map to "<unk>".
train_vocab.set_default_index(train_vocab["<unk>"])

# Validation text must be encoded with the TRAINING vocabulary: the embedding
# rows are learned for training ids, so a vocabulary built separately from the
# validation data would assign unrelated ids to the same words.
valid_vocab = train_vocab


def train_text_pipeline(x):
    """Convert one training sample into a list of vocabulary ids."""
    return train_vocab(_tokenize(x))


def valid_text_pipeline(x):
    """Convert one validation sample into a list of vocabulary ids."""
    return valid_vocab(_tokenize(x))


def train_label_pipeline(x):
    """Convert a category value into a zero-based class index."""
    # assumes categories are the integers 1..4 stored as text -- TODO confirm
    return int(x) - 1


valid_label_pipeline = train_label_pipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _collate_batch(batch, label_pipeline, text_pipeline):
    """Pack ``(label, text)`` samples into the tensors ``nn.EmbeddingBag`` expects.

    Returns a tuple ``(labels, token_ids, offsets)`` moved to ``device``:
    ``token_ids`` is every sample's ids concatenated into one 1-D tensor and
    ``offsets[i]`` is the position in it where sample ``i`` starts.
    """
    label_list, text_list, offsets = [], [], [0]
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    # Drop the last length: the running sum of [0, len_0, ..., len_{n-2}]
    # gives the start index of each sample.
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)


def train_collate_fn(batch):
    """Collate a batch of ``(category, text)`` training samples."""
    return _collate_batch(batch, train_label_pipeline, train_text_pipeline)


def valid_collate_fn(batch):
    """Collate a batch of ``(category, text)`` validation samples."""
    return _collate_batch(batch, valid_label_pipeline, valid_text_pipeline)


# The collate functions unpack (label, text) pairs, so the loaders are fed the
# category paired with the combined text, not the raw 4-column CSV rows.
train_dataloader = DataLoader(list(zip(train_category, train_data)), batch_size=64, shuffle=True, collate_fn=train_collate_fn)
# Sample order does not affect validation metrics, so no shuffling is needed.
valid_dataloader = DataLoader(list(zip(valid_category, valid_data)), batch_size=64, shuffle=False, collate_fn=valid_collate_fn)
    

class TextClassificationModel(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` followed by one linear layer.

    The embedding bag pools the token embeddings of each sample into a single
    vector of size ``embed_dim``; the linear layer maps that vector to
    ``num_class`` scores.
    """

    def __init__(self, vocab_size: int, embed_dim: int, num_class: int) -> None:
        super().__init__()
        # sparse=True makes the embedding produce sparse gradients.
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self) -> None:
        """Re-draw both weight matrices from U(-0.5, 0.5) and zero the bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text: Tensor, offsets: Tensor) -> Tensor:
        """Return class scores for a batch given as flat token ids plus offsets.

        ``text`` holds the token ids of all samples concatenated and
        ``offsets`` the start index of each sample within ``text``.
        """
        pooled = self.embedding(text, offsets)
        logits = self.fc(pooled)
        return logits


TypeError: build_vocab_from_iterator() got an unexpected keyword argument 'specials'