# Sentiment Classification

The task is from [aidea](https://aidea-web.tw/topic/c4a666bb-7d83-45a6-8c3b-57514faf2901), the goal is to predict the sentiment of each article.

In [12]:
import torch
import numpy as np
import pandas as pd

## Data Preparation

In [18]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [5]:
train_df

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0
3,44656,my friend and i rented this one a few nights a...,0
4,38711,"Just about everything in this movie is wrong, ...",0
...,...,...,...
29336,8019,It 's one of the most honest films ever made a...,1
29337,453,An absorbing and unsettling psychological drama .,1
29338,13097,"Soylent Green IS...a really good movie, actual...",1
29339,26896,There just isn't enough here. There a few funn...,0


In [8]:
train_df.describe()

Unnamed: 0,ID,sentiment
count,29341.0,29341.0
mean,29348.411097,0.509662
std,17002.074346,0.499915
min,4.0,0.0
25%,14564.0,0.0
50%,29348.0,1.0
75%,44162.0,1.0
max,58681.0,1.0


In [9]:
train_df['sentiment'].value_counts()

1    14954
0    14387
Name: sentiment, dtype: int64

In [7]:
test_df

Unnamed: 0,ID,review
0,22622,Robert Lansing plays a scientist experimenting...
1,10162,"Well I've enjoy this movie, even though someti..."
2,17468,First things first - though I believe Joel Sch...
3,42579,I watched this movie on the grounds that Amber...
4,701,A certain sexiness underlines even the dullest...
...,...,...
29336,30370,It is difficult to rate a writer/director's fi...
29337,18654,"After watching this movie once, it quickly bec..."
29338,47985,"Even though i sat and watched the whole thing,..."
29339,9866,Warning Spoilers following. Superb recreation ...


In [10]:
sample_df = pd.read_csv('../data/sample_submission.csv')

In [11]:
sample_df

Unnamed: 0,ID,sentiment
0,22622,1
1,10162,1
2,17468,1
3,42579,1
4,701,1
...,...,...
29336,30370,1
29337,18654,1
29338,47985,1
29339,9866,1


### Data Split

In [43]:
from sklearn.model_selection import KFold # import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=123) # Define the split - into 2 folds 
kf.get_n_splits(train_df) # returns the number of splitting iterations in the cross-validator

10

In [41]:
X.iloc[[0,1,2]]

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0


In [45]:
X, y = train_df, train_df['sentiment'].to_numpy()
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [0 1 2 3 5 6 7 8 9] TEST: [4]
9 1
TRAIN: [1 2 3 4 5 6 7 8 9] TEST: [0]
9 1
TRAIN: [0 1 2 3 4 5 6 8 9] TEST: [7]
9 1
TRAIN: [0 1 2 3 4 6 7 8 9] TEST: [5]
9 1
TRAIN: [0 1 2 3 4 5 6 7 9] TEST: [8]
9 1
TRAIN: [0 1 2 4 5 6 7 8 9] TEST: [3]
9 1
TRAIN: [0 2 3 4 5 6 7 8 9] TEST: [1]
9 1
TRAIN: [0 1 2 3 4 5 7 8 9] TEST: [6]
9 1
TRAIN: [0 1 2 3 4 5 6 7 8] TEST: [9]
9 1
TRAIN: [0 1 3 4 5 6 7 8 9] TEST: [2]
9 1


### Preprocessing

In [47]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
counter = Counter()
for line in train_df['review']:
    counter.update(tokenizer(line))
vocab = Vocab(counter, min_freq=1)

In [118]:
Vocab([i[0] for i in counter.most_common(10)])

TypeError: list indices must be integers or slices, not str

In [122]:
Vocab(Counter(dict(counter.most_common(10))))

<torchtext.vocab.Vocab at 0x140640a30>

In [49]:
len(counter)

102969

In [55]:
[vocab[i] for i in ['I', 'am', 'aaaaaaaaaaaaa', '<pad>']]

[0, 241, 0, 1]

In [65]:
def text_pipeline(X):
    if isinstance(X, list):
        return [[vocab[i] for i in tokenizer(text)] for text in X]
    return [vocab[i] for i in tokenizer(X)]

In [68]:
text_pipeline("I am a good boy! ADJISDAKSD unkqwjs <pad> <pad>")

[13, 241, 5, 57, 412, 36, 0, 0, 1, 1]

In [67]:
text_pipeline(["I am a good", "boy!"])

[[13, 241, 5, 57], [412, 36]]

### Data Iteration

In [255]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch, use_bag=False):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    if use_bag:
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
        text_list = torch.cat(text_list)
    else:
        offsets = torch.tensor(offsets[1:], dtype=torch.int64)
        text_list = pad_sequence(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

df = train_df.iloc[:100]
train_iter = list(zip(df['review'], df['sentiment']))
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [154]:
for i in dataloader:
    print(i[0].shape, i[1].shape, i[2].shape)
    print(i)

torch.Size([8]) torch.Size([513, 8]) torch.Size([8])
torch.Size([8]) torch.Size([513, 8]) torch.Size([8])
(tensor([0, 1, 0, 0, 0, 1, 0, 1]), tensor([[  13,   11,  197,  ...,   14,   13, 5130],
        [ 295,  130,   10,  ...,  213,   33, 1483],
        [  14,   29,   29,  ...,    9,    8,    3],
        ...,
        [   0,  272,    0,  ...,    0,    0,    0],
        [   0,  319,    0,  ...,    0,    0,    0],
        [   0,    3,    0,  ...,    0,    0,    0]]), tensor([210, 513,  10, 233, 237, 276,  94, 309]))
torch.Size([8]) torch.Size([537, 8]) torch.Size([8])
torch.Size([8]) torch.Size([537, 8]) torch.Size([8])
(tensor([0, 1, 0, 1, 1, 1, 0, 0]), tensor([[   45,     2,    14,  ...,    51,  4700,    18],
        [    2,  3725,   373,  ...,    26,    12,  3034],
        [   87,    31,   119,  ...,   531,   732,    18],
        ...,
        [    0,     0, 85389,  ...,     0,     0,     0],
        [    0,     0,   373,  ...,     0,     0,     0],
        [    0,     0,     3,  ...,   

## Model Selection

In order to validate the performance of model, the frequently adopted solutions are cross validation and the usage of validation set. Since the size of training samples is small, cross validation would be a more appropriate strategy.

1. RNN-based model
2. Naive-bayes model
3. Bert model

### Baseline - MLP

In [87]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [88]:
train_iter = list(zip(train_df['review'], train_df['sentiment']))
num_class = len(set([label for (text, label) in train_iter]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [135]:
import time
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, F.one_hot(label, num_classes=num_class).type(torch.FloatTensor))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_count = 0
    all_preds, all_labels = list(), list()
    
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            predicted_label = predicted_label.argmax(1)
            all_preds += [predicted_label.detach().numpy()]
            all_labels += [label.detach().numpy()]
            total_count += label.size(0)
    print(all_preds, all_labels, total_count)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    prf = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    print(prf)
    return (all_preds == all_labels).mean()

In [136]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 1 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

# train_iter = list(zip(train_df.iloc[:1000]['review'], train_df.iloc[:1000]['sentiment']))

X, y = train_df.iloc[:1000]['review'], train_df['sentiment'].iloc[:1000].to_numpy()
# for train_index, test_index in kf.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

# train_dataset = list(train_iter)
# test_dataset = list(test_iter)
# num_train = int(len(train_dataset) * 0.95)
# split_train_, split_valid_ = \
#     random_split(train_dataset, [num_train, len(train_dataset) - num_train])

# train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
#                               shuffle=True, collate_fn=collate_batch)
# valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
#                               shuffle=True, collate_fn=collate_batch)
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
#                              shuffle=True, collate_fn=collate_batch)
print(model)
total_acc = []
for idx, (train_index, valid_index) in enumerate(kf.split(X)):
#     print("TRAIN:", train_index, "TEST:", test_index)
    print(f"Cross validation {idx}-fold")
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    train_iter = list(zip(X_train, y_train))
    valid_iter = list(zip(X_valid, y_valid))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE,
                          shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(valid_iter, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    cross_acc = None
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader)
        acc = evaluate(valid_dataloader)
        if cross_acc is not None and cross_acc > acc:
            scheduler.step()
        else:
            cross_acc = acc
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                                acc))
        print('-' * 59)
    total_acc += [cross_acc]
print(total_acc)
print(np.mean(total_acc))

TextClassificationModel(
  (embedding): EmbeddingBag(102971, 64, mode=mean)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)
Cross validation 0-fold
[array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1]), array([0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0])] [array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1]), array([0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0])] 100
(0.6933333333333334, 1.0, 0.8188976377952756, None)
-----------------------------------------------------------
| end of epoch   1 | time:  0.45s | valid ac

### 1. RNN-based model

In [256]:
import torch
from torch import nn
from torch.nn import functional as F


class Attention(nn.Module):
    # target is hidden_size
    def __init__(self, hidden_size, method='concat'):
        super(Attention, self).__init__()
        self.method = method
        if method not in ('dot', 'general', 'concat'):
            raise NotImplemented
        if method == 'general':
            self.attn = nn.Linear(hidden_size, hidden_size)
        elif method == 'concat':
            self.attn = nn.Linear(2 * hidden_size, hidden_size)
            self.v = nn.Linear(hidden_size, 1, bias=False)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        if hasattr(self, 'attn'):
            self.attn.weight.data.uniform_(-initrange, initrange)
            self.attn.bias.data.zero_()
        if hasattr(self, 'v'):
            self.v.weight.data.uniform_(-initrange, initrange)

    def dot_score(self, hidden, encoder_output):
        return torch.matmul(hidden, encoder_output)

    def general_score(self, hidden, encoder_output):
        attn = self.attn(encoder_output)
        return torch.matmul(hidden, attn)

    def concat_score(self, hidden, encoder_output):
        hidden_reshape = torch.unsqueeze(hidden, dim=0).repeat(encoder_output.size(0), 1, 1)
        attn = self.attn(torch.cat([hidden_reshape, encoder_output], dim=-1)).tanh()
        return self.v(attn).squeeze(dim=-1)

    def forward(self, hidden, encoder_output):
        # output = [lengths x batch_size x hidden_size]
        # hidden = [batch_size x hidden_size]
        attn_scores = None
        if self.method == 'dot':
            attn_scores = self.dot_score(hidden, encoder_output)
        elif self.method == 'general':
            attn_scores = self.general_score(hidden, encoder_output)
        elif self.method == 'concat':
            attn_scores = self.concat_score(hidden, encoder_output)

        # [lengths x batch_size] -> [batch_size x lengths]
        attn_scores = attn_scores.t()
        # return [batch_size x 1 x lengths]
        return F.softmax(attn_scores, dim=-1).unsqueeze(1)


In [257]:
t = torch.tensor([5,3,7,2,1])
sorted_t, idx = t.sort(descending=True)
print(sorted_t)
print(torch.gather(t, 0, torch.arange(0, idx.shape[0], dtype=torch.int64)))

tensor([7, 5, 3, 2, 1])
tensor([5, 3, 7, 2, 1])


In [261]:
from torch import nn
from torch.nn import init


def sort_sequence(inputs, lengths):
    sorted_lengths, sorted_idx = lengths.sort(descending=True)
    return inputs[sorted_idx], sorted_lengths, sorted_idx


class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, n_layers, dropout, num_classes, attention_mode, padding_idx=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, sparse=True, padding_idx=1)
        self.lstm = nn.LSTM(embed_size, hidden_size, n_layers, dropout=dropout, bidirectional=True, batch_first=True)
        self.attn = Attention(2 * hidden_size, attention_mode)
        self.fc = nn.Linear(2 * hidden_size, num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        for param in self.lstm.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.uniform_(param.data, -initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, text_lengths, hidden=None):
        sorted_lengths, sorted_idx = text_lengths.sort(descending=True)
        sorted_text = torch.index_select(text, -1, sorted_idx)
        emb = self.embedding(sorted_text)
        packed = nn.utils.rnn.pack_padded_sequence(emb, sorted_lengths)
        outputs, hidden = self.lstm(packed, hidden)
        hidden_state, _ = hidden
        hidden_state = hidden_state[-2:,:,:].view(1, -1, 2 * hidden_size).squeeze(0)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        attn_weights = self.attn(hidden_state, outputs)
        # attn_weights = [batch_size x 1 x lengths]
        context = torch.bmm(attn_weights, outputs.transpose(0, 1)).squeeze(1)
        output = self.fc(context)
        output = torch.index_select(output, 0, torch.arange(0, sorted_idx.shape[0], dtype=torch.int64))
        return output, hidden

In [262]:
embed_size = 50
hidden_size = 256
n_layers = 2
dropout = 0.1
num_classes = 2
attention_mode = 'concat'
model = LSTMModel(vocab_size, embed_size, hidden_size, n_layers, 
                  dropout, num_classes, attention_mode).to(device)

In [264]:
import time
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label, _ = model(text, offsets)
        loss = criterion(predited_label, F.one_hot(label, num_classes=num_class).type(torch.FloatTensor))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_count = 0
    all_preds, all_labels = list(), list()
    
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label, _ = model(text, offsets)
            predicted_label = predicted_label.argmax(1)
            all_preds += [predicted_label.detach().numpy()]
            all_labels += [label.detach().numpy()]
            total_count += label.size(0)
    print(all_preds, all_labels, total_count)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    prf = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    print(prf)
    return (all_preds == all_labels).mean()

In [None]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 1 # epoch
LR = 5  # learning rate
BATCH_SIZE = 32 # batch size for training

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

X, y = train_df.iloc[:1000]['review'], train_df['sentiment'].iloc[:1000].to_numpy()
print(model)
total_acc = []
for idx, (train_index, valid_index) in enumerate(kf.split(X)):
#     print("TRAIN:", train_index, "TEST:", test_index)
    print(f"Cross validation {idx}-fold")
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    train_iter = list(zip(X_train, y_train))
    valid_iter = list(zip(X_valid, y_valid))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE,
                          shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(valid_iter, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    cross_acc = None
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader)
        acc = evaluate(valid_dataloader)
        if cross_acc is not None and cross_acc > acc:
            scheduler.step()
        else:
            cross_acc = acc
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                                acc))
        print('-' * 59)
    total_acc += [cross_acc]
print(total_acc)
print(np.mean(total_acc))

LSTMModel(
  (embedding): Embedding(102971, 50, padding_idx=1, sparse=True)
  (lstm): LSTM(50, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (attn): Attention(
    (attn): Linear(in_features=1024, out_features=512, bias=True)
    (v): Linear(in_features=512, out_features=1, bias=False)
  )
  (fc): Linear(in_features=512, out_features=2, bias=True)
)
Cross validation 0-fold
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])


## Evaluation