# Sentiment Classification

The task is from [aidea](https://aidea-web.tw/topic/c4a666bb-7d83-45a6-8c3b-57514faf2901), the goal is to predict the sentiment of each article.

In [12]:
import torch
import numpy as np
import pandas as pd

## Data Preparation

In [18]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [5]:
train_df

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0
3,44656,my friend and i rented this one a few nights a...,0
4,38711,"Just about everything in this movie is wrong, ...",0
...,...,...,...
29336,8019,It 's one of the most honest films ever made a...,1
29337,453,An absorbing and unsettling psychological drama .,1
29338,13097,"Soylent Green IS...a really good movie, actual...",1
29339,26896,There just isn't enough here. There a few funn...,0


In [8]:
train_df.describe()

Unnamed: 0,ID,sentiment
count,29341.0,29341.0
mean,29348.411097,0.509662
std,17002.074346,0.499915
min,4.0,0.0
25%,14564.0,0.0
50%,29348.0,1.0
75%,44162.0,1.0
max,58681.0,1.0


In [9]:
train_df['sentiment'].value_counts()

1    14954
0    14387
Name: sentiment, dtype: int64

In [7]:
test_df

Unnamed: 0,ID,review
0,22622,Robert Lansing plays a scientist experimenting...
1,10162,"Well I've enjoy this movie, even though someti..."
2,17468,First things first - though I believe Joel Sch...
3,42579,I watched this movie on the grounds that Amber...
4,701,A certain sexiness underlines even the dullest...
...,...,...
29336,30370,It is difficult to rate a writer/director's fi...
29337,18654,"After watching this movie once, it quickly bec..."
29338,47985,"Even though i sat and watched the whole thing,..."
29339,9866,Warning Spoilers following. Superb recreation ...


In [10]:
sample_df = pd.read_csv('../data/sample_submission.csv')

In [11]:
sample_df

Unnamed: 0,ID,sentiment
0,22622,1
1,10162,1
2,17468,1
3,42579,1
4,701,1
...,...,...
29336,30370,1
29337,18654,1
29338,47985,1
29339,9866,1


### Data Split

In [43]:
from sklearn.model_selection import KFold # import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=123) # Define the split - into 2 folds 
kf.get_n_splits(train_df) # returns the number of splitting iterations in the cross-validator

10

In [41]:
X.iloc[[0,1,2]]

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0


In [45]:
X, y = train_df, train_df['sentiment'].to_numpy()
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [0 1 2 3 5 6 7 8 9] TEST: [4]
9 1
TRAIN: [1 2 3 4 5 6 7 8 9] TEST: [0]
9 1
TRAIN: [0 1 2 3 4 5 6 8 9] TEST: [7]
9 1
TRAIN: [0 1 2 3 4 6 7 8 9] TEST: [5]
9 1
TRAIN: [0 1 2 3 4 5 6 7 9] TEST: [8]
9 1
TRAIN: [0 1 2 4 5 6 7 8 9] TEST: [3]
9 1
TRAIN: [0 2 3 4 5 6 7 8 9] TEST: [1]
9 1
TRAIN: [0 1 2 3 4 5 7 8 9] TEST: [6]
9 1
TRAIN: [0 1 2 3 4 5 6 7 8] TEST: [9]
9 1
TRAIN: [0 1 3 4 5 6 7 8 9] TEST: [2]
9 1


### Preprocessing

In [47]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
counter = Counter()
for line in train_df['review']:
    counter.update(tokenizer(line))
vocab = Vocab(counter, min_freq=1)

In [49]:
len(counter)

102969

In [55]:
[vocab[i] for i in ['I', 'am', 'aaaaaaaaaaaaa', '<pad>']]

[0, 241, 0, 1]

In [65]:
def text_pipeline(X):
    if isinstance(X, list):
        return [[vocab[i] for i in tokenizer(text)] for text in X]
    return [vocab[i] for i in tokenizer(X)]

In [68]:
text_pipeline("I am a good boy! ADJISDAKSD unkqwjs <pad> <pad>")

[13, 241, 5, 57, 412, 36, 0, 0, 1, 1]

In [67]:
text_pipeline(["I am a good", "boy!"])

[[13, 241, 5, 57], [412, 36]]

### Data Iteration

In [85]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

df = train_df.iloc[:100]
train_iter = list(zip(df['review'], df['sentiment']))
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [86]:
for i in dataloader:
    print(i[0].shape, i[1].shape, i[2].shape)
    print(i)

torch.Size([8]) torch.Size([1882]) torch.Size([8])
(tensor([0, 1, 0, 0, 0, 1, 0, 1]), tensor([  13,  295,   14,  ..., 2629, 3377,    3]), tensor([   0,  210,  723,  733,  966, 1203, 1479, 1573]))
torch.Size([8]) torch.Size([1716]) torch.Size([8])
(tensor([0, 1, 0, 1, 1, 1, 0, 0]), tensor([  45,    2,   87,  ..., 1357, 1252,    3]), tensor([   0,  154,  307,  844,  857, 1055, 1240, 1702]))
torch.Size([8]) torch.Size([1589]) torch.Size([8])
(tensor([1, 1, 0, 0, 0, 0, 1, 1]), tensor([   95,    90,    31,  ..., 15920, 11537,     3]), tensor([   0,  285,  442,  836, 1132, 1370, 1560, 1577]))
torch.Size([8]) torch.Size([1216]) torch.Size([8])
(tensor([0, 1, 0, 0, 1, 0, 0, 1]), tensor([   14,    23,    10,  ...,  2445,    82, 13622]), tensor([   0,  149,  329,  544,  559,  776,  997, 1093]))
torch.Size([8]) torch.Size([2802]) torch.Size([8])
(tensor([0, 1, 1, 1, 0, 1, 1, 0]), tensor([2020, 3173,    4,  ...,    8, 3660,   36]), tensor([   0,  614,  906, 1201, 1258, 2379, 2391, 2663]))
torch.Si

## Model Selection

In order to validate the performance of model, the frequently adopted solutions are cross validation and the usage of validation set. Since the size of training samples is small, cross validation would be a more appropriate strategy.

1. RNN-based model
2. Naive-bayes model
3. Bert model

### Baseline - MLP

In [87]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [88]:
train_iter = list(zip(train_df['review'], train_df['sentiment']))
num_class = len(set([label for (text, label) in train_iter]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [89]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [96]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 3 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

# train_iter = list(zip(train_df.iloc[:1000]['review'], train_df.iloc[:1000]['sentiment']))

X, y = train_df.iloc[:1000]['review'], train_df['sentiment'].iloc[:1000].to_numpy()
# for train_index, test_index in kf.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

# train_dataset = list(train_iter)
# test_dataset = list(test_iter)
# num_train = int(len(train_dataset) * 0.95)
# split_train_, split_valid_ = \
#     random_split(train_dataset, [num_train, len(train_dataset) - num_train])

# train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
#                               shuffle=True, collate_fn=collate_batch)
# valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
#                               shuffle=True, collate_fn=collate_batch)
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
#                              shuffle=True, collate_fn=collate_batch)

total_acc = []
for idx, (train_index, valid_index) in enumerate(kf.split(X)):
#     print("TRAIN:", train_index, "TEST:", test_index)
    print(f"Cross validation {idx}-fold")
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    train_iter = list(zip(X_train, y_train))
    valid_iter = list(zip(X_valid, y_valid))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE,
                          shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(valid_iter, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    cross_acc = None
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader)
        acc = evaluate(valid_dataloader)
        if cross_acc is not None and cross_acc > acc:
            scheduler.step()
        else:
            cross_acc = acc
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                                acc))
        print('-' * 59)
    total_acc += [cross_acc]
print(total_acc)
print(np.mean(total_acc))

Cross validation 0-fold
-----------------------------------------------------------
| end of epoch   1 | time:  0.42s | valid accuracy    0.910 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  0.41s | valid accuracy    0.770 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  0.42s | valid accuracy    0.920 
-----------------------------------------------------------
Cross validation 1-fold
-----------------------------------------------------------
| end of epoch   1 | time:  0.43s | valid accuracy    0.980 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  0.42s | valid accuracy    0.990 
-----------------------------------------------------------
-----------------------------------------------

### 1. RNN-based model

In [None]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

## Evaluation