# Sentiment Classification

The task is from [aidea](https://aidea-web.tw/topic/c4a666bb-7d83-45a6-8c3b-57514faf2901), the goal is to predict the sentiment of each article.

In [26]:
import torch
import numpy as np
import pandas as pd

## Data Preparation

In [31]:
from collections import Counter
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')

In [27]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [28]:
train_df

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0
3,44656,my friend and i rented this one a few nights a...,0
4,38711,"Just about everything in this movie is wrong, ...",0
...,...,...,...
29336,8019,It 's one of the most honest films ever made a...,1
29337,453,An absorbing and unsettling psychological drama .,1
29338,13097,"Soylent Green IS...a really good movie, actual...",1
29339,26896,There just isn't enough here. There a few funn...,0


In [29]:
train_df.head()

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0
3,44656,my friend and i rented this one a few nights a...,0
4,38711,"Just about everything in this movie is wrong, ...",0


In [36]:
train_df['len'] = 0

In [37]:
lens = list()

for text in train_df['review']:
    lens += [len(tokenizer(text))]

In [38]:
train_df['len'] = lens

In [49]:
train_df[train_df['len'] > 600]['sentiment'].value_counts()

1    957
0    830
Name: sentiment, dtype: int64

In [35]:
c.most_common()

[(16, 203),
 (15, 189),
 (19, 186),
 (18, 184),
 (17, 183),
 (21, 169),
 (153, 162),
 (147, 162),
 (143, 161),
 (152, 160),
 (13, 158),
 (24, 158),
 (144, 157),
 (150, 156),
 (20, 153),
 (22, 153),
 (162, 152),
 (12, 151),
 (14, 150),
 (23, 149),
 (160, 149),
 (148, 148),
 (165, 147),
 (25, 147),
 (139, 146),
 (146, 145),
 (151, 142),
 (142, 141),
 (158, 141),
 (10, 139),
 (149, 139),
 (136, 139),
 (11, 139),
 (154, 138),
 (163, 137),
 (159, 136),
 (145, 136),
 (156, 132),
 (164, 131),
 (27, 130),
 (9, 128),
 (135, 127),
 (161, 127),
 (170, 126),
 (141, 125),
 (140, 123),
 (179, 122),
 (157, 120),
 (138, 120),
 (26, 120),
 (169, 119),
 (28, 118),
 (172, 117),
 (167, 116),
 (155, 116),
 (178, 116),
 (171, 116),
 (194, 114),
 (168, 114),
 (177, 114),
 (174, 113),
 (131, 113),
 (130, 111),
 (134, 111),
 (8, 109),
 (175, 109),
 (186, 108),
 (183, 106),
 (187, 106),
 (133, 104),
 (184, 104),
 (6, 102),
 (137, 101),
 (198, 100),
 (190, 100),
 (176, 100),
 (185, 99),
 (166, 99),
 (132, 98),
 

In [8]:
train_df.describe()

Unnamed: 0,ID,sentiment
count,29341.0,29341.0
mean,29348.411097,0.509662
std,17002.074346,0.499915
min,4.0,0.0
25%,14564.0,0.0
50%,29348.0,1.0
75%,44162.0,1.0
max,58681.0,1.0


In [9]:
train_df['sentiment'].value_counts()

1    14954
0    14387
Name: sentiment, dtype: int64

In [7]:
test_df

Unnamed: 0,ID,review
0,22622,Robert Lansing plays a scientist experimenting...
1,10162,"Well I've enjoy this movie, even though someti..."
2,17468,First things first - though I believe Joel Sch...
3,42579,I watched this movie on the grounds that Amber...
4,701,A certain sexiness underlines even the dullest...
...,...,...
29336,30370,It is difficult to rate a writer/director's fi...
29337,18654,"After watching this movie once, it quickly bec..."
29338,47985,"Even though i sat and watched the whole thing,..."
29339,9866,Warning Spoilers following. Superb recreation ...


In [10]:
sample_df = pd.read_csv('../data/sample_submission.csv')

In [11]:
sample_df

Unnamed: 0,ID,sentiment
0,22622,1
1,10162,1
2,17468,1
3,42579,1
4,701,1
...,...,...
29336,30370,1
29337,18654,1
29338,47985,1
29339,9866,1


### Data Split

In [65]:
from sklearn.model_selection import KFold # import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=123) # Define the split - into 2 folds 
kf.get_n_splits(train_df) # returns the number of splitting iterations in the cross-validator

10

In [41]:
X.iloc[[0,1,2]]

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0


In [45]:
X, y = train_df, train_df['sentiment'].to_numpy()
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [0 1 2 3 5 6 7 8 9] TEST: [4]
9 1
TRAIN: [1 2 3 4 5 6 7 8 9] TEST: [0]
9 1
TRAIN: [0 1 2 3 4 5 6 8 9] TEST: [7]
9 1
TRAIN: [0 1 2 3 4 6 7 8 9] TEST: [5]
9 1
TRAIN: [0 1 2 3 4 5 6 7 9] TEST: [8]
9 1
TRAIN: [0 1 2 4 5 6 7 8 9] TEST: [3]
9 1
TRAIN: [0 2 3 4 5 6 7 8 9] TEST: [1]
9 1
TRAIN: [0 1 2 3 4 5 7 8 9] TEST: [6]
9 1
TRAIN: [0 1 2 3 4 5 6 7 8] TEST: [9]
9 1
TRAIN: [0 1 3 4 5 6 7 8 9] TEST: [2]
9 1


### Preprocessing

In [47]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
counter = Counter()
for line in train_df['review']:
    counter.update(tokenizer(line))
vocab = Vocab(counter, min_freq=1)

In [118]:
Vocab([i[0] for i in counter.most_common(10)])

TypeError: list indices must be integers or slices, not str

In [122]:
Vocab(Counter(dict(counter.most_common(10))))

<torchtext.vocab.Vocab at 0x140640a30>

In [49]:
len(counter)

102969

In [55]:
[vocab[i] for i in ['I', 'am', 'aaaaaaaaaaaaa', '<pad>']]

[0, 241, 0, 1]

In [65]:
def text_pipeline(X):
    if isinstance(X, list):
        return [[vocab[i] for i in tokenizer(text)] for text in X]
    return [vocab[i] for i in tokenizer(X)]

In [68]:
text_pipeline("I am a good boy! ADJISDAKSD unkqwjs <pad> <pad>")

[13, 241, 5, 57, 412, 36, 0, 0, 1, 1]

In [67]:
text_pipeline(["I am a good", "boy!"])

[[13, 241, 5, 57], [412, 36]]

### Data Iteration

In [255]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch, use_bag=False):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    if use_bag:
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
        text_list = torch.cat(text_list)
    else:
        offsets = torch.tensor(offsets[1:], dtype=torch.int64)
        text_list = pad_sequence(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

df = train_df.iloc[:100]
train_iter = list(zip(df['review'], df['sentiment']))
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [154]:
for i in dataloader:
    print(i[0].shape, i[1].shape, i[2].shape)
    print(i)

torch.Size([8]) torch.Size([513, 8]) torch.Size([8])
torch.Size([8]) torch.Size([513, 8]) torch.Size([8])
(tensor([0, 1, 0, 0, 0, 1, 0, 1]), tensor([[  13,   11,  197,  ...,   14,   13, 5130],
        [ 295,  130,   10,  ...,  213,   33, 1483],
        [  14,   29,   29,  ...,    9,    8,    3],
        ...,
        [   0,  272,    0,  ...,    0,    0,    0],
        [   0,  319,    0,  ...,    0,    0,    0],
        [   0,    3,    0,  ...,    0,    0,    0]]), tensor([210, 513,  10, 233, 237, 276,  94, 309]))
torch.Size([8]) torch.Size([537, 8]) torch.Size([8])
torch.Size([8]) torch.Size([537, 8]) torch.Size([8])
(tensor([0, 1, 0, 1, 1, 1, 0, 0]), tensor([[   45,     2,    14,  ...,    51,  4700,    18],
        [    2,  3725,   373,  ...,    26,    12,  3034],
        [   87,    31,   119,  ...,   531,   732,    18],
        ...,
        [    0,     0, 85389,  ...,     0,     0,     0],
        [    0,     0,   373,  ...,     0,     0,     0],
        [    0,     0,     3,  ...,   

## Model Selection

In order to validate the performance of model, the frequently adopted solutions are cross validation and the usage of validation set. Since the size of training samples is small, cross validation would be a more appropriate strategy.

1. RNN-based model
2. Naive-bayes model
3. Bert model

### Baseline -TFiDF

In [51]:
train_df['review']

0        I watched this film because I'm a big fan of R...
1        It does not seem that this movie managed to pl...
2              Enough is not a bad movie , just mediocre .
3        my friend and i rented this one a few nights a...
4        Just about everything in this movie is wrong, ...
                               ...                        
29336    It 's one of the most honest films ever made a...
29337    An absorbing and unsettling psychological drama .
29338    Soylent Green IS...a really good movie, actual...
29339    There just isn't enough here. There a few funn...
29340    This show was absolutely terrible. For one Geo...
Name: review, Length: 29341, dtype: object

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=50000, stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b')
vectorizer.fit(train_df['review'])

TfidfVectorizer(max_features=50000, stop_words='english',
                token_pattern='(?u)\\b[A-Za-z]+\\b')

In [73]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b')
cv.fit(train_df['review'])

CountVectorizer(stop_words='english', token_pattern='(?u)\\b[A-Za-z]+\\b')

In [62]:
len(vectorizer.get_feature_names())

75046

In [63]:
X = train_df

In [68]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

In [76]:
def train(model, vectorizer, X):
    prf = {
        'accuracy': list(),
        'precision': list(),
        'recall': list(),
        'f1-score': list()
    }

    for idx, (train_index, valid_index) in enumerate(kf.split(X)):
        print(f"Cross validation {idx}-fold")
        train, valid = train_df.iloc[train_index], train_df.iloc[valid_index]
        X_train, y_train = train['review'], train['sentiment']
        X_valid, y_valid = valid['review'], valid['sentiment']

        clf = model.fit(vectorizer.transform(X_train), y_train)
        X_valid_transform = vectorizer.transform(X_valid)
        y_preds = clf.predict(X_valid_transform)
        results = precision_recall_fscore_support(y_valid, y_preds, average='binary')
        print(results)
        prf['accuracy'] += [clf.score(X_valid_transform, y_valid)]
        prf['precision'] += [results[0]]
        prf['recall'] += [results[1]]
        prf['f1-score'] += [results[2]]
    return prf

#### Tfidf + Logistic Regression

In [93]:
prf = {
    'accuracy': list(),
    'precision': list(),
    'recall': list(),
    'f1-score': list()
}

for idx, (train_index, valid_index) in enumerate(kf.split(X)):
    print(f"Cross validation {idx}-fold")
    train, valid = train_df.iloc[train_index], train_df.iloc[valid_index]
    X_train, y_train = train['review'], train['sentiment']
    X_valid, y_valid = valid['review'], valid['sentiment']
    
    clf = LogisticRegression().fit(vectorizer.transform(X_train), y_train)
    X_valid_transform = vectorizer.transform(X_valid)
    y_preds = clf.predict(X_valid_transform)
    results = precision_recall_fscore_support(y_valid, y_preds, average='binary')
    print(results)
    prf['accuracy'] += [clf.score(X_valid_transform, y_valid)]
    prf['precision'] += [results[0]]
    prf['recall'] += [results[1]]
    prf['f1-score'] += [results[2]]

Cross validation 0-fold
(0.855390574564235, 0.8910558170813719, 0.8728590250329381, None)
Cross validation 1-fold
(0.8485436893203884, 0.8936605316973415, 0.8705179282868525, None)
Cross validation 2-fold
(0.856957928802589, 0.8951994590939825, 0.8756613756613757, None)
Cross validation 3-fold
(0.8682519280205655, 0.8947019867549669, 0.8812785388127854, None)
Cross validation 4-fold
(0.8561335902376365, 0.8940308517773307, 0.8746719160104987, None)
Cross validation 5-fold
(0.8689384010484927, 0.8822355289421158, 0.8755364806866952, None)
Cross validation 6-fold
(0.8545688545688546, 0.8835662009314704, 0.868825646058227, None)
Cross validation 7-fold
(0.8598191214470284, 0.8993243243243243, 0.8791281373844122, None)
Cross validation 8-fold
(0.86691776522285, 0.9020248203788374, 0.884122919334187, None)
Cross validation 9-fold
(0.8526785714285714, 0.8895542248835662, 0.8707261478345816, None)


In [94]:
for k, v in prf.items():
    print(k, np.mean(v))

accuracy 0.8704543802380362
precision 0.8588200424661212
recall 0.8925353745865309
f1-score 0.8753328115102553


#### Tfidf + RandomForest

In [75]:
from sklearn.ensemble import RandomForestClassifier

In [77]:
model = RandomForestClassifier(n_jobs=-1)
prf = train(model, vectorizer, X)

Cross validation 0-fold
(0.8378016085790885, 0.8406186953597848, 0.8392077878482713, None)
Cross validation 1-fold
(0.8282208588957055, 0.8282208588957055, 0.8282208588957055, None)
Cross validation 2-fold
(0.8199083169613621, 0.8465179175118324, 0.8330006653359947, None)
Cross validation 3-fold
(0.8421750663129973, 0.8410596026490066, 0.8416169648774022, None)
Cross validation 4-fold
(0.8337765957446809, 0.8410462776659959, 0.8373956594323873, None)
Cross validation 5-fold
(0.8387309980171844, 0.844311377245509, 0.8415119363395226, None)
Cross validation 6-fold
(0.8332212508406187, 0.8243512974051896, 0.82876254180602, None)
Cross validation 7-fold
(0.8367617783676178, 0.852027027027027, 0.8443254101104787, None)
Cross validation 8-fold
(0.8389391979301423, 0.8471587197909863, 0.8430289242768931, None)
Cross validation 9-fold
(0.8310502283105022, 0.8476380572188955, 0.839262187088274, None)


In [78]:
for k, v in prf.items():
    print(k, np.mean(v))

accuracy 0.833816060079268
precision 0.8340585899959899
recall 0.8412949830769934
f1-score 0.8376332936010948


#### Tfidf + MLP

In [85]:
from sklearn.neural_network import MLPClassifier

In [None]:
model = MLPClassifier()
prf = train(model, vectorizer, X)

#### Tfidf + NB

In [89]:
from sklearn.naive_bayes import BernoulliNB

In [90]:
model = BernoulliNB()
prf = train(model, vectorizer, X)

Cross validation 0-fold
(0.8171281390856407, 0.8533960995292535, 0.8348684210526316, None)
Cross validation 1-fold
(0.8045977011494253, 0.8588957055214724, 0.830860534124629, None)
Cross validation 2-fold
(0.8166772756206238, 0.8674780256930358, 0.8413114754098361, None)
Cross validation 3-fold
(0.8258675078864354, 0.866887417218543, 0.845880452342488, None)
Cross validation 4-fold
(0.8148148148148148, 0.8558014755197854, 0.8348053647366699, None)
Cross validation 5-fold
(0.8256765261170548, 0.8729208250166334, 0.8486416558861578, None)
Cross validation 6-fold
(0.8224358974358974, 0.8536260811709914, 0.8377407770159975, None)
Cross validation 7-fold
(0.8145569620253165, 0.8695945945945946, 0.8411764705882354, None)
Cross validation 8-fold
(0.8230529595015577, 0.8628347485303723, 0.8424744897959183, None)
Cross validation 9-fold
(0.8158730158730159, 0.854956753160346, 0.8349577647823263, None)


In [91]:
for k, v in prf.items():
    print(k, np.mean(v))

accuracy 0.8318395037212776
precision 0.8180680799509783
recall 0.8616391725955028
f1-score 0.8392717405734891


### Baseline - MLP

In [87]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [88]:
train_iter = list(zip(train_df['review'], train_df['sentiment']))
num_class = len(set([label for (text, label) in train_iter]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [135]:
import time
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, F.one_hot(label, num_classes=num_class).type(torch.FloatTensor))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_count = 0
    all_preds, all_labels = list(), list()
    
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            predicted_label = predicted_label.argmax(1)
            all_preds += [predicted_label.detach().numpy()]
            all_labels += [label.detach().numpy()]
            total_count += label.size(0)
    print(all_preds, all_labels, total_count)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    prf = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    print(prf)
    return (all_preds == all_labels).mean()

In [136]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 1 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

# train_iter = list(zip(train_df.iloc[:1000]['review'], train_df.iloc[:1000]['sentiment']))

X, y = train_df.iloc[:1000]['review'], train_df['sentiment'].iloc[:1000].to_numpy()
# for train_index, test_index in kf.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

# train_dataset = list(train_iter)
# test_dataset = list(test_iter)
# num_train = int(len(train_dataset) * 0.95)
# split_train_, split_valid_ = \
#     random_split(train_dataset, [num_train, len(train_dataset) - num_train])

# train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
#                               shuffle=True, collate_fn=collate_batch)
# valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
#                               shuffle=True, collate_fn=collate_batch)
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
#                              shuffle=True, collate_fn=collate_batch)
print(model)
total_acc = []
for idx, (train_index, valid_index) in enumerate(kf.split(X)):
#     print("TRAIN:", train_index, "TEST:", test_index)
    print(f"Cross validation {idx}-fold")
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    train_iter = list(zip(X_train, y_train))
    valid_iter = list(zip(X_valid, y_valid))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE,
                          shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(valid_iter, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    cross_acc = None
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader)
        acc = evaluate(valid_dataloader)
        if cross_acc is not None and cross_acc > acc:
            scheduler.step()
        else:
            cross_acc = acc
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                                acc))
        print('-' * 59)
    total_acc += [cross_acc]
print(total_acc)
print(np.mean(total_acc))

TextClassificationModel(
  (embedding): EmbeddingBag(102971, 64, mode=mean)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)
Cross validation 0-fold
[array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1]), array([0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0])] [array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1]), array([0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0])] 100
(0.6933333333333334, 1.0, 0.8188976377952756, None)
-----------------------------------------------------------
| end of epoch   1 | time:  0.45s | valid ac

### 1. RNN-based model

In [256]:
import torch
from torch import nn
from torch.nn import functional as F


class Attention(nn.Module):
    # target is hidden_size
    def __init__(self, hidden_size, method='concat'):
        super(Attention, self).__init__()
        self.method = method
        if method not in ('dot', 'general', 'concat'):
            raise NotImplemented
        if method == 'general':
            self.attn = nn.Linear(hidden_size, hidden_size)
        elif method == 'concat':
            self.attn = nn.Linear(2 * hidden_size, hidden_size)
            self.v = nn.Linear(hidden_size, 1, bias=False)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        if hasattr(self, 'attn'):
            self.attn.weight.data.uniform_(-initrange, initrange)
            self.attn.bias.data.zero_()
        if hasattr(self, 'v'):
            self.v.weight.data.uniform_(-initrange, initrange)

    def dot_score(self, hidden, encoder_output):
        return torch.matmul(hidden, encoder_output)

    def general_score(self, hidden, encoder_output):
        attn = self.attn(encoder_output)
        return torch.matmul(hidden, attn)

    def concat_score(self, hidden, encoder_output):
        hidden_reshape = torch.unsqueeze(hidden, dim=0).repeat(encoder_output.size(0), 1, 1)
        attn = self.attn(torch.cat([hidden_reshape, encoder_output], dim=-1)).tanh()
        return self.v(attn).squeeze(dim=-1)

    def forward(self, hidden, encoder_output):
        # output = [lengths x batch_size x hidden_size]
        # hidden = [batch_size x hidden_size]
        attn_scores = None
        if self.method == 'dot':
            attn_scores = self.dot_score(hidden, encoder_output)
        elif self.method == 'general':
            attn_scores = self.general_score(hidden, encoder_output)
        elif self.method == 'concat':
            attn_scores = self.concat_score(hidden, encoder_output)

        # [lengths x batch_size] -> [batch_size x lengths]
        attn_scores = attn_scores.t()
        # return [batch_size x 1 x lengths]
        return F.softmax(attn_scores, dim=-1).unsqueeze(1)


In [257]:
t = torch.tensor([5,3,7,2,1])
sorted_t, idx = t.sort(descending=True)
print(sorted_t)
print(torch.gather(t, 0, torch.arange(0, idx.shape[0], dtype=torch.int64)))

tensor([7, 5, 3, 2, 1])
tensor([5, 3, 7, 2, 1])


In [261]:
from torch import nn
from torch.nn import init


def sort_sequence(inputs, lengths):
    sorted_lengths, sorted_idx = lengths.sort(descending=True)
    return inputs[sorted_idx], sorted_lengths, sorted_idx


class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, n_layers, dropout, num_classes, attention_mode, padding_idx=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, sparse=True, padding_idx=1)
        self.lstm = nn.LSTM(embed_size, hidden_size, n_layers, dropout=dropout, bidirectional=True, batch_first=True)
        self.attn = Attention(2 * hidden_size, attention_mode)
        self.fc = nn.Linear(2 * hidden_size, num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        for param in self.lstm.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.uniform_(param.data, -initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, text_lengths, hidden=None):
        sorted_lengths, sorted_idx = text_lengths.sort(descending=True)
        sorted_text = torch.index_select(text, -1, sorted_idx)
        emb = self.embedding(sorted_text)
        packed = nn.utils.rnn.pack_padded_sequence(emb, sorted_lengths)
        outputs, hidden = self.lstm(packed, hidden)
        hidden_state, _ = hidden
        hidden_state = hidden_state[-2:,:,:].view(1, -1, 2 * hidden_size).squeeze(0)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        attn_weights = self.attn(hidden_state, outputs)
        # attn_weights = [batch_size x 1 x lengths]
        context = torch.bmm(attn_weights, outputs.transpose(0, 1)).squeeze(1)
        output = self.fc(context)
        output = torch.index_select(output, 0, torch.arange(0, sorted_idx.shape[0], dtype=torch.int64))
        return output, hidden

In [262]:
embed_size = 50
hidden_size = 256
n_layers = 2
dropout = 0.1
num_classes = 2
attention_mode = 'concat'
model = LSTMModel(vocab_size, embed_size, hidden_size, n_layers, 
                  dropout, num_classes, attention_mode).to(device)

In [264]:
import time
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label, _ = model(text, offsets)
        loss = criterion(predited_label, F.one_hot(label, num_classes=num_class).type(torch.FloatTensor))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_count = 0
    all_preds, all_labels = list(), list()
    
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label, _ = model(text, offsets)
            predicted_label = predicted_label.argmax(1)
            all_preds += [predicted_label.detach().numpy()]
            all_labels += [label.detach().numpy()]
            total_count += label.size(0)
    print(all_preds, all_labels, total_count)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    prf = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    print(prf)
    return (all_preds == all_labels).mean()

In [None]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 1 # epoch
LR = 5  # learning rate
BATCH_SIZE = 32 # batch size for training

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

X, y = train_df.iloc[:1000]['review'], train_df['sentiment'].iloc[:1000].to_numpy()
print(model)
total_acc = []
for idx, (train_index, valid_index) in enumerate(kf.split(X)):
#     print("TRAIN:", train_index, "TEST:", test_index)
    print(f"Cross validation {idx}-fold")
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    train_iter = list(zip(X_train, y_train))
    valid_iter = list(zip(X_valid, y_valid))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE,
                          shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(valid_iter, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    cross_acc = None
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader)
        acc = evaluate(valid_dataloader)
        if cross_acc is not None and cross_acc > acc:
            scheduler.step()
        else:
            cross_acc = acc
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                                acc))
        print('-' * 59)
    total_acc += [cross_acc]
print(total_acc)
print(np.mean(total_acc))

LSTMModel(
  (embedding): Embedding(102971, 50, padding_idx=1, sparse=True)
  (lstm): LSTM(50, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (attn): Attention(
    (attn): Linear(in_features=1024, out_features=512, bias=True)
    (v): Linear(in_features=512, out_features=1, bias=False)
  )
  (fc): Linear(in_features=512, out_features=2, bias=True)
)
Cross validation 0-fold
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([4, 2])
torch.Size([32, 2])

### Bert-based Model

In [1]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [4]:
text = "I am a good person, and how about you?"
print(' Original: ', text)
print('Tokenized: ', tokenizer.tokenize(text))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(["[CLS]"] + tokenizer.tokenize(text)+ ["SEP"])

 Original:  I am a good person, and how about you?
Tokenized:  ['i', 'am', 'a', 'good', 'person', ',', 'and', 'how', 'about', 'you', '?']
Token IDs:  [101, 1045, 2572, 1037, 2204, 2711, 1010, 1998, 2129, 2055, 2017, 1029, 100]


In [5]:
input_ids = tokenizer.encode(text, add_special_tokens=True)

In [6]:
input_ids

[101, 1045, 2572, 1037, 2204, 2711, 1010, 1998, 2129, 2055, 2017, 1029, 102]

In [7]:
tokenizer.encode_plus(
                text, add_special_tokens=True, max_length=64,
                pad_to_max_length=True, return_attention_mask=True,
                return_tensors='pt'
            )

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([[ 101, 1045, 2572, 1037, 2204, 2711, 1010, 1998, 2129, 2055, 2017, 1029,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [9]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### XLNet

In [1]:
import torch

In [2]:
from transformers import XLNetTokenizer
from transformers import XLNetModel, XLNetForSequenceClassification

In [4]:
model = XLNetModel.from_pretrained('xlnet-base-cased', output_hidden_states=True)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
model

XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, b

In [6]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [14]:
text = 'I am a good boy'

In [15]:
tokenizer.tokenize(text)

['▁', 'i', '▁am', '▁a', '▁good', '▁boy']

In [22]:
encoded_dict = tokenizer.encode_plus(
    "<cls> asd11l2js I am a good boy", add_special_tokens=True, max_length=50,
    pad_to_max_length=True, return_attention_mask=True,
    return_tensors='pt'
)

In [23]:
encoded_dict

{'input_ids': tensor([[   5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
            5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
            5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    3,   34,
           66, 1545,  368,  184, 1315,   23,   17,  150,  569,   24,  195, 2001,
            4,    3]]), 'token_type_ids': tensor([[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 2]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}

In [24]:
text_list, attention_masks = list(), list()
text_list.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])
text_list = torch.cat(text_list, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

In [10]:
outputs = model(text_list, attention_masks)

In [11]:
outputs

XLNetModelOutput(last_hidden_state=tensor([[[ 1.4764,  2.5515, -0.8054,  ..., -0.7408,  0.5854, -2.2659],
         [ 1.6141,  2.4739, -0.7761,  ..., -0.7215,  0.4234, -2.3880],
         [ 1.5958,  2.4999, -0.7793,  ..., -0.6517,  0.2706, -2.4049],
         ...,
         [ 3.8463,  0.2613, -1.5299,  ..., -2.6709, -0.2188,  0.2802],
         [ 5.7440, -0.2734, -3.4873,  ..., -3.3266,  0.4171,  0.4994],
         [ 3.7563, -0.2951, -2.6874,  ..., -1.8446,  0.5208,  0.9403]]],
       grad_fn=<PermuteBackward>), mems=(tensor([[[ 0.0344,  0.0202,  0.0261,  ..., -0.0175, -0.0343,  0.0252]],

        [[ 0.0344,  0.0202,  0.0261,  ..., -0.0175, -0.0343,  0.0252]],

        [[ 0.0344,  0.0202,  0.0261,  ..., -0.0175, -0.0343,  0.0252]],

        ...,

        [[ 0.0540,  0.0375,  0.0332,  ...,  0.0201, -0.0480,  0.0724]],

        [[ 0.0788, -0.0583, -0.0905,  ...,  0.0493,  0.0634, -0.0520]],

        [[ 0.0181, -0.0015, -0.1494,  ...,  0.0012, -0.0009,  0.0188]]]), tensor([[[ 0.9156,  1.0634, -

In [13]:
outputs.hidden_states

(tensor([[[ 0.0344,  0.0202,  0.0261,  ..., -0.0175, -0.0343,  0.0252],
          [ 0.0344,  0.0202,  0.0261,  ..., -0.0175, -0.0343,  0.0252],
          [ 0.0344,  0.0202,  0.0261,  ..., -0.0175, -0.0343,  0.0252],
          ...,
          [ 0.0540,  0.0375,  0.0332,  ...,  0.0201, -0.0480,  0.0724],
          [ 0.0788, -0.0583, -0.0905,  ...,  0.0493,  0.0634, -0.0520],
          [ 0.0181, -0.0015, -0.1494,  ...,  0.0012, -0.0009,  0.0188]]],
        grad_fn=<PermuteBackward>),
 tensor([[[ 0.9156,  1.0634, -0.2741,  ..., -0.1662, -0.1363, -0.8428],
          [ 0.9461,  1.0958, -0.2464,  ..., -0.1756, -0.1703, -0.8440],
          [ 0.9758,  1.1076, -0.2352,  ..., -0.1836, -0.1847, -0.8192],
          ...,
          [ 0.8328,  0.5979,  0.6304,  ..., -0.4727, -1.2793,  1.6664],
          [ 1.0250, -0.4521, -1.5119,  ..., -0.0787,  0.4344,  0.7233],
          [ 0.1938,  0.1017, -2.3535,  ..., -0.5091, -0.1052,  1.6627]]],
        grad_fn=<PermuteBackward>),
 tensor([[[ 0.3933,  0.3043, -

In [None]:
outputs

## Evaluation