In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
from torchtext.vocab import Vectors
import random
from tqdm import tqdm
import time
import string
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec, KeyedVectors
from google.colab import files

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# Seed every random number generator the notebook uses with the same value
# so that repeated runs start from the same state.
SEED = 42

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [0]:
# Lowercase ASCII letters: together with space/newline/tab, the only characters
# the text clean-up below keeps.
alphabet = string.ascii_lowercase

In [4]:
# Colab upload dialog, used here to provide kaggle.json (the Kaggle API credentials).
# NOTE(review): the returned dict echoes the uploaded file contents, so the API key
# ends up in the saved cell output — suppress or clear the output before sharing.
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"happypuffin7","key":"<redacted>"}'}

In [0]:
# Install the Kaggle command-line client (quiet mode); it is used below to fetch the dataset.
!pip install -q kaggle

In [0]:
# Copy the uploaded credentials file into ~/.kaggle (created if missing).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [0]:
# Restrict the credentials file to owner read/write only.
!chmod 600 /root/.kaggle/kaggle.json

In [8]:
# Download the IMDB review dataset archive from Kaggle into the working directory.
!kaggle datasets download -d utathya/imdb-review-dataset

Downloading imdb-review-dataset.zip to /content
 65% 33.0M/50.5M [00:00<00:00, 42.1MB/s]
100% 50.5M/50.5M [00:00<00:00, 65.1MB/s]


In [9]:
# Extract the archive; it contains a single file, imdb_master.csv.
!unzip imdb-review-dataset.zip

Archive:  imdb-review-dataset.zip
  inflating: imdb_master.csv         


In [0]:
# Load all reviews (train, test and unlabelled) into one frame; the file is read with the 'latin' encoding.
df = pd.read_csv('imdb_master.csv', encoding='latin')

In [11]:
# Preview the first rows: split type, review text, label and source file name.
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [0]:
# Every character that occurs in the lowercased non-test reviews and is neither a
# lowercase ASCII letter nor a space/newline/tab is scheduled for removal.
non_test_reviews = df[df['type'] != 'test'].review
observed_chars = set(' '.join(list(non_test_reviews)).lower())
kept_chars = set(alphabet) | {' ', '\n', '\t'}
symbols_to_be_removed = observed_chars - kept_chars

In [13]:
# Show the characters that will be stripped, joined into one string.
''.join(symbols_to_be_removed)

'¿á\x181³7[]&å¾$©ð*®\\°\x08%-ë?¡µ5¥/«²2{§6>æ)±\'¬;¢8ªº¯·è0|~ï!"¦+´\xad¤¶@9ã}¼â\x10¨_½»#(`:¸£ä,=3\xa0.^<4¹'

In [0]:
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one review: lowercase it, strip unwanted characters, lemmatize each word.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned review: lemmatized words joined by single spaces.
    """
    text = text.lower()
    # Delete all unwanted characters in one pass instead of running a separate
    # str.replace scan over the whole text for every symbol. The table is built
    # per call so the function always sees the current symbols_to_be_removed.
    text = text.translate(str.maketrans('', '', ''.join(symbols_to_be_removed)))
    # NOTE(review): lemmatize() is called without a part-of-speech argument; the
    # df_train preview shows "was"/"has" turning into "wa"/"ha" — confirm this
    # is acceptable.
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

In [0]:
# Normalise the text to improve quality: this raised accuracy from 87.75% to 88.84%.
df['preprocessed_review'] = df['review'].apply(preprocess_text)

In [16]:
# Which dataset splits are present?
df['type'].unique()

array(['test', 'train'], dtype=object)

In [17]:
# Labels present in the train split and in the test split, respectively.
df[df['type'] == 'train']['label'].unique(), df[df['type'] == 'test']['label'].unique()

(array(['neg', 'pos', 'unsup'], dtype=object),
 array(['neg', 'pos'], dtype=object))

In [0]:
# Unlabelled reviews (label == 'unsup'), restricted to the cleaned text and the label.
unsup_mask = df['label'] == 'unsup'
df_unsup = df.loc[unsup_mask, ['preprocessed_review', 'label']]

In [0]:
# Split the frame on its 'type' column, keeping only the cleaned text and the label.
model_columns = ['preprocessed_review', 'label']
df_train = df.loc[df['type'] == 'train', model_columns]
df_test = df.loc[df['type'] == 'test', model_columns]

In [0]:
# Keep only the labelled (neg/pos) training reviews. The mask is built from df_train
# itself: a mask taken from the full `df` has a different index, which forces pandas
# to realign it ("Boolean Series key will be reindexed" warning).
df_train = df_train[df_train['label'].isin(['neg', 'pos'])]

In [0]:
# All rows of the train split, labelled and unlabelled alike.
is_train = df['type'] == 'train'
df_train_and_unsup = df.loc[is_train, ['preprocessed_review', 'label']]

In [22]:
# Sanity check of the split sizes.
df_train.shape, df_test.shape, df_unsup.shape, df_train_and_unsup.shape

((25000, 2), (25000, 2), (50000, 2), (75000, 2))

In [23]:
# Preview the labelled training set after preprocessing.
df_train.head()

Unnamed: 0,preprocessed_review,label
25000,story of a man who ha unnatural feeling for a ...,neg
25001,airport start a a brand new luxury plane is lo...,neg
25002,this film lacked something i couldnt put my fi...,neg
25003,sorry everyone i know this is supposed to be a...,neg
25004,when i wa little my parent took me along to th...,neg


CNN

In [0]:
# The two sentiment labels of the supervised task.
classes = ('neg', 'pos')

In [0]:
class DatasetFromDataFrame(data.Dataset):
    """A torchtext Dataset built from one pandas DataFrame out of a collection.

    Args:
        path: Key selecting the DataFrame inside ``dfs``.
        text_field: torchtext field applied to the text column.
        label_field: torchtext field applied to the label column.
        col: Name of the DataFrame column holding the text.
        gt: Name of the DataFrame column holding the label.
        dfs: Mapping from key to DataFrame.
        **kwargs: Forwarded to ``data.Dataset.__init__``.
    """

    def __init__(self, path, text_field, label_field, col, gt, dfs, **kwargs):
        fields = [("text", text_field), ("label", label_field)]
        frame = dfs[path]
        # Walk the two columns in lockstep instead of doing two positional
        # ``.iloc`` look-ups per row; the row count comes from ``len(frame)``
        # rather than from slicing out the second column of ``.values``.
        examples = [
            data.Example.fromlist([text, label], fields)
            for text, label in tqdm(
                zip(frame[col], frame[gt]), total=len(frame), desc="Example:"
            )
        ]
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort key for an example: the length of its text."""
        return len(ex.text)

    # Disabled helper, kept for reference:
    # @classmethod
    # def splits(cls, text_field, label_field, col, label, train, validation=None, test=None, **kwargs):
    #     dfs = {'train': train}
    #     if validation is not None:
    #         dfs['validation'] = validation
    #         has_validation = 'validation'
    #     else:
    #         has_validation = None
    #     if test is not None:
    #         dfs['test'] = test
    #         has_test = 'test'
    #     else:
    #         has_test = None
    #     return super().splits('',
    #         text_field=text_field, label_field=label_field, col=col, gt=label,
    #                           train='train', validation=has_validation, test=has_test,  dfs=dfs, **kwargs)

In [0]:
# Text field: spaCy tokenisation, lowercasing, batch-first tensors.
TEXT = data.Field(tokenize='spacy', batch_first=True, lower=True)
# Label field with float targets (the loss used later is BCEWithLogitsLoss).
LABEL = data.LabelField(dtype=torch.float)

In [27]:
# Build one torchtext dataset per frame; the integer keys of `dfs` are what
# DatasetFromDataFrame receives as its `path` argument.
dfs = dict(enumerate((df_train, df_test, df_train_and_unsup)))
train_dataset, test_dataset, train_and_unsup_dataset = (
    DatasetFromDataFrame(key, TEXT, LABEL, 'preprocessed_review', 'label', dfs)
    for key in (0, 1, 2)
)

Example:: 100%|██████████| 25000/25000 [00:34<00:00, 734.87it/s]
Example:: 100%|██████████| 25000/25000 [00:33<00:00, 746.50it/s]
Example:: 100%|██████████| 75000/75000 [01:43<00:00, 722.04it/s]


In [0]:
# I tried a validation split: within 5 epochs the model does not start to overfit yet, so to keep
# as much data as possible for training I decided to go without a validation set. In general the
# data can be split like this (and the commented-out code above re-enabled):
# train_dataset, valid_dataset = train_dataset.split(random_state=random.getstate(), split_ratio=0.9)
# NOTE: the earlier version of this example passed random_state=random.seed(SEED); random.seed()
# returns None, so no random state was actually supplied.

In [28]:
# As the sizes show, the training set is small, which is why no validation set is held out.
len(train_dataset), len(test_dataset)

(25000, 25000)

Я пробовал использовать "glove.6B.100d" эмбеддинги, но максимальное качество accuracy только 88.84%, поэтому самостоятельно предобучаем word2vec эмбеддинги и вставляем их в build_vocab от torchtext-а.

Именно здесь я при обучении использую unsupervised data ([2 points] for using unsupervised data). По поводу использования unsupervised data были также следующие идеи: поскольку наша модель достигает на тесте > 85%, то можно предиктить с помощью нашей модели на unsupervised data и отбирать случаи, когда модель особенно уверена в своих предсказаниях, и добавлять их в train_dataset, но поскольку качества 90% уже удалось достичь просто с помощью предобученных эмбеддингов, то я не стал это реализовывать.

Эмбеддинги я обучаю на всей train-выборке (т.е. не только на unsupervised data, но и на supervised data в train-е).

In [0]:
# Whitespace-tokenised reviews of the whole train split (labelled + unlabelled).
sentences = [review.split() for review in df_train_and_unsup['preprocessed_review'].tolist()]

In [0]:
## constants

EMBEDDING_DIM = 100
# This constant follows from the default min_count and the other default parameters
# of gensim's Word2Vec class.
# NOTE(review): hard-coded; presumably the size of the trained Word2Vec vocabulary — verify after retraining.
MAX_VOCAB = 49347
BATCH_SIZE = 128

# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
# Train Word2Vec embeddings of size EMBEDDING_DIM on `sentences` and export them
# in word2vec format so they can be loaded from disk afterwards.
w2v_model = Word2Vec(sentences, size=EMBEDDING_DIM)
# NOTE(review): `weights` is not used anywhere else in the visible notebook.
weights = torch.FloatTensor(w2v_model.wv.vectors)
w2v_model.wv.save_word2vec_format('w2v_embeddings_100')

In [0]:
# Make tqdm behave properly: release every progress-bar instance it still tracks.
# NOTE(review): relies on tqdm's private `_instances` / `_decr_instances`.
for instance in list(tqdm._instances):
    tqdm._decr_instances(instance)

In [33]:
# Load the exported Word2Vec vectors from the current directory.
vectors = Vectors(name='w2v_embeddings_100', cache='./')

# Text vocabulary over the labelled + unlabelled training data, capped at MAX_VOCAB
# tokens; unk_init (normal_) initialises tokens that have no pretrained vector.
TEXT.build_vocab(
    train_and_unsup_dataset,
    vectors=vectors,  # "glove.6B.100d"
    max_size=MAX_VOCAB,
    unk_init=torch.Tensor.normal_
)

# The label vocabulary is built from the labelled training set only.
LABEL.build_vocab(train_dataset)

  0%|          | 0/49349 [00:00<?, ?it/s]Skipping token b'49349' with 1-dimensional vector [b'100']; likely a header
 99%|█████████▉| 48907/49349 [00:01<00:00, 29023.13it/s]


In [0]:
# One bucketed batch iterator per dataset, with batches placed on `device`.
# NOTE(review): train_and_unsup_iterator is not consumed anywhere in the visible notebook.
train_iterator, train_and_unsup_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, train_and_unsup_dataset, test_dataset), batch_size=BATCH_SIZE, device=device
)

In [0]:
class NeuralNet(nn.Module):
    """Convolutional text classifier producing one logit per input text.

    Token embeddings are convolved with one filter bank per filter size, each
    feature map is max-pooled over the sequence, and the pooled features are
    concatenated and fed to a single linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        pad_idx: Index of the padding token.
        num_filters: Number of output channels of every convolution.
        filter_sizes: Filter heights (in tokens); one convolution per entry.
        embedding_dim: Size of a token embedding. When omitted, the
            notebook-level ``EMBEDDING_DIM`` is read at construction time.
    """

    def __init__(self, vocab_size, pad_idx, num_filters, filter_sizes, embedding_dim=None):
        super().__init__()

        if embedding_dim is None:
            embedding_dim = EMBEDDING_DIM
        filter_sizes = list(filter_sizes)

        self.pad_idx = pad_idx
        # Shortest sequence the widest filter can still slide over.
        self.min_length = max(filter_sizes, default=0)

        # embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # one convolution per filter size; each kernel spans the full embedding width
        self.convolutions = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=1,  # because we have texts, not images
                    out_channels=num_filters,
                    kernel_size=(filter_size, embedding_dim)
                )
                for filter_size in filter_sizes
            ]
        )

        # fully connected layer
        self.linear = nn.Linear(len(filter_sizes) * num_filters, 1)

    def forward(self, text):
        """Compute logits for a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices, indexed as (batch, sequence).

        Returns:
            Tensor of shape (batch, 1) with one raw logit per sequence.
        """
        # A batch shorter than the widest filter would make that convolution
        # fail, so right-pad it with the padding index first.
        missing = self.min_length - text.shape[1]
        if missing > 0:
            text = F.pad(text, (0, missing), value=self.pad_idx)

        # get embeddings and add a dummy channel (second) dimension
        embeddings = self.embedding(text).unsqueeze(1)

        # apply convolutions (relu after each), dropping the collapsed width dimension
        convolved = [F.relu(convolution(embeddings)).squeeze(3) for convolution in self.convolutions]

        # max pooling over the whole sequence dimension
        pooled = [F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2) for feature_map in convolved]

        # concatenate the pooled features and apply the fully connected layer
        return self.linear(torch.cat(pooled, dim=1))

In [0]:
# Model hyper-parameters for the first trial.
VOCAB_SIZE = len(TEXT.vocab)
NUM_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
# Index of the padding token in the text vocabulary.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model_100 = NeuralNet(VOCAB_SIZE, PAD_IDX, NUM_FILTERS, FILTER_SIZES)

In [0]:
# Initialise the embedding layer with the vectors attached to the text vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model_100.embedding.weight.data.copy_(pretrained_embeddings);

In [0]:
# Overwrite the embeddings of the unknown and padding tokens with zeros.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model_100.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model_100.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [39]:
# Report the vocabulary size, the shape of its vector table and the number of labels.
print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
print("Label Length: " + str(len(LABEL.vocab)))

Length of Text Vocabulary: 49349
Vector size of Text Vocabulary:  torch.Size([49349, 100])
Label Length: 2


In [0]:
# Loss on raw logits, and an Adam optimizer (default settings) over model_100's parameters.
# train_and_evaluate_model reads both of these notebook-level names.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_100.parameters())

In [0]:
def accuracy(y_preds, y_true):
    """Fraction of predictions that agree with the targets.

    Args:
        y_preds: Raw model outputs (logits); they are passed through a sigmoid
            and rounded to obtain the predicted values.
        y_true: Targets, compared element-wise with the rounded predictions.

    Returns:
        A scalar tensor: the number of matches divided by ``len`` of the input.
    """
    hard_labels = torch.sigmoid(y_preds).round()
    hits = torch.eq(hard_labels, y_true).float()
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return its mean loss and accuracy.

    Args:
        model: Network to train; called on ``batch.text``.
        iterator: Iterable of batches exposing ``text`` and ``label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to (logits, labels).
            Assumes it returns the mean over the batch — TODO confirm if a
            different reduction is ever used.

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats, averaged over
        examples rather than over batches, so a smaller final batch does not
        get the same weight as a full one.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no batches.
    """
    total_loss = 0.0
    total_acc = 0.0
    total_examples = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        y_preds = model(batch.text).squeeze(1)
        loss = criterion(y_preds, batch.label)

        # Weight the per-batch means by the batch size.
        batch_size = len(batch.label)
        total_loss += loss.item() * batch_size
        total_acc += accuracy(y_preds, batch.label).item() * batch_size
        total_examples += batch_size

        loss.backward()
        optimizer.step()

    return total_loss / total_examples, total_acc / total_examples

In [0]:
def evaluate(model, iterator, criterion):
    """Evaluate the model without gradient tracking; return mean loss and accuracy.

    Args:
        model: Network to evaluate; called on ``batch.text``.
        iterator: Iterable of batches exposing ``text`` and ``label``.
        criterion: Loss applied to (logits, labels).
            Assumes it returns the mean over the batch — TODO confirm if a
            different reduction is ever used.

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats, averaged over
        examples rather than over batches, so the reported accuracy is the
        true fraction of correctly classified examples even when the last
        batch is smaller.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no batches.
    """
    total_loss = 0.0
    total_acc = 0.0
    total_examples = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            y_preds = model(batch.text).squeeze(1)
            loss = criterion(y_preds, batch.label)

            # Weight the per-batch means by the batch size.
            batch_size = len(batch.label)
            total_loss += loss.item() * batch_size
            total_acc += accuracy(y_preds, batch.label).item() * batch_size
            total_examples += batch_size

    return total_loss / total_examples, total_acc / total_examples

In [0]:
def train_and_evaluate_model(model, num_epochs=5, model_optimizer=None, loss_fn=None):
    """Train ``model`` on ``train_iterator`` and report metrics on ``test_iterator``.

    Prints the training loss/accuracy after every epoch and the test
    loss/accuracy once at the end; returns nothing.

    Args:
        model: Network to train; it is moved to the notebook-level ``device``.
        num_epochs: Number of passes over ``train_iterator``.
        model_optimizer: Optimizer to step. When omitted, the notebook-level
            ``optimizer`` is used, which must have been created from this
            model's parameters.
        loss_fn: Loss to optimise. When omitted, the notebook-level
            ``criterion`` is used.
    """
    if model_optimizer is None:
        model_optimizer = optimizer
    if loss_fn is None:
        loss_fn = criterion

    model = model.to(device)

    # train model
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_iterator, model_optimizer, loss_fn)
        print(f'Epoch: {epoch + 1}')
        print(f'train loss: {train_loss:.3f}, train acc: {train_acc*100:.2f}%')

    # evaluate model
    test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
    print(f'test loss: {test_loss:.3f}, test acc: {test_acc*100:.2f}%')

In [45]:
# Train the 100-dimensional model for the default 5 epochs and print its test metrics.
train_and_evaluate_model(model_100)

Epoch: 1
train loss: 0.425, train acc: 80.05%
Epoch: 2
train loss: 0.265, train acc: 89.08%
Epoch: 3
train loss: 0.203, train acc: 92.31%
Epoch: 4
train loss: 0.155, train acc: 94.53%
Epoch: 5
train loss: 0.116, train acc: 96.35%
test loss: 0.248, test acc: 90.10%


Как видно, модель не сильно переобучается, хотя на большем количестве данных и на большем количестве эпох качество, очевидно, было бы лучше. Но лучше be on the safe side и не переобучать модель.

In [0]:
# You have to provide data for trials with different hyperparameter values.
# Strictly speaking we have already reached 90% accuracy, but since a hyperparameter search
# is required, let's try a larger embedding size (300 instead of 100), more filters
# and more filter sizes:
# NOTE: this reassigns notebook-level constants; NeuralNet reads EMBEDDING_DIM when it is constructed.
NUM_FILTERS = 300
FILTER_SIZES = [2, 3, 4, 5, 6]
EMBEDDING_DIM = 300

In [0]:
# Retrain Word2Vec with the new EMBEDDING_DIM on the same `sentences` and export it under a new file name.
w2v_model = Word2Vec(sentences, size=EMBEDDING_DIM)
# NOTE(review): `weights` is not used anywhere else in the visible notebook.
weights = torch.FloatTensor(w2v_model.wv.vectors)
w2v_model.wv.save_word2vec_format('w2v_embeddings_300')

In [48]:
# Load the newly exported vectors and rebuild the text vocabulary with them
# (same dataset and size cap as for the first trial).
vectors = Vectors(name='w2v_embeddings_300', cache='./')

TEXT.build_vocab(
    train_and_unsup_dataset,
    vectors=vectors,  # "glove.6B.100d"
    max_size=MAX_VOCAB,
    unk_init=torch.Tensor.normal_
)

  0%|          | 0/49349 [00:00<?, ?it/s]Skipping token b'49349' with 1-dimensional vector [b'300']; likely a header
 98%|█████████▊| 48277/49349 [00:04<00:00, 11626.22it/s]


In [0]:
# Second trial: VOCAB_SIZE and PAD_IDX are reused from the earlier cell, while
# NUM_FILTERS / FILTER_SIZES (and the global EMBEDDING_DIM) carry the new values.
model_300 = NeuralNet(VOCAB_SIZE, PAD_IDX, NUM_FILTERS, FILTER_SIZES)

In [0]:
# Initialise the embedding layer with the vectors attached to the rebuilt vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model_300.embedding.weight.data.copy_(pretrained_embeddings);

In [0]:
# Overwrite the embeddings of the unknown and padding tokens with zeros.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model_300.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model_300.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [0]:
# Rebind the notebook-level loss and optimizer for model_300; train_and_evaluate_model
# reads these names, so this cell must run before the next training call.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_300.parameters())

In [53]:
# Train the 300-dimensional model for the default 5 epochs and print its test metrics.
train_and_evaluate_model(model_300)

Epoch: 1
train loss: 0.414, train acc: 81.89%
Epoch: 2
train loss: 0.207, train acc: 91.87%
Epoch: 3
train loss: 0.106, train acc: 96.68%
Epoch: 4
train loss: 0.035, train acc: 99.49%
Epoch: 5
train loss: 0.010, train acc: 99.99%
test loss: 0.266, test acc: 90.94%


Здесь уже модель по сути выучила всё, что было в train-корпусе. Поэтому дальнейшего повышения качества нужно добиваться не столько изменением архитектуры (поскольку при такой архитектуре за 5 эпох уже всё выучено и дальше будет только переобучение), сколько, видимо, более точными embedding-ами и добавлением unsupervised-данных в train-выборку (отбор тех, на которых модель с 90% accuracy уверена, т.е. нужно выбрать порог).