In [1]:
import os
import tarfile

In [2]:
from dstoolbox.transformers import Padder2d
from dstoolbox.transformers import TextFeaturizer
import numpy as np
from scipy import stats
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from skorch import NeuralNetClassifier
import torch
from torch import nn

# Short alias for torch's functional API (used for dropout/softmax in the model below)
F = nn.functional

In [3]:
# Seed every RNG the notebook relies on. NumPy drives the scipy/sklearn sampling,
# while torch drives weight initialisation, dropout masks and batch shuffling;
# seeding only NumPy leaves the network training non-reproducible.
np.random.seed(0)
torch.manual_seed(0)

In [4]:
VOCAB_SIZE = 1000  # This is on the low end
MAX_LEN = 50  # Texts are pretty long on average, this is on the low end
# Use CUDA when a device is actually available, so the notebook also runs on
# CPU-only machines. Set this to False if you don't want to use CUDA.
USE_CUDA = torch.cuda.is_available()
NUM_CV_STEPS = 10  # Number of randomized search steps to perform

# Load data

In [5]:
if not os.path.exists("aclImdb"):
    # Unpack the downloaded archive only when the extracted folder is missing.
    with tarfile.open("aclImdb_v1.tar.gz", "r:gz") as archive:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are supported: refuse absolute paths, links
            # pointing outside the destination and special files.
            archive.extractall(filter="data")
        else:
            # Older Python without extraction filters: fall back to the plain call.
            archive.extractall()

In [6]:
# Read the training reviews from disk, restricted to the "pos" and "neg" category folders
dataset = load_files("aclImdb/train/", categories=["pos", "neg"])

In [7]:
# Peek at the fields available on the loaded dataset object
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [8]:
# Labels are used as-is; the raw documents arrive as bytes and are decoded to str
# (default codec of bytes.decode) before being collected into a numpy array.
y = dataset["target"]
X = np.asarray([raw.decode() for raw in dataset["data"]])

In [11]:
# Show the first review text
X[0]

"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty."

In [15]:
# Check which distinct label values occur in the targets
np.unique(y)

array([0, 1])

In [17]:
# Class names; an integer label from y indexes into this list
dataset["target_names"]

['neg', 'pos']

In [16]:
# Print the first three reviews, each preceded by its human-readable class name
for text, target in zip(X[:3], y[:3]):
    label = dataset["target_names"][target]
    print("Target: {}".format(label))
    print(text)
    print()

Target: pos
Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty.

Target: neg
Words can't describe how bad this movie is. I can't explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There 

# Transform data

In [18]:
# Preprocessing: map each text to a sequence of vocabulary indices, then pad/truncate
# every sequence to MAX_LEN. The pad value VOCAB_SIZE lies one past the largest
# vocabulary index, so it gets its own row in the model's embedding table.
featurizer = TextFeaturizer(max_features=VOCAB_SIZE)
padder = Padder2d(max_len=MAX_LEN, pad_value=VOCAB_SIZE, dtype=int)
steps = [("to_idx", featurizer), ("pad", padder)]

# Quick look at the transformed output, fitted on just three reviews
Pipeline(steps).fit_transform(X[:3])

array([[220,  48, 104, 217, 190, 186,  63, 156, 186, 207, 193,  29, 218,
        117, 215,  57, 205, 184,  54,  43, 129, 173, 199, 169, 181,  39,
        102,  35, 205, 128,  19,  26,  27, 120, 133,  23,  76, 193,  95,
        206,  87,  49, 190, 210,  77,  44,  38,  98, 140, 190],
       [213,  33,  52,  94,  18, 187, 124, 101,  33,  67, 102,  32, 216,
        137, 217,  87, 191, 163, 102,  76, 219, 190,  78,  17,  83, 133,
         94,  93, 124, 158,  33,  19, 132, 179, 159, 217, 190,  57, 179,
        183,  14, 170, 115,  40, 119,  12,   8, 142, 130, 185],
       [ 65, 151, 181, 148, 153, 203,  98, 187, 108, 131, 124,  24,  79,
        180,  36, 190, 109, 148, 133,  90, 105,  56,  31,  62, 195, 157,
        179, 205,  88,  85, 201,  81, 190,  19, 103,  16,  82, 139, 116,
         63,  25, 180, 124, 166, 196, 179, 202, 143, 190, 174]])

# Model

In [20]:
class RNNClassifier(nn.Module):
    """Recurrent two-class text classifier: embedding -> LSTM/GRU -> linear -> softmax.

    Parameters
    ----------
    embedding_dim : int
        Size of each token embedding vector.
    rec_layer_type : str
        Recurrent layer to use, "lstm" or "gru" (case-insensitive). Any other
        value raises ``KeyError``.
    num_units : int
        Hidden size of the recurrent layer.
    num_layers : int
        Number of stacked recurrent layers.
    dropout : float
        Dropout probability applied to the final hidden state before the
        output layer (only active in training mode).
    """

    def __init__(
        self,
        embedding_dim=128,
        rec_layer_type="lstm",
        num_units=128,
        num_layers=2,
        dropout=0,
    ):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.rec_layer_type = rec_layer_type.lower()
        self.num_units = num_units
        self.num_layers = num_layers
        self.dropout = dropout

        # One extra embedding row beyond the vocabulary: index VOCAB_SIZE is the pad token
        self.emb = nn.Embedding(VOCAB_SIZE + 1, embedding_dim=self.embedding_dim)

        rec_layer = {"lstm": nn.LSTM, "gru": nn.GRU}[self.rec_layer_type]
        # We have to make sure that the recurrent layer is batch_first,
        # since sklearn assumes the batch dimension to be the first
        self.rec = rec_layer(
            self.embedding_dim, self.num_units, num_layers=num_layers, batch_first=True
        )

        self.output = nn.Linear(self.num_units, 2)

    def forward(self, X):
        """Return class probabilities for a batch of token-index sequences.

        ``X`` is an integer tensor of token indices with the batch dimension
        first; the result has one row per sample and two columns that sum to 1.
        """
        embeddings = self.emb(X)
        # from the recurrent layer, only take the final hidden state; an LSTM
        # additionally returns a cell state, which is discarded
        if self.rec_layer_type == "gru":
            _, rec_out = self.rec(embeddings)
        else:
            _, (rec_out, _) = self.rec(embeddings)
        rec_out = rec_out[-1]  # take output of last RNN layer
        # F.dropout defaults to training=True; tie it to the module's mode so that
        # dropout is switched off during validation and prediction
        drop = F.dropout(rec_out, p=self.dropout, training=self.training)
        # Remember that the final non-linearity should be softmax, so that our predict_proba
        # method outputs actual probabilities!
        out = F.softmax(self.output(drop), dim=-1)
        return out

In [21]:
# Add the skorch classifier as the final pipeline step. Any earlier "net" entry is
# dropped first, so re-running this cell replaces the step instead of appending a
# duplicate (Pipeline rejects non-unique step names).
steps = [(name, est) for name, est in steps if name != "net"]
steps.append(
    (
        "net",
        NeuralNetClassifier(
            RNNClassifier,
            device=("cuda" if USE_CUDA else "cpu"),
            max_epochs=5,
            lr=0.01,
            optimizer=torch.optim.RMSprop,
        ),
    )
)

In [22]:
# Full pipeline: raw text -> token indices ("to_idx") -> padded matrix ("pad") -> network ("net")
pipe = Pipeline(steps)

In [23]:
# Fit the whole pipeline on all loaded reviews; %time reports how long the fit takes
%time pipe.fit(X, y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.7681[0m       [32m0.5000[0m        [35m0.6933[0m  1.2950
      2        [36m0.7042[0m       [32m0.5112[0m        [35m0.6925[0m  1.3084
      3        [36m0.7006[0m       [32m0.5162[0m        0.7090  1.3277
      4        [36m0.6522[0m       [32m0.6780[0m        [35m0.6275[0m  1.3339
      5        [36m0.5431[0m       [32m0.7296[0m        [35m0.5505[0m  1.4967
CPU times: user 21.1 s, sys: 1.84 s, total: 23 s
Wall time: 23 s


Pipeline(memory=None,
         steps=[('to_idx',
                 TextFeaturizer(analyzer='word', binary=False,
                                decode_error='strict',
                                dtype=<class 'numpy.int64'>, encoding='utf-8',
                                input='content', lowercase=True, max_df=1.0,
                                max_features=1000, min_df=1, ngram_range=(1, 1),
                                preprocessor=None, stop_words=None,
                                strip_accents=None,
                                token_pattern='(?u)\\b\\w\\w+\\b',
                                tokenizer=None, unknown_token=None,
                                vocabulary=None)),
                ('pad',
                 Padder2d(dtype=<class 'int'>, max_len=50, pad_value=1000)),
                ('net',
                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=RNNClassifier(
    (emb): Embedding(1001, 128)
    (rec): LSTM(128, 128, num

# Randomized search

In [24]:
# Prepare the net for the search below: verbose=0 silences the per-epoch log table,
# and train_split=None turns off skorch's own internal train/validation split
# (RandomizedSearchCV does its own cross-validation splitting).
pipe.set_params(net__verbose=0, net__train_split=None)

Pipeline(memory=None,
         steps=[('to_idx',
                 TextFeaturizer(analyzer='word', binary=False,
                                decode_error='strict',
                                dtype=<class 'numpy.int64'>, encoding='utf-8',
                                input='content', lowercase=True, max_df=1.0,
                                max_features=1000, min_df=1, ngram_range=(1, 1),
                                preprocessor=None, stop_words=None,
                                strip_accents=None,
                                token_pattern='(?u)\\b\\w\\w+\\b',
                                tokenizer=None, unknown_token=None,
                                vocabulary=None)),
                ('pad',
                 Padder2d(dtype=<class 'int'>, max_len=50, pad_value=1000)),
                ('net',
                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=RNNClassifier(
    (emb): Embedding(1001, 128)
    (rec): LSTM(128, 128, num

In [25]:
# Search space for the randomized search. Keys follow the "<step>__<param>" convention;
# the "net__module__*" entries match the RNNClassifier constructor arguments by name.
# Lists are sampled from directly, scipy distributions are drawn from per candidate.
params = {
    "to_idx__stop_words": ["english", None],
    "to_idx__lowercase": [False, True],
    "to_idx__ngram_range": [(1, 1), (2, 2)],
    # stats.randint excludes the upper bound, hence "+ 1" to make 256 reachable
    "net__module__embedding_dim": stats.randint(32, 256 + 1),
    "net__module__rec_layer_type": ["gru", "lstm"],
    "net__module__num_units": stats.randint(32, 256 + 1),
    "net__module__num_layers": [1, 2, 3],
    # stats.uniform(loc, scale) covers [loc, loc + scale], i.e. [0, 0.9] here
    "net__module__dropout": stats.uniform(0, 0.9),
    # Learning rates are pre-drawn once, when this cell runs (NUM_CV_STEPS values).
    # The exponent is uniform on [1, 6] (loc=1, scale=5), so lr lies in [1e-6, 1e-1].
    # NOTE(review): if [1e-5, 1e-1] was intended, this should be stats.uniform(1, 4) - confirm.
    "net__lr": [10 ** (-stats.uniform(1, 5).rvs()) for _ in range(NUM_CV_STEPS)],
    "net__max_epochs": [5, 10],
}

In [26]:
# Randomized hyper-parameter search over the whole pipeline: NUM_CV_STEPS sampled
# candidates, each scored by accuracy with 3-fold cross-validation. With refit=False
# no final model is retrained on the best candidate afterwards.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params,
    n_iter=NUM_CV_STEPS,
    cv=3,
    scoring="accuracy",
    refit=False,
    verbose=2,
)

In [27]:
# Run the search on the first 5000 samples only (presumably to keep the runtime
# manageable - confirm); %time reports the total duration
%time search.fit(X[:5000], y[:5000])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] net__lr=0.0006487299427756309, net__max_epochs=10, net__module__dropout=0.47600542777761407, net__module__embedding_dim=120, net__module__num_layers=2, net__module__num_units=197, net__module__rec_layer_type=lstm, to_idx__lowercase=True, to_idx__ngram_range=(1, 1), to_idx__stop_words=None 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  net__lr=0.0006487299427756309, net__max_epochs=10, net__module__dropout=0.47600542777761407, net__module__embedding_dim=120, net__module__num_layers=2, net__module__num_units=197, net__module__rec_layer_type=lstm, to_idx__lowercase=True, to_idx__ngram_range=(1, 1), to_idx__stop_words=None, total=   4.5s
[CV] net__lr=0.0006487299427756309, net__max_epochs=10, net__module__dropout=0.47600542777761407, net__module__embedding_dim=120, net__module__num_layers=2, net__module__num_units=197, net__module__rec_layer_type=lstm, to_idx__lowercase=True, to_idx__ngram_range=(1, 1), to_idx__stop_words=None 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s remaining:    0.0s


[CV]  net__lr=0.0006487299427756309, net__max_epochs=10, net__module__dropout=0.47600542777761407, net__module__embedding_dim=120, net__module__num_layers=2, net__module__num_units=197, net__module__rec_layer_type=lstm, to_idx__lowercase=True, to_idx__ngram_range=(1, 1), to_idx__stop_words=None, total=   4.4s
[CV] net__lr=0.0006487299427756309, net__max_epochs=10, net__module__dropout=0.47600542777761407, net__module__embedding_dim=120, net__module__num_layers=2, net__module__num_units=197, net__module__rec_layer_type=lstm, to_idx__lowercase=True, to_idx__ngram_range=(1, 1), to_idx__stop_words=None 
[CV]  net__lr=0.0006487299427756309, net__max_epochs=10, net__module__dropout=0.47600542777761407, net__module__embedding_dim=120, net__module__num_layers=2, net__module__num_units=197, net__module__rec_layer_type=lstm, to_idx__lowercase=True, to_idx__ngram_range=(1, 1), to_idx__stop_words=None, total=   4.4s
[CV] net__lr=0.0007615983654766594, net__max_epochs=10, net__module__dropout=0.331

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('to_idx',
                                              TextFeaturizer(analyzer='word',
                                                             binary=False,
                                                             decode_error='strict',
                                                             dtype=<class 'numpy.int64'>,
                                                             encoding='utf-8',
                                                             input='content',
                                                             lowercase=True,
                                                             max_df=1.0,
                                                             max_features=1000,
                                                             min_df=1,
                                                             ngram