In [None]:
"""
https://nbviewer.jupyter.org/github/cezannec/CNN_Text_Classification/blob/master/CNN_Text_Classification.ipynb
https://github.com/cezannec/CNN_Text_Classification

In this notebook, I'll train a CNN to classify the sentiment of movie reviews in a corpus of text.
"""

In [28]:
import numpy as np
from string import punctuation
from collections import Counter
from gensim.models import KeyedVectors
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

Get the data

In [6]:
#! wget https://raw.githubusercontent.com/cezannec/CNN_Text_Classification/master/data/labels.txt

--2020-05-11 22:21:22--  https://raw.githubusercontent.com/cezannec/CNN_Text_Classification/master/data/labels.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.196.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.196.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 225000 (220K) [text/plain]
Saving to: ‘labels.txt’


2020-05-11 22:21:23 (542 KB/s) - ‘labels.txt’ saved [225000/225000]



In [7]:
#! wget https://raw.githubusercontent.com/cezannec/CNN_Text_Classification/master/data/reviews.txt

--2020-05-11 22:21:25--  https://raw.githubusercontent.com/cezannec/CNN_Text_Classification/master/data/reviews.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.196.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.196.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33678267 (32M) [text/plain]
Saving to: ‘reviews.txt’


2020-05-11 22:27:19 (93.5 KB/s) - ‘reviews.txt’ saved [33678267/33678267]



Load in the data and visualize it

In [2]:
with open('reviews.txt', 'r') as f:
    reviews = f.read()

with open('labels.txt', 'r') as f:
    labels = f.read()

print(reviews[:1000])
print()
print(labels[:20])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

Data Pre-processing

reviews are delimited with newline characters \n

In [3]:
# remove punctuation
reviews = reviews.lower()
all_text = "".join([c for c in reviews if c not in punctuation])

#split by new lines and spaces
reviews_split = all_text.split('\n')
all_text = " ".join(reviews_split)

# create a list of all words
all_words = all_text.split()

In [4]:
print(all_words[:20])

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such']


Encoding labels

In [5]:
# 1=positive, 0=negative
labels_split = labels.split('\n')
encoded_labels = np.array([1 if label == "positive" else 0 for label in labels_split])
print(encoded_labels[:3])

[1 0 1]


Removing Outliers

1. Getting rid of extremely long or short reviews; the outliers

2. Padding/truncating the remaining data so that we have reviews of the same length.

In [6]:
# build a dictionary that maps indices to review lengths
counts = Counter(all_words)

# outlier review stats; counting words in each review
review_lens = Counter([len(x.split()) for x in reviews_split])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [7]:
#remove any super short reviews and truncate super long reviews.
print("Number of reviews before removing outliers", len(reviews_split))

# get indices of any reviews with length 0
non_zero_idx = [ii for ii, review in enumerate(reviews_split) if len(review.split()) != 0]

# remove 0-length reviews and their labels
reviews_split = [reviews_split[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])

# remove reviews larger than 1000 words and their labels
non_zero_idx = [ii for ii, review in enumerate(reviews_split) if len(review.split()) <= 1000]
reviews_split = [reviews_split[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])

print("Number of reviews after removing outliers:", len(reviews_split))

Number of reviews before removing outliers 25001
Number of reviews after removing outliers: 24855


Using a Pre-trained Embedding layer

In [22]:
# load a pretrained word2vec model
#! gzip d word2vec_model/GoogleNews-vectors-negative300-SLIM.bin.gz

gzip: d: No such file or directory
gzip: word2vec_model/GoogleNews-vectors-negative300-SLIM.bin.gz already has .gz suffix -- unchanged


In [8]:
# creating the model
embed_lookup = KeyedVectors.load_word2vec_format('word2vec_model/GoogleNews-vectors-negative300-SLIM.bin',
                                                 binary=True)

In [9]:
# store pretrained vocab
pretrained_words = []
for word in embed_lookup.vocab:
    pretrained_words.append(word)

In [10]:
row_idx = 1
# get word/embedding in that row
word = pretrained_words[row_idx]
embedding = embed_lookup[word]

print("Size of Vocab: {}".format(len(pretrained_words)))
print("Word in Vocab: {}".format(word))
print("Length of embedding {}".format(len(embedding)))

Size of Vocab: 299567
Word in Vocab: for
Length of embedding 300


In [11]:
# print a few common words
for i in range(5):
    print(pretrained_words[i])

in
for
that
is
on


In [12]:
# find similar words
# Select a word
find_similar_to = "fabulous"
print("Similar words to '{}'".format(find_similar_to))
for similar_word in embed_lookup.similar_by_word(find_similar_to):
    print("Word {}, Similarity: {:.3f}".format(similar_word[0], similar_word[1]))

Similar words to 'fabulous'
Word wonderful, Similarity: 0.761
Word fantastic, Similarity: 0.761
Word marvelous, Similarity: 0.730
Word gorgeous, Similarity: 0.714
Word lovely, Similarity: 0.713
Word terrific, Similarity: 0.694
Word amazing, Similarity: 0.693
Word beautiful, Similarity: 0.670
Word magnificent, Similarity: 0.667
Word splendid, Similarity: 0.645


Tokenize Reviews

In [13]:
# Convert reviews to tokens
def tokenize_all_reviews(embed_lookup, reviews_split):
    # split each review into a list of words
    reviews_words = [review.split() for review in reviews_split]

    tokenized_reviews = []
    for review in reviews_words:
        ints = []
        for word in review:
            try:
                idx = embed_lookup.vocab[word].index
            except:
                idx = 0
            ints.append(idx)
        tokenized_reviews.append(ints)
    return tokenized_reviews

In [14]:
tokenized_reviews = tokenize_all_reviews(embed_lookup, reviews_split)

In [18]:
print(reviews_split[0])
print(len(reviews_split[0].split()))
print(tokenized_reviews[0])
print(len(tokenized_reviews[0]))

bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   
140
[0, 137, 3, 0, 11620, 3799, 13, 1215, 10, 9, 194, 54, 12, 73, 61, 685, 41, 183, 243, 129, 12, 1663, 119, 72, 0, 9, 2989, 7334, 242, 159, 0, 453, 2, 0, 137, 1239, 19951, 3, 141, 1980, 0, 1898

Padding sequences

Your final features array should be a 2D array, with as many rows as there are reviews,
and as many columns as the specified seq_length.

In [20]:
def pad_features(tokenized_reviews, seq_length):
    # padding at the beginning
    # getting the correct rows x cols shape
    features = np.zeros((len(tokenized_reviews), seq_length), dtype=int)

    # for each review
    for i, row in enumerate(tokenized_reviews):
        features[i, -len(row):] = np.array(row)[:seq_length]

    return features

In [23]:
seq_length = 200

features = pad_features(tokenized_reviews, seq_length=seq_length)

# Test
assert len(features)==len(tokenized_reviews), "Features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

# print first 5 values for the first 20
print(features[:20, :5])

[[     0      0      0      0      0]
 [     0      0      0      0      0]
 [ 16483     26      0     12 106210]
 [  1935   1326     12      0   1403]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [    56   4365      8    270    119]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [     9    104   1428     16      0]
 [     0     25  13619  11902   7445]
 [     0      0      0      0      0]
 [     9    208  18994  66850 121241]
 [     0      0      0      0      0]
 [    38    165  66850 121241  13241]
 [     9    661      3    675     67]]


Training, validation and test data

In [24]:
split_frac = 0.8

split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(19884, 200) 
Validation set: 	(2485, 200) 
Test set: 		(2486, 200)


Dataloaders and Batching

In [26]:
# Create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# shuffling and batching data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

Sentiment Network

In [27]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print("Training on GPU")
else:
    print("No GPU available. Training on CPU")

No GPU available. Training on CPU


In [29]:
class SentimentCNN(nn.Module):

    def __init__(self, embed_model, vocab_size, output_size, embedding_dim, num_filters=100,
                 kernel_sizes=[3,4,5], freeze_embeddings=True, drop_probab=0.5):
        super(SentimentCNN, self).__init__()
        self.num_filters = num_filters
        self.embedding_dim = embedding_dim

        #1. embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.from_numpy(embed_model.vectors))
        if freeze_embeddings:
            self.embedding.requires_grad = False

        #2. CNN Layers
        self.convs_1d = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embedding_dim), padding=(k-2, 0))
            for k in kernel_sizes])

        #3. FC layer for classification
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, output_size)

        #4. Dropout and sigmoid layers
        self.dropout = nn.Dropout(drop_probab)
        self.sig = nn.Sigmoid()

    def conv_and_pool(self, x, conv):
        "Convolutional + max pooling layer"
        # Squeeze the last dimension to get size: (batch_size, num_filters, conv_seq_length)
        # conv_seq_length will be ~ 200
        x = F.relu(conv(x)).squeeze(3)

        # 1D pool over conv_seq_length
        # squeeze to get size: (batch_size, num_filters)
        x_max = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x_max

    def forward(self, x):
        embeds = self.embedding(x)
        embeds = embeds.unsqueeze(1)

        conv_results = [self.conv_and_pool(embeds, conv) for conv in self.convs_1d]

        x = torch.cat(conv_results, 1)
        x = self.dropout(x)

        logit = self.fc(x)

        return self.sig(logit)

Instantiate the network

In [30]:
# Instantiate the model w/ hyperparams

vocab_size = len(pretrained_words)
output_size = 1 # binary class (1 or 0)
embedding_dim = len(embed_lookup[pretrained_words[0]]) # 300-dim vectors
num_filters = 100
kernel_sizes = [3, 4, 5]

net = SentimentCNN(embed_lookup, vocab_size, output_size, embedding_dim,
                   num_filters, kernel_sizes)

print(net)

SentimentCNN(
  (embedding): Embedding(299567, 300)
  (convs_1d): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1), padding=(2, 0))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1), padding=(3, 0))
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (sig): Sigmoid()
)


Training

In [31]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [32]:
# training loop
def train(net, train_loader, epochs, print_every=100):

    # move model to GPU, if available
    if(train_on_gpu):
        net.cuda()

    counter = 0 # for printing

    # train for some number of epochs
    net.train()
    for e in range(epochs):

        # batch loop
        for inputs, labels in train_loader:
            counter += 1

            if(train_on_gpu):
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output = net(inputs)

            # calculate the loss and perform backprop
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            optimizer.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_losses = []
                net.eval()
                for inputs, labels in valid_loader:

                    if(train_on_gpu):
                        inputs, labels = inputs.cuda(), labels.cuda()

                    output = net(inputs)
                    val_loss = criterion(output.squeeze(), labels.float())

                    val_losses.append(val_loss.item())

                net.train()
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))

In [33]:
# training params

epochs = 2 # this is approx where I noticed the validation loss stop decreasing
print_every = 100

train(net, train_loader, epochs, print_every=print_every)

Epoch: 1/2... Step: 100... Loss: 0.459185... Val Loss: 0.440422
Epoch: 1/2... Step: 200... Loss: 0.313580... Val Loss: 0.358592
Epoch: 1/2... Step: 300... Loss: 0.181484... Val Loss: 0.342895
Epoch: 2/2... Step: 400... Loss: 0.149747... Val Loss: 0.325604
Epoch: 2/2... Step: 500... Loss: 0.348554... Val Loss: 0.335780
Epoch: 2/2... Step: 600... Loss: 0.205872... Val Loss: 0.344993
Epoch: 2/2... Step: 700... Loss: 0.422730... Val Loss: 0.347651


Testing

In [34]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0


net.eval()
# iterate over test data
for inputs, labels in test_loader:

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()

    # get predicted outputs
    output = net(inputs)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.386
Test accuracy: 0.844


Inference on a test review

In [36]:
#from string import punctuation

# helper function to process and tokenize a single review
def tokenize_review(embed_lookup, test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    tokenized_review = []
    for word in test_words:
        try:
            idx = embed_lookup.vocab[word].index
        except:
            idx = 0
        tokenized_review.append(idx)

    return tokenized_review

In [35]:
def predict(embed_lookup, net, test_review, sequence_length=200):
    """
    Predict whether a given test_review has negative or positive sentiment.
    """

    net.eval()

    # tokenize review
    test_ints = tokenize_review(embed_lookup, test_review)

    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features([test_ints], seq_length)

    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)

    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()

    # get the output from the model
    output = net(feature_tensor)

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

    # print custom response
    if(pred.item()==1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")

Test on positive/negative reviews

In [37]:
seq_length=200 # good to use the length that was trained on

In [38]:
# negative test review
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'

# test negative review
predict(embed_lookup, net, test_review_neg, seq_length)

Prediction value, pre-rounding: 0.000704
Negative review detected.


In [39]:
# positive test review
test_review_pos = 'This movie had the best acting and the dialogue was so good. I loved it.'

predict(embed_lookup, net, test_review_pos, seq_length)

Prediction value, pre-rounding: 0.999048
Positive review detected!
