<a href="https://colab.research.google.com/github/jacobpaul07/Delivery_app_android/blob/master/Upgraded_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torchtext import data
from torchtext import datasets
# Seed PyTorch's RNG and ask cuDNN for deterministic kernels so runs are repeatable.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# include_lengths=True makes batch.text a (tokens, lengths) pair, which the
# train/evaluate loops unpack and pass to the model.
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
# Labels are stored as floats; the loss used later is BCEWithLogitsLoss.
LABEL = data.LabelField(dtype = torch.float)
print(TEXT)

<torchtext.data.field.Field object at 0x7f7380317c88>


In [2]:
from torchtext import datasets
# Load the IMDB train/test splits, using TEXT for the reviews and LABEL for the targets.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 10.7MB/s]


In [0]:
import random
# Seed Python's RNG, then hand its state to split() explicitly.
# random.seed() returns None, so passing its return value as random_state only
# worked through the side effect of seeding the global generator.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [4]:
# Upper bound on the vocabulary size handed to build_vocab.
MAX_VOCAB_SIZE = 25000
# Build the text vocabulary from the training split only and attach the
# 100-dimensional GloVe vectors; unk_init is the initializer (an in-place
# normal draw) passed for tokens without a pre-trained vector.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:31, 2.20MB/s]                           
100%|█████████▉| 399116/400000 [00:22<00:00, 18851.58it/s]

In [0]:
BATCH_SIZE = 64
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split, all placing their batches on `device`.
# sort_within_batch=True is presumably set because the model packs each batch
# with pack_padded_sequence, which by default expects length-sorted sequences.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [0]:
import torch.nn as nn
class RNN(nn.Module):
    """LSTM classifier over padded batches of token indices.

    Pipeline: embedding -> dropout -> (optionally bidirectional, multi-layer)
    LSTM -> dropout on the top layer's final hidden state -> linear layer that
    emits ``output_dim`` raw scores (logits) per sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads each sequence backwards.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when ``n_layers > 1``) and to the final hidden state.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bool(bidirectional),
                           # Inter-layer dropout has no effect with a single
                           # layer and only triggers a warning there.
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input is one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits of shape ``[batch, output_dim]``.

        Args:
            text: LongTensor ``[seq_len, batch]`` of token indices
                (sequence-first, since the LSTM is built without batch_first).
            text_lengths: true length of every sequence in the batch, sorted in
                decreasing order (pack_padded_sequence's default requirement).
        """
        embedded = self.dropout(self.embedding(text))
        # pack_padded_sequence wants the lengths on the CPU even when the
        # batch itself lives on the GPU.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
        # Only the final hidden state is needed; per-step outputs are discarded.
        _, (hidden, _) = self.rnn(packed_embedded)
        if self.rnn.bidirectional:
            # The last two slices are the top layer's forward and backward states.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden = hidden[-1, :, :]
        return self.fc(self.dropout(final_hidden))

In [0]:
# Model hyper-parameters. EMBEDDING_DIM is 100 to line up with the
# "glove.6B.100d" vectors attached to the vocabulary.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Keyword arguments make the mapping from constant to constructor parameter explicit.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [8]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are left out of the tally.
        if param.requires_grad:
            total += param.numel()
    return total
# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [9]:
# Vectors attached to the vocabulary by build_vocab(vectors=...); the shape is
# printed so it can be checked against the embedding layer before copying.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [10]:
# Overwrite the embedding layer's initial weights in place with the pre-trained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.1640, -0.1446,  0.6948,  ..., -0.4320, -0.1155,  0.9253],
        [-0.9153,  0.4699, -0.6548,  ..., -0.2684, -0.3351, -0.2960],
        [ 0.3461, -0.0135, -0.1925,  ...,  0.5472, -0.2596,  0.3109]])

In [11]:
 UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.1640, -0.1446,  0.6948,  ..., -0.4320, -0.1155,  0.9253],
        [-0.9153,  0.4699, -0.6548,  ..., -0.2684, -0.3351, -0.2960],
        [ 0.3461, -0.0135, -0.1925,  ...,  0.5472, -0.2596,  0.3109]])


In [0]:
import torch.optim as optim
# Adam over every model parameter, with the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [13]:
# Move the model to the selected device, and build the loss there too.
# BCEWithLogitsLoss consumes raw logits, which is why the model's forward
# pass ends at the linear layer.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)

100%|█████████▉| 399116/400000 [00:39<00:00, 18851.58it/s]

In [0]:
def binary_accuracy(preds, y):
    """Fraction of predictions that match ``y`` after thresholding.

    ``preds`` holds raw logits: they are squashed with a sigmoid and rounded
    to 0/1 before the element-wise comparison with the targets. The result is
    returned as a 0-dim tensor.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Each batch's ``text`` attribute is unpacked into (tokens, lengths) and fed
    to the model; the loss is back-propagated and the optimizer stepped.

    Returns:
        (loss, accuracy): per-batch values averaged over the number of batches.
    """
    model.train()
    running_loss, running_acc = 0, 0
    for batch in iterator:
        optimizer.zero_grad()
        tokens, lengths = batch.text
        # The model emits [batch, 1]; drop the trailing axis to match the labels.
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()
    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [0]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating any weights.

    The model is switched to eval mode and gradient tracking is disabled for
    the whole pass.

    Returns:
        (loss, accuracy): per-batch values averaged over the number of batches.
    """
    model.eval()
    loss_total = 0
    acc_total = 0
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            # The model emits [batch, 1]; drop the trailing axis to match the labels.
            logits = model(tokens, lengths).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)
            loss_total += batch_loss.item()
            acc_total += batch_acc.item()
    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [0]:
import time
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into (minutes, seconds).

    Both components are truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [18]:
N_EPOCHS = 5
# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    # The timed window covers both the training pass and the validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Keep only the weights from the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 40s
	Train Loss: 0.635 | Train Acc: 62.53%
	 Val. Loss: 0.933 |  Val. Acc: 50.09%
Epoch: 02 | Epoch Time: 1m 39s
	Train Loss: 0.647 | Train Acc: 62.54%
	 Val. Loss: 0.689 |  Val. Acc: 59.55%
Epoch: 03 | Epoch Time: 1m 39s
	Train Loss: 0.607 | Train Acc: 66.10%
	 Val. Loss: 0.427 |  Val. Acc: 81.80%
Epoch: 04 | Epoch Time: 1m 39s
	Train Loss: 0.423 | Train Acc: 81.21%
	 Val. Loss: 0.315 |  Val. Acc: 87.12%
Epoch: 05 | Epoch Time: 1m 39s
	Train Loss: 0.318 | Train Acc: 87.32%
	 Val. Loss: 0.286 |  Val. Acc: 88.71%


In [19]:
# Restore the checkpoint saved at the lowest validation loss before scoring the test set.
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load('tut2-model.pt', map_location = device))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.318 | Test Acc: 87.29%


In [0]:
import spacy
# NOTE(review): the 'en' shortcut presumably only resolves on spaCy 2.x; newer
# releases need the full package name (e.g. 'en_core_web_sm') — confirm the
# installed version before changing it.
nlp = spacy.load('en')
def predict_sentiment(model, sentence):
    """Score one raw sentence with the trained model.

    The sentence is tokenized with spaCy, mapped to vocabulary indices and fed
    to the model as a batch of size one.

    Args:
        model: network called as ``model(tokens, lengths)`` with tokens of
            shape [seq_len, 1] and lengths of shape [1].
        sentence: text to classify.

    Returns:
        float: sigmoid of the model's single output logit.

    Raises:
        ValueError: if the sentence yields no tokens (an empty sequence cannot
            be packed for the LSTM).
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError('sentence must contain at least one token')
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # Shape [seq_len, 1]: a single-example batch in sequence-first layout.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    # The length tensor is deliberately left on the CPU.
    length_tensor = torch.LongTensor([len(indexed)])
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

MOVIE REVIEW : THE INVISIBLE MAN

In [21]:
# Score a sample review of 'The Invisible Man'; the returned probability is the cell's output.
predict_sentiment(model, 
    """              
        A reboot of ‘The Invisible Man’ film series, based on the sci-fi novel by H. G. Wells, 
        it becomes immediately evident that this version doesn’t choose any short cuts. 
        The first two acts are a slow buildup, allowing us to understand why Cecilia was so afraid of Adrian.
        The dialogue also reveals details without the need for lengthy exposition or flashbacks.
        There are no dream sequences, fake-outs, and none of the other cheap narrative tricks most thrillers prefer.
        The jump scares are well earned, and come out of the blue, exactly how they should be. 
        A lot is happening in plain sight, but rather than showing you where to look, writer and director Leigh Whannell  
        uses intelligent framing to accommodate the person in the room who isn’t there. Or, is he? This question lingers in your mind throughout the film 
        and doesn’t let go until the climax. Whannell is also adept at fight sequences, so when the film does amp up the action,
       the camerawork and choreography are easy to follow, and add to the suspenseful plot rather than distract from it.      
    """            
      )

0.26839983463287354

MOVIE REVIEW : KANNUM KANNUM KOLLAIYADITHAAL

In [22]:
# Score a sample review of 'Kannum Kannum Kollaiyadithaal'; the returned probability is the cell's output.
predict_sentiment(model, 
        """
       Kannum Kannum Kollaiyadithaal is a winsome romantic thriller with charming leads and edge-of-the-seat moments.
       Desingh Periyasamy displays a flair for this material both in his writing and making.
       Though his premise isn't new, he manages to inject freshness into the scenes with clever writing. 
       Take Kallis. He initially seems like just another wisecracking friend that we come across in our films, but the director gradually turns him 
       into a parallel lead who is used to effectively lighten up the heavier scenes. Even a breakdown scene of this character
       is filled with the right amount of humour to ensure that things don't turn too serious.
       And when the film turns into a proper heist movie in the second half, the shift feels seamless.
       The heist portions, too, have the right amount of thrills and laughs. The technical team ensures that the breezy mood sustains.
       Masala Coffee and Harshvardhan Rameshwar's songs and score are peppy while KM Bhaskaran provides the richness that the script deserves with his visuals.
       In fact, the longer running time doesn't feel like an issue at all here, though the first 15-20 minutes are rather underwhelming.
    
       """
       )        


0.9138174057006836

In [29]:
# Score a single word; the surrounding newlines and indentation are part of the input string.
predict_sentiment(model,
                  """
                  best
                  """
                  )

0.9654898047447205