<div style="line-height:1.2;">

<h1 style="color:#BF66F2; margin-bottom: 0.3em;"> Natural Language Processing in PyTorch 1 </h1>

<h4 style="margin-top: 0.3em; margin-bottom: 1em;"> Simple Sentimental analysis. </h4>

<div style="line-height:1.4; margin-bottom: 0.5em;">
    <h3 style="color: lightblue; display: inline; margin-right: 0.5em;">Keywords:</h3> Counter + maketrans() + nn.Embedding() + 'UNK' token + with warnings.catch_warnings() + squeeze()
</div>

</div>

In [18]:
import torch
import warnings
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import string


In [30]:
data = [
    ("The Marvel cinematic universe is incredibly well-built", 1),
    ("Spiderman's character arc in the movies is fascinating", 1),
    ("The Avengers series brings together amazing superheroes", 1),
    ("Iron Man's suit technology is intriguing and fun to watch", 1),
    ("Batman's dark demeanor and backstory are so engaging", 1),
    ("The Joker in The Dark Knight was portrayed phenomenally", 1),
    ("Wonder Woman's character is a true symbol of empowerment", 1),
    ("Black Panther represents both strength and leadership", 1),
    ("The X-men series did a great job handling multiple characters", 1),
    ("The Flash's time travel episodes are always interesting", 1),
    ("Thor’s transformation in the Marvel movies is really admirable", 1),
    ("The concept of multiverse in Dr. Strange is mind-blowing", 1),
    ("Ant-Man brought a fun and lighter tone to the Marvel universe", 1),
    ("The teamwork of the Justice League is wonderful", 1),
    ("Captain America’s dedication towards his duty is inspirational", 1),
    ("The action scenes in Aquaman were spectacular", 1),
    ("Guardians of the Galaxy has a perfect blend of humor and action", 1),
    ("The alliance of superheroes in Avengers: Endgame was epic", 1),
    ("Superman's moral compass is something to admire", 1),
    ("Venom brought an interesting perspective to a villain’s story", 1),
    ("I love how Marvel subtly connects all its movies and characters", 1),
    ("The emotional depth in Logan was surprisingly touching", 1),
    ("The rivalry between Professor X and Magneto is classic", 1),
    ("The animation and storyline in Spiderman: Into the Spider-Verse were amazing", 1),
    ("The heroic acts of the superheroes are truly motivational", 1),
    
    ("I did not enjoy the plot twists in the recent Marvel movie", 0),
    ("The character development in DC movies is often lacking", 0),
    ("The latest Batman movie did not live up to the hype", 0),
    ("I am not a fan of how the Flash’s storyline is progressing", 0),
    ("The reboot of the Spiderman series was totally unnecessary", 0),
    ("The final battle in the Avengers felt rushed and chaotic", 0),
    ("I think Superman's character is too overpowered and unrelatable", 0),
    ("The villains in the DC universe are often underdeveloped", 0),
    ("I find the humor in Guardians of the Galaxy forced and dry", 0),
    ("Marvel movies often compromise story depth for visual effects", 0),
    ("The dark tone of DC movies does not appeal to me", 0),
    ("I think the Marvel cinematic universe is too commercialized", 0),
    ("The Justice League movie did not do justice to the characters", 0),
    ("Wonder Woman’s sequel was not as good as the first movie", 0),
    ("I get bored with the repetitive story arcs in superhero movies", 0),
    ("The character of Hulk is not explored well in the movies", 0),
    ("Batman vs Superman was a letdown considering the expectations", 0),
    ("The plot of the latest DC movie was too predictable", 0),
    ("I feel like Thor’s character is often used just for comic relief", 0),
    ("The last Avengers movie was way too long and tiring", 0),
    ("I did not like the portrayal of Lex Luthor in Batman vs Superman", 0),
    ("The continuity errors in the Marvel universe are too evident", 0),
    ("The death and revival of characters in comics is too repetitive", 0),
    ("The twist with Mandarin in Iron Man 3 was disappointing", 0),
    ("DC should stick to animated series instead of movies", 0),
]


In [31]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=38)

In [32]:
def tokenize(text):
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    return text.split()

word_counts = Counter(tokenize(" ".join([t[0] for t in train_data])))
vocab = set(word_counts.keys())
vocab_to_int = {word: idx for idx, (word, _) in enumerate(word_counts.most_common())}
int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}


In [33]:
""" Add the '<UNK>' token to your vocab_to_int to handle words of the validation set that are not present in the training set.
"""
print('<UNK>' in vocab_to_int)
vocab_to_int['<UNK>'] = 0 

False


In [34]:
class TextDataset(Dataset):
    def __init__(self, data, vocab_to_int, max_len):
        """ Initialize the TextDataset instance.

        Args:
            - Input data [list of tuples]
            - Dictionary that converts words to integers [dict]
            - Maximum length to which sequences should be padded or truncated [int]
        """
        self.data = data
        self.vocab_to_int = vocab_to_int
        self.max_len = max_len
        
    def __len__(self):
        """ Return the size of the data. """
        return len(self.data)
    
    def __getitem__(self, idx):
        """ Return the tokenized and numericalized text and label for the given index.

        Parameters:
            Index for which data is to be returned [int]

        Details: 
            Add the check of words that are not in "vocab_to_int" dictionary.\\
            One common approach is to use a special <UNK> (unknown) token to represent all unknown words. 

        Returns:
            Numericalized text and label [torch.Tensor, torch.Tensor]
        """
        text, label = self.data[idx]
        tokenized = tokenize(text)
        #numericalized = [self.vocab_to_int[word] for word in tokenized]
        numericalized = [self.vocab_to_int.get(word, self.vocab_to_int['<UNK>']) for word in tokenized]
        # Add padding
        pad_len = self.max_len - len(numericalized)
        numericalized += [0] * pad_len  #with 0 is the index for the PAD token
        return torch.tensor(numericalized), torch.tensor(label)


In [35]:
train_dataset = TextDataset(train_data, vocab_to_int, 50)
val_dataset = TextDataset(val_data, vocab_to_int, 50)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

In [36]:
for i, (text, label) in enumerate(train_loader): 
    print(text.shape) 

torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])
torch.Size([2, 50])


In [37]:
class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(SentimentModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.embedding(x)
        _, hidden = self.rnn(x)
        out = self.fc(hidden.squeeze(0))
        return self.sigmoid(out)

model = SentimentModel(len(vocab), 10, 20, 1)
model

SentimentModel(
  (embedding): Embedding(199, 10)
  (rnn): RNN(10, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [38]:
""" Training """
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10

# Training
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)

    for epoch in range(epochs):
        for texts, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(texts).squeeze()
            loss = criterion(outputs.float(), labels.float())
            loss.backward()
            optimizer.step()
            
        print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 0.6488404273986816
Epoch: 2, Loss: 0.6950567960739136
Epoch: 3, Loss: 0.6938516497612
Epoch: 4, Loss: 0.6952284574508667
Epoch: 5, Loss: 0.7652314305305481
Epoch: 6, Loss: 0.6931604146957397
Epoch: 7, Loss: 0.6941854953765869
Epoch: 8, Loss: 0.6795384883880615
Epoch: 9, Loss: 0.6945673227310181
Epoch: 10, Loss: 0.6962963938713074


In [39]:
""" Validation """
model.eval()
total_correct = 0
total_count = 0

with torch.no_grad():
    for texts, labels in val_loader:
        outputs = model(texts).squeeze()
        predicted = torch.round(outputs)
        total_correct += (predicted == labels).sum().item()
        total_count += labels.size(0)
        
accuracy = total_correct / total_count
print("Validation Accuracy: {:.2f}%".format(accuracy*100))

Validation Accuracy: 40.00%
