In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from string import punctuation

from torch.utils.data import DataLoader, TensorDataset

import numpy as np

import pandas as pd

from collections import Counter

torch.set_printoptions(sci_mode = False)

In [None]:
# Read the three raw sentiment datasets from disk
reddit_df, twitter_df, imdb_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/Reddit_Data.csv",
        "../data/Twitter_Data.csv",
        "../data/IMDB Dataset.csv",
    )
)

In [None]:
# Inspect Reddit data
reddit_df.head()

In [None]:
# Inspect Twitter data
twitter_df.head()

In [None]:
# Inspect IMDB data
imdb_df.head()

In [None]:
# Drop every row that contains at least one missing (NaN) value
reddit_df, twitter_df, imdb_df = (
    frame.dropna() for frame in (reddit_df, twitter_df, imdb_df)
)

In [None]:
# Clean up IMDB data
#   column 0 (review text): strip punctuation and lowercase
#   column 1 (sentiment)  : "positive" -> 1, anything else -> -1
#
# The columns are assigned as a whole instead of writing through
# `imdb_df.iloc[i][col] = ...`: that chained form writes into a temporary
# row object, so the change may never reach the DataFrame (and never does
# with pandas copy-on-write). It is also far slower than a vectorised update.
imdb_df = imdb_df.copy()
review_col, sentiment_col = imdb_df.columns[0], imdb_df.columns[1]

# Accepting the already-converted value 1 keeps this cell idempotent:
# re-running it does not flip every label to -1.
imdb_df[sentiment_col] = np.where(imdb_df[sentiment_col].isin(["positive", 1]), 1, -1)

strip_punctuation = str.maketrans("", "", punctuation)
imdb_df[review_col] = (
    imdb_df[review_col]
    .astype(str)
    .str.translate(strip_punctuation)
    .str.lower()
)

In [None]:
# Inspect IMDB data again
imdb_df.head()

In [None]:
# Get a list of comments
comments = []

In [None]:
# Collect the text column (column 0) of every dataset as lowercase strings,
# always in the order Reddit -> IMDB -> Twitter (the labels are built in the
# same order, so the two lists stay aligned).
#
# The list is rebuilt from scratch, so re-running this cell cannot append the
# same comments twice. Reading the column once also avoids the per-row
# `df.iloc[i][0]` lookup, which is slow and relies on positional Series
# indexing that newer pandas versions deprecate.
# Lowercasing IMDB as well is a no-op when the clean-up cell already did it.
comments = [
    text
    for df in (reddit_df, imdb_df, twitter_df)
    for text in df.iloc[:, 0].astype(str).str.lower()
]

In [None]:
len(comments)

In [None]:
# This is where all the words will stay
all_words = ""

In [None]:
# Join all comments into one whitespace-separated string.
# The separator matters: gluing comments together without it fuses the last
# word of one comment with the first word of the next ("... great" + "bad ..."
# -> "greatbad"), which adds bogus vocabulary entries and can leave real words
# out of the vocabulary entirely.
all_words = " ".join(comments)

In [None]:
# Tokenise on whitespace, then tally the occurrences of each word
all_words = all_words.split()
count_words = Counter()
count_words.update(all_words)

In [None]:
# Rank every distinct word from most to least frequent
# (most_common() without an argument returns all entries)
total_words = len(all_words)
sorted_words = count_words.most_common()

In [None]:
len(count_words)

In [None]:
# Word -> 1-based frequency rank (1 = most common). Id 0 is left unassigned.
vocab_to_int = {word: rank for rank, (word, _) in enumerate(sorted_words, start = 1)}

In [None]:
# Persist the word -> id mapping. np.save wraps the dict in a pickled object
# array and appends ".npy" to the path, so it has to be read back with
# np.load("../data/vocab_to_int.npy", allow_pickle=True).item()
np.save("../data/vocab_to_int", vocab_to_int)

In [None]:
# Encode comments: one list of word ids per comment, in the same order as
# `comments`.
# Words missing from the vocabulary are skipped explicitly. Swallowing the
# KeyError and appending anyway would store the *previous* comment's encoding
# for this comment (or raise NameError on the very first one), silently
# pairing texts with the wrong labels.
comments_int = [
    [vocab_to_int[word] for word in comment.split() if word in vocab_to_int]
    for comment in comments
]

In [None]:
# Labels (0 = NEGATIVE; 0.5 = NEUTRAL; 1 = POSITIVE)
ys = []

In [None]:
# Rescale the label column (column 1) of every dataset from {-1, 0, 1} to
# {0, 0.5, 1}, in the same Reddit -> IMDB -> Twitter order used for the
# comments so that features and labels stay aligned.
#
# The list is rebuilt from scratch (safe to re-run) and each column is read
# once instead of doing a slow per-row `df.iloc[i][1]` lookup.
# Textual IMDB labels are accepted too, in case the clean-up cell has not
# converted them to numbers.
_TEXT_LABELS = {"positive": 1, "negative": -1}

ys = [
    (int(_TEXT_LABELS.get(label, label)) + 1) / 2
    for df in (reddit_df, imdb_df, twitter_df)
    for label in df.iloc[:, 1]
]

In [None]:
ys[:10]

In [None]:
ys = np.array(ys)

In [None]:
# Bring every encoded comment to a fixed length:
# shorter comments are left-padded with zeros,
# longer comments keep only their first seq_length ids
def pad_features(comments_int, seq_length):
    """Return an int matrix with one fixed-length row per encoded comment.

    Args:
        comments_int: list of lists of word ids.
        seq_length: number of columns of the result.

    Returns:
        Array of shape (len(comments_int), seq_length). Each row holds the
        comment's ids right-aligned (zeros in front) when the comment is
        shorter than seq_length, otherwise its first seq_length ids.
    """
    xs = np.zeros((len(comments_int), seq_length), dtype = int)

    # Rows of xs are views, so writing into `row` fills the matrix in place
    for row, tokens in zip(xs, comments_int):
        kept = tokens[:seq_length]

        # Nothing to copy: the row is already all zeros
        if len(kept) == 0:
            continue

        # Right-align the kept ids; the untouched prefix is the zero padding
        row[seq_length - len(kept):] = np.array(kept)

    return xs

In [None]:
# Force every encoded comment to exactly seq_length word ids
# (shorter ones get zeros in front, longer ones are cut off)
seq_length = 150
xs = pad_features(comments_int, seq_length)

In [None]:
# Preparing data for training and testing

In [None]:
# 70 % of the samples go to training, the rest to validation.
#
# The rows of xs / ys are ordered Reddit -> IMDB -> Twitter, so cutting the
# arrays at a fixed position would put only Twitter samples in the validation
# set. A seeded permutation mixes the sources while keeping the split
# reproducible; xs and ys themselves are left untouched, so this cell can be
# re-run safely.
split_percentage = 0.7
assert len(xs) == len(ys), "features and labels must have the same length"

shuffled_idx = np.random.default_rng(42).permutation(len(xs))
split_at = int(split_percentage * len(xs))
train_idx, valid_idx = shuffled_idx[:split_at], shuffled_idx[split_at:]

x_train, y_train = xs[train_idx], ys[train_idx]
x_valid, y_valid = xs[valid_idx], ys[valid_idx]

In [None]:
# Wrap the NumPy splits as datasets of (features, label) tensor pairs
train_features, train_labels = torch.from_numpy(x_train), torch.from_numpy(y_train)
valid_features, valid_labels = torch.from_numpy(x_valid), torch.from_numpy(y_valid)

train_data = TensorDataset(train_features, train_labels)
valid_data = TensorDataset(valid_features, valid_labels)

In [None]:
# Mini-batch size. The same value is passed to model.init_hidden, which is
# why both loaders drop the last incomplete batch.
BS = 50

In [None]:
# Both loaders share the same settings: shuffled, fixed-size batches,
# with the last incomplete batch dropped
loader_options = dict(shuffle = True, batch_size = BS, drop_last = True)

train_loader = DataLoader(train_data, **loader_options)
valid_loader = DataLoader(valid_data, **loader_options)

In [None]:
# MODEL
class SentimentAnalyzer(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid.

    For every sequence of word ids the model returns a single score in the
    open interval (0, 1), computed from the LSTM output at the last time step.

    Args:
        vocab_size: number of rows of the embedding table.
        output_size: width of the final linear layer. ``forward`` returns only
            the last output unit.
        embedding_dim: size of each word embedding.
        hidden_dim: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(
        self,
        vocab_size,
        output_size,
        embedding_dim,
        hidden_dim,
        n_layers
    ):
        super().__init__()

        # Output layer size
        self.output_size = output_size

        # Number of LSTM layers
        self.n_layers = n_layers

        # Hidden layer dimension of LSTM cell
        self.hidden_dim = hidden_dim

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer. Inter-layer dropout only exists between stacked layers,
        # so it is switched off for a single layer (PyTorch warns otherwise).
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            dropout = 0.5 if n_layers > 1 else 0.0,
            batch_first = True
        )

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        """Score a batch of sequences.

        Args:
            x: integer tensor of word ids, shape (batch, seq_len).
            hidden: tuple (h_0, c_0) as produced by ``init_hidden``.

        Returns:
            Tuple ``(scores, hidden)``: ``scores`` has shape (batch,) with
            values in (0, 1); ``hidden`` is the LSTM state after the batch.
        """
        batch_size = x.size(0)

        # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        embeds = self.embedding(x)

        # (batch, seq_len, hidden_dim) plus the updated (h, c) state
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is returned, so only that step needs to go
        # through dropout and the linear layer.
        last_step = lstm_out[:, -1, :]

        # 30% dropout. `training` must be passed explicitly: F.dropout
        # defaults to training=True and would otherwise keep dropping
        # activations after model.eval().
        out = F.dropout(last_step, 0.3, training = self.training)

        # Output of fully connected layer
        out = self.fc(out)

        # Our labels are 0, 0.5 and 1; the sigmoid maps the output
        # into the interval (0, 1)
        sig_out = torch.sigmoid(out)

        # Keep the last output unit of every sample -> shape (batch,)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero (h_0, c_0) pair for ``batch_size`` sequences.

        Both tensors have shape (n_layers, batch_size, hidden_dim) and are
        created with the dtype and device of the model's parameters, so the
        method works whether the model lives on the CPU or on a GPU.
        """
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        hidden = (weight.new_zeros(state_shape),
                  weight.new_zeros(state_shape))
        return hidden

In [None]:
# Hyperparameters and model instance
vocab_size = len(vocab_to_int) + 1  # word ids start at 1; one extra row for id 0
output_size = 1
embedding_dim = 200
hidden_dim = 64
n_layers = 2

model = SentimentAnalyzer(
    vocab_size = vocab_size,
    output_size = output_size,
    embedding_dim = embedding_dim,
    hidden_dim = hidden_dim,
    n_layers = n_layers,
).cuda()
print(model)

In [None]:
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr = lr)

epochs = 4

cnt = 0              # global step counter across all epochs
print_every = 100    # validate, report and checkpoint every N steps
gradient_clip = 5    # maximum gradient norm

torch.backends.cudnn.benchmark = True

# Move every batch to whatever device the model already lives on
device = next(model.parameters()).device

model.train()
for epoch in range(epochs):
    # Every epoch starts from an all-zero hidden / cell state.
    # NOTE(review): the state is then carried from one batch to the next even
    # though the loader shuffles, so consecutive batches hold unrelated
    # comments. Calling init_hidden per batch may be more appropriate;
    # behaviour is left unchanged here.
    hidden_state = model.init_hidden(BS)

    # Loop on batch
    for x, y in train_loader:
        cnt += 1
        x, y = x.to(device).long(), y.to(device).float()

        # Detach the carried state so backpropagation stops at the batch
        # boundary instead of reaching back through all previous batches
        hidden_state = tuple(state.detach() for state in hidden_state)

        model.zero_grad()

        # `out` already has shape (batch,), matching `y`
        out, hidden_state = model(x, hidden_state)
        loss = F.binary_cross_entropy(out, y)
        loss.backward()

        # print(f"LOSS: {loss.item()} | FIRST 5: {out[:5]}")

        # Recurrent networks are prone to exploding gradients: the gradient
        # norm can grow so large that a single update destabilises training.
        # clip_grad_norm_ rescales the gradients so that their total norm
        # never exceeds gradient_clip.
        nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
        optimizer.step()

        if (cnt - 1) % print_every == 0:
            # Get validation loss over the whole validation loader
            val_h_s = model.init_hidden(BS)
            val_losses = []
            model.eval()

            # No gradients are needed for validation; skipping graph
            # construction saves memory and time
            with torch.no_grad():
                for x_val, y_val in valid_loader:
                    x_val = x_val.to(device).long()
                    y_val = y_val.to(device).float()

                    val_out, val_h_s = model(x_val, val_h_s)
                    val_loss = F.binary_cross_entropy(val_out, y_val)

                    val_losses.append(val_loss.item())

            model.train()

            print(f"Epoch: {epoch + 1}/{epochs}",
                  f"Step: {cnt}",
                  f"Loss: {loss.item():.6f}",
                  f"Val Loss: {np.mean(val_losses):.6f}",
                   "Saving...")
            # Pickles the whole module; loading it back requires the
            # SentimentAnalyzer class to be importable
            torch.save(model, "../models/SentimentAnalyzer.pth")