In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset

import numpy as np

import pandas as pd

from collections import Counter

torch.set_printoptions(sci_mode = False)

In [3]:
# Extract data
reddit_df = pd.read_csv("../data/Reddit_Data.csv")
twitter_df = pd.read_csv("../data/Twitter_Data.csv")

In [4]:
# Inspect Reddit data
reddit_df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [5]:
# Inspect Twitter data
twitter_df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [6]:
# Get rid of NaN columns
reddit_df = reddit_df.dropna()
twitter_df = twitter_df.dropna()

In [7]:
# Get a list of comments
comments = []

In [8]:
for i in range(len(reddit_df)):
    comments.append(str(reddit_df.iloc[i][0]).lower())
    
for i in range(len(twitter_df)):
    comments.append(str(twitter_df.iloc[i][0]).lower())

In [9]:
len(comments)

200118

In [10]:
# This is where all the words will stay
all_words = ""

In [11]:
for string in comments:
    all_words += string

In [12]:
# Count occurences of each word
all_words = all_words.split(" ")
count_words = Counter(all_words)

In [13]:
# Sort them by the most common
total_words = len(all_words)
sorted_words = count_words.most_common(total_words)

In [14]:
count_words

Counter({'': 17229,
         'family': 3148,
         'mormon': 1,
         'have': 27635,
         'never': 5040,
         'tried': 711,
         'explain': 477,
         'them': 8119,
         'they': 22596,
         'still': 4349,
         'stare': 8,
         'puzzled': 5,
         'from': 21034,
         'time': 8988,
         'like': 17373,
         'some': 6863,
         'kind': 1408,
         'strange': 108,
         'creature': 24,
         'nonetheless': 19,
         'come': 4627,
         'admire': 105,
         'for': 64714,
         'the': 159020,
         'patience': 68,
         'calmness': 3,
         'equanimity': 7,
         'acceptance': 60,
         'and': 94821,
         'compassion': 40,
         'developed': 574,
         'all': 21911,
         'things': 2757,
         'buddhism': 95,
         'teaches': 36,
         'has': 24906,
         'very': 5474,
         'much': 5135,
         'lot': 2629,
         'compatible': 14,
         'with': 28483,
         'chris

In [15]:
vocab_to_int = {word : i + 1 for i, (word, count) in enumerate(sorted_words)}

In [47]:
np.save("../data/vocab_to_int", vocab_to_int)

In [16]:
# Encode comments
comments_int = []
for comment in comments:
    try:
        result = [vocab_to_int[word] for word in comment.split(" ")]
    except:
        pass
    comments_int.append(result)

In [17]:
# Labels (-1 = NEGATIVE; 0 = NEUTRAL; 1 = POSITIVE)
ys = []

In [18]:
for i in range(len(reddit_df)):
    ys.append(int(reddit_df.iloc[i][1]))

for i in range(len(twitter_df)):
    ys.append(int(twitter_df.iloc[i][1]))

In [19]:
ys[:10]

[1, 1, -1, 0, 1, -1, 1, 0, -1, 1]

In [20]:
ys = np.array(ys)

In [21]:
# If comments are shorted than a given sequence
# length, pad with zeros
# If comments are longer than a given sequence
# length, truncate them
def pad_features(comments_int, seq_length):
    # xs = Matrix(len(comments_int) X seq_length)
    xs = np.zeros((len(comments_int), seq_length), dtype = int)
    
    # For each indexed comment
    for i, comment in enumerate(comments_int):
        comment_len = len(comment)
        
        # If the comment length is <= the sequence length
        if comment_len <= seq_length:
            # Pad with zeros
            zeros = list(np.zeros(seq_length - comment_len))
            new = zeros + comment
        
        # Otherwise, truncate
        elif comment_len > seq_length:
            new = comment[0:seq_length]
            
        # Populate the features array
        xs[i][:] = np.array(new)
        
    return xs

In [22]:
seq_length = 250
xs = pad_features(comments_int, seq_length)

In [23]:
# Preparing data for training and testing

In [24]:
split_percentage = 0.7
x_train = xs[0 : int(split_percentage * len(xs))]
y_train = ys[0 : int(split_percentage * len(ys))]

In [25]:
remaining_xs = xs[int(split_percentage * len(xs)):]
remaining_ys = ys[int(split_percentage * len(ys)):]

In [26]:
x_valid = remaining_xs[0 : int(len(remaining_xs) * 0.5)]
y_valid = remaining_ys[0 : int(len(remaining_ys) * 0.5)]

In [27]:
x_test = remaining_xs[int(len(remaining_xs) * 0.5):]
y_test = remaining_ys[int(len(remaining_ys) * 0.5):]

In [28]:
# Get dataloaders ready for training and testing
train_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_valid), torch.from_numpy(y_valid))
test_data = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))

In [29]:
BS = 32

In [30]:
train_loader = DataLoader(train_data, shuffle = True, batch_size = BS)
valid_loader = DataLoader(valid_data, shuffle = True, batch_size = BS)
test_loader = DataLoader(test_data, shuffle = True, batch_size = BS)

In [35]:
# MODEL
class SentimentAnalyzer(nn.Module):
    def __init__(
        self,
        vocab_size,
        output_size,
        embedding_dim,
        hidden_dim,
        n_layers
    ):
        super(SentimentAnalyzer, self).__init__()
        
        # Output layer size
        self.output_size = output_size
        
        # Number of LSTM layers
        self.n_layers = n_layers
        
        # Hidden layer dimension of LSTM cell
        self.hidden_dim = hidden_dim
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = -1)
        
        # LSTM layer
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            dropout = 0.5,
            batch_first = True
        )
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x, hidden):
        
        # Xs are going to be of size BATCH_SIZE x SEQ_LEN
        # We are interested in the batch size, so x.size(0)
        # will give us that
        batch_size = x.size(0)
        
        # Output from the embedding layer
        embeds = self.embedding(x)
        
        # Output and new hidden layer from
        # LSTM layer
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # We'll convert that LSTM output into a contiguous
        # tensor. This prevents certain runtime errors
        # from occuring when certain operations happen
        # (like getting a view of a transposed tensor)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # Output after dropping some neurons with
        # 30% dropout rate
        out = F.dropout(lstm_out, 0.3)
        
        # Output of fully connected layer
        out = self.fc(out)
        
        # Our labels are -1, 0 and 1
        # Tanh(x) belongs to the interval [-1, 1]
        tanh_out = F.tanh(out)
        tanh_out = tanh_out.view(batch_size, -1)
        tanh_out = tanh_out[:,-1]
        
        return tanh_out, hidden
    
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(), 
                 weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        return hidden

In [40]:
# Instantiate model and hyperparams
vocab_size = len(vocab_to_int) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

model = SentimentAnalyzer(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model = model.cuda()
print(model)

SentimentAnalyzer(
  (embedding): Embedding(218602, 400, padding_idx=218601)
  (lstm): LSTM(400, 32, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [43]:
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr = lr)

epochs = 4

cnt = 0
print_every = 100
gradient_clip = 5

torch.backends.cudnn.benchmark = True

model.train()
for epoch in range(epochs):
    # Initialize hidden state with 
    # initial model weights and biases
    # Needed because LSTMs are recurrent networks,
    # Meaning that each output produces a new
    # hidden state, which will be passed though a
    # new cell / layer, which will repeat the process.
    hidden_state = model.init_hidden(BS)
    
    # Loop on batch
    for x, y in train_loader:
        cnt += 1
        x, y = x.cuda(), y.cuda()
        hidden_state = tuple([state.data for state in hidden_state])
        
        model.zero_grad()
        
        x = x.type(torch.LongTensor)
        x = x.cuda()
        out, hidden_state = model(x, hidden_state)
        loss = F.binary_cross_entropy(out.squeeze(), y.float())
        loss.backward()
        
        # RNNs and LSTMs have an issue where the gradient will
        # explode A.K.A they will get so big or so small, to
        # the point where they will overflow the data type's allocated
        # (e.g. the would get over the max integer value - 2,147,483,647 - if the gradient
        # was an integer)
        # PyTorch has a nice util called "clip_grad_norm" which can help
        # prevent this issue
        nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
        optimizer.step()
        
        if (cnt - 1) % print_every == 0:
            # Get validation loss
            # Same spiel as above, except this is for validation
            val_h_s = model.init_hidden(BS)
            val_losses = []
            model.eval()
            for x_val, y_val in valid_loader:
                val_h_s = tuple([state.data for state in val_h_s])
                
                x_val, y_val = x_val.cuda(), y_val.cuda()
                
                x_val = x_val.type(torch.LongTensor)
                out, val_h_s = model(x_val, val_h_s)
                val_loss = F.binary_cross_entropy_with_logits(out.cpu().squeeze(), y_val.cpu().float())
                
                val_losses.append(val_loss.item())
            
            model.train()
            
            print(f"Epoch: {epoch + 1}/{epochs}",
                  f"Step: {cnt}",
                  f"Loss: {loss.item():.6f}",
                  f"Val Loss: {np.mean(val_losses):.6f}",
                   "Saving...")
            torch.save(model, "../models/SentimentAnalyzer.pth")

RuntimeError: CUDA error: device-side assert triggered