In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BiLSTMSentiment(nn.Module):
    """Two-layer bidirectional LSTM classifier over pre-trained word embeddings.

    Token ids are looked up in an embedding table initialised from
    ``embed_model.vectors``, passed through a 2-layer bidirectional LSTM, and
    the top layer's final forward and backward hidden states are concatenated
    and projected to ``label_size`` raw logits.

    Args:
        embed_model: Object exposing a ``vectors`` NumPy array; it is expected
            to have shape ``(vocab_size, embedding_dim)``.
        vocab_size: Number of rows expected in the embedding table. Kept for
            interface compatibility; the actual table size is taken from
            ``embed_model.vectors``.
        label_size: Number of output classes.
        embedding_dim: Width of each embedding vector (the LSTM input size).
        hidden_dim: Hidden size of each LSTM direction.
        batch_size: Stored on the instance; not used by the layers.
        seq_length: Stored on the instance; not used by the layers.
        dropout: Dropout probability applied between the two LSTM layers
            (only active in training mode).
        freeze_embeddings: When True the embedding weights are excluded from
            gradient updates.
    """

    def __init__(self, embed_model, vocab_size, label_size, embedding_dim, hidden_dim, batch_size, seq_length, dropout=0.5, freeze_embeddings = True):
        super().__init__()
        # set class vars
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.dropout = dropout

        # 1. embedding layer, initialised once from the pre-trained vectors.
        # `from_pretrained` sets `weight.requires_grad = not freeze`, which is
        # what actually freezes the table (setting `requires_grad` on the
        # module itself has no effect).
        # NOTE: torch.from_numpy shares memory with the NumPy array, so training
        # with freeze_embeddings=False may modify `embed_model.vectors` in place.
        pretrained = torch.from_numpy(embed_model.vectors).float()
        self.embeddings = nn.Embedding.from_pretrained(pretrained, freeze=freeze_embeddings)

        # 2. stacked bidirectional LSTM; `dropout` acts between the two layers
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )

        # 3. classifier over the concatenated forward + backward hidden states
        self.hidden2label = nn.Linear(hidden_dim * 2, label_size)

    def forward(self, sentence):
        """Return raw (un-normalised) class logits for a batch of token ids.

        Args:
            sentence: Integer tensor of token indices; with ``batch_first=True``
                the LSTM expects ``(batch, seq_len)`` after embedding lookup.

        Returns:
            Tensor of logits whose last dimension is ``label_size``.
        """
        x = self.embeddings(sentence)
        _, (h_n, _) = self.lstm(x)
        # h_n stacks (layer, direction) along dim 0, so the last two entries
        # are the top layer's forward and backward final states. Concatenating
        # them yields the 2 * hidden_dim features `hidden2label` expects.
        sentence_repr = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        return self.hidden2label(sentence_repr)

In [5]:
import numpy as np
def pad_features(tokenized_text, seq_length):
    """Build a fixed-width integer feature matrix from tokenized reviews.

    Each review is left-padded with 0's up to ``seq_length`` or truncated to
    its first ``seq_length`` tokens.

    Args:
        tokenized_text: Sequence of reviews, each a sequence of integer token ids.
        seq_length: Number of columns in the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(len(tokenized_text), seq_length)`` with
        integer dtype. Reviews with no tokens yield an all-zero row.
    """
    # getting the correct rows x cols shape
    features = np.zeros((len(tokenized_text), seq_length), dtype=int)

    for i, row in enumerate(tokenized_text):
        kept = np.asarray(row)[:seq_length]
        # An empty review (or seq_length == 0) leaves the row as padding.
        # Without this guard, `-0:` would select the whole row and the
        # assignment of an empty array would fail to broadcast.
        if kept.size == 0:
            continue
        # right-align the tokens so the padding sits on the left
        features[i, -kept.size:] = kept

    return features

# convert reviews to tokens
def tokenize_all_text(embed_lookup, data):
    """Convert every item's text into a list of vocabulary indices.

    Args:
        embed_lookup: Object exposing a ``key_to_index`` mapping from word to
            integer index.
        data: Mapping whose values are dicts containing a ``'text'`` string.

    Returns:
        A list with one entry per item in ``data`` (in iteration order); each
        entry is the list of indices for the whitespace-split words of that
        item's text. Words missing from ``key_to_index`` are mapped to 0.
    """
    # Resolve the mapping once; a lookup object without `key_to_index` now
    # fails loudly instead of silently turning every word into 0.
    key_to_index = embed_lookup.key_to_index

    # NOTE(review): 0 is used for unknown words but may also be a real
    # vocabulary index -- confirm this collision is acceptable.
    return [
        [key_to_index.get(word, 0) for word in item['text'].split()]
        for item in data.values()
    ]

# import Word2Vec loading capabilities
from gensim.models import KeyedVectors

# Load the pre-trained word2vec vectors (binary format) used for the
# word -> index lookups during tokenization
embed_lookup = KeyedVectors.load_word2vec_format(
    'word2vec_model/GoogleNews-vectors-negative300-SLIM.bin',
    binary=True,
)



KeyboardInterrupt: 

In [None]:
# LREC: tokenize, pad, split and batch the LREC dataset
import json

# TensorDataset / DataLoader are not imported by any other visible cell, so
# bring them into scope here to keep the cell runnable on a fresh kernel.
from torch.utils.data import DataLoader, TensorDataset

with open("../preprocess/lrec_split.json", encoding="utf-8") as f:
    lrec_data = json.load(f)

lrec_tokenized_text = tokenize_all_text(embed_lookup, lrec_data['train'])
lrec_test_tokenized_text = tokenize_all_text(embed_lookup, lrec_data['test'])

# every review is padded / truncated to this many tokens
seq_length = 15

# --- training features ---
lrec_train_features = pad_features(lrec_tokenized_text, seq_length=seq_length)

## test statements - do not change - ##
assert len(lrec_train_features)==len(lrec_tokenized_text), "Features should have as many rows as reviews."
assert len(lrec_train_features[0])==seq_length, "Each feature row should contain seq_length values."

# print the first 8 columns of the first 20 rows
print(lrec_train_features[:20,:8])

# --- test features ---
lrec_test_features = pad_features(lrec_test_tokenized_text, seq_length=seq_length)

## test statements - do not change - ##
assert len(lrec_test_features)==len(lrec_test_tokenized_text), "Features should have as many rows as reviews."
assert len(lrec_test_features[0])==seq_length, "Each feature row should contain seq_length values."

# print the first 8 columns of the first 20 rows
print(lrec_test_features[:20,:8])


# labels, in the same iteration order as the tokenized text above
lrec_train_labels = np.array([item['label'] for item in lrec_data['train'].values()])
lrec_test_labels = np.array([item['label'] for item in lrec_data['test'].values()])

print(lrec_test_labels[:20])

# fraction of the 'train' split kept for training; the rest is validation
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(lrec_train_features)*split_frac)
lrec_train_x, lrec_valid_x = lrec_train_features[:split_idx], lrec_train_features[split_idx:]
lrec_train_y, lrec_valid_y = lrec_train_labels[:split_idx], lrec_train_labels[split_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(lrec_train_x.shape), 
      "\nValidation set: \t{}".format(lrec_valid_x.shape),
      "\nTest set: \t\t{}".format(lrec_test_features.shape))

# create Tensor datasets
lrec_train_data = TensorDataset(torch.from_numpy(lrec_train_x), torch.from_numpy(lrec_train_y))
lrec_valid_data = TensorDataset(torch.from_numpy(lrec_valid_x), torch.from_numpy(lrec_valid_y))
lrec_test_data = TensorDataset(torch.from_numpy(lrec_test_features), torch.from_numpy(lrec_test_labels))

# dataloaders
batch_size = 4

# Only the training data is shuffled; validation and test batches keep their
# original order so evaluation runs are reproducible and predictions stay
# aligned with the source examples.
lrec_train_loader = DataLoader(lrec_train_data, shuffle=True, batch_size=batch_size)
lrec_valid_loader = DataLoader(lrec_valid_data, shuffle=False, batch_size=batch_size)
lrec_test_loader = DataLoader(lrec_test_data, shuffle=False, batch_size=batch_size)