# Group Exercise: Implementing Simple BOW, CNN, and RNN Models

After completing our individual notebooks, the team members of Reign are sitting down together to teach each other the implementation skills that we have picked up for Bag of Words, Convolutional Neural Nets, and Recurrent Neural Nets. Since we are teaching each other, we have decided to keep the implementation dataset small and concise for easier understanding of the model manipulations. https://github.com/graykode/nlp-tutorial was a good tutorial to follow as a backbone for teaching each other these simple models. 

### CNN

In [1]:
#CNN
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class TextCNN(nn.Module):
    """Convolutional sentence classifier over word embeddings.

    Pipeline: embedding lookup -> one ``Conv2d`` per entry of ``filter_sizes``
    (each kernel spans the full embedding width) -> ReLU -> max pooling over
    all window positions -> concatenation -> linear layer plus a bias vector.

    The hyper-parameters ``num_filters``, ``filter_sizes``, ``vocab_size``,
    ``embedding_size`` and ``num_classes`` are read from module-level globals,
    so they must be defined before the class is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.num_filters_total = num_filters * len(filter_sizes)
        self.W = nn.Embedding(vocab_size, embedding_size)
        self.Weight = nn.Linear(self.num_filters_total, num_classes, bias=False)
        self.Bias = nn.Parameter(torch.ones([num_classes]))
        self.filter_list = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (size, embedding_size)) for size in filter_sizes]
        )

    def forward(self, X):
        """Return unnormalised class scores for a batch of token-index rows.

        Args:
            X: integer tensor of word indices, [batch_size, sequence_length].
                Each row must be at least as long as the largest filter size.

        Returns:
            Tensor of logits, [batch_size, num_classes].
        """
        embedded_chars = self.W(X)  # [batch_size, sequence_length, embedding_size]
        embedded_chars = embedded_chars.unsqueeze(1)  # add channel(=1): [batch, 1, sequence_length, embedding_size]

        pooled_outputs = []
        for conv in self.filter_list:
            # feature_maps: [batch, num_filters, sequence_length - filter_size + 1, 1]
            feature_maps = F.relu(conv(embedded_chars))
            # Pool over every window position. The height is taken from the
            # feature map itself rather than from a global sequence length,
            # so inputs of any sufficient length are handled.
            pooled = F.max_pool2d(feature_maps, (feature_maps.size(2), 1))  # [batch, num_filters, 1, 1]
            pooled_outputs.append(pooled.flatten(1))  # [batch, num_filters]

        # Concatenate along the feature axis. The axis must not depend on how
        # many filter sizes are configured.
        h_pool_flat = torch.cat(pooled_outputs, dim=1)  # [batch, num_filters * len(filter_sizes)]
        return self.Weight(h_pool_flat) + self.Bias  # [batch_size, num_classes]

if __name__ == '__main__':
    import math as m

    # Hyper-parameters. TextCNN reads these as module-level globals.
    embedding_size = 2 # embedding size
    sequence_length = 3 # sequence length
    num_classes = 2 # number of classes
    filter_sizes = [2, 2, 2] # n-gram windows
    num_filters = 3 # number of filters

    # 3 words sentences (=sequence_length is 3)
    sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
    labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.

    # Sort the vocabulary so every run assigns the same index to each word
    # (iteration order of a set of strings is not stable between sessions).
    word_list = sorted(set(" ".join(sentences).split()))
    word_dict = {w: i for i, w in enumerate(word_list)}
    vocab_size = len(word_dict)

    model = TextCNN()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Build the tensors from plain nested lists. Wrapping each row in a numpy
    # array first makes PyTorch emit its "extremely slow" UserWarning.
    inputs = torch.LongTensor([[word_dict[n] for n in sen.split()] for sen in sentences])
    targets = torch.LongTensor(labels) # class indices, as CrossEntropyLoss expects

    # Training
    for epoch in range(5000):
        optimizer.zero_grad()
        output = model(inputs)

        # output : [batch_size, num_classes], targets : [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, targets)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss.item()))

        loss.backward()
        optimizer.step()

    # Test
    test_text = 'sorry hate you'
    test_batch = torch.LongTensor([[word_dict[n] for n in test_text.split()]])

    # Predict: run the model once and reuse the logits for both the label
    # and the confidence figure.
    with torch.no_grad():
        logits = model(test_batch)
    predict = logits.max(1, keepdim=True)[1]
    print(predict)
    if predict[0][0] == 0:
        print(test_text,"is Bad Mean...")
    else:
        print(test_text,"is Good Mean!!")

    # NOTE(review): this figure is the class-0 logit divided by the L2 norm of
    # both logits. It is not a softmax probability (it can be negative), and it
    # always refers to class 0 regardless of the predicted class.
    bad_logit = logits[0][0].item()
    good_logit = logits[0][1].item()
    print('Confidence: ', bad_logit / m.sqrt((bad_logit ** 2) + (good_logit ** 2)))
    
    
    

  inputs = torch.LongTensor([np.asarray([word_dict[n] for n in sen.split()]) for sen in sentences])


Epoch: 1000 loss = 0.002421
Epoch: 2000 loss = 0.000486
Epoch: 3000 loss = 0.000175
Epoch: 4000 loss = 0.000077
Epoch: 5000 loss = 0.000038
tensor([[0]])
sorry hate you is Bad Mean...
Confidence:  0.8778114478071617


The program with CNN reports a confidence score of nearly 88% (0.878 in the output above) that it has correctly labeled the statement "sorry hate you" as a negative statement. This results in the correct conclusion! 

## BOW

In [2]:
#BOW
# Training set: each entry pairs a whitespace-tokenised sentence with its label.
data = [
    (text.split(), label)
    for text, label in (
        ("i love you", "Good"),
        ("she likes baseball", "Good"),
        ("he loves me", "Good"),
        ("sorry for that", "Bad"),
        ("this is awful", "Bad"),
        ("i hate you", "Bad"),
    )
]

# Held-out sentences, in the same (tokens, label) layout as the training set.
test_data = [
    (text.split(), label)
    for text, label in (
        ("he likes soccer", "Good"),
        ("sorry hate you", "Bad"),
    )
]

# Vocabulary: every distinct word of the training AND test sentences gets the
# next free integer, in order of first appearance.
word_to_ix = {}
for tokens, _label in data + test_data:
    for token in tokens:
        # setdefault leaves already-known words untouched
        word_to_ix.setdefault(token, len(word_to_ix))
print(word_to_ix)

NUM_LABELS = 2  # Good or Bad
VOCAB_SIZE = len(word_to_ix)

##########First Define Model###############

class BoWClassifier(nn.Module):
    """Bag-of-words classifier: one affine layer followed by log-softmax."""

    def __init__(self, num_labels, vocab_size):
        # nn.Module's own initialiser must run before sub-modules are attached.
        super().__init__()

        # Affine map from a vocabulary-sized count vector to one score per label.
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels for each row of ``bow_vec``."""
        # dim=1 normalises across the label scores of each row.
        return F.log_softmax(self.linear(bow_vec), dim=1)


def make_bow_vector(sentence, word_to_ix):
    """Turn a tokenised sentence into a row vector of word counts.

    Args:
        sentence: iterable of words.
        word_to_ix: mapping from word to its column index.

    Returns:
        Tensor of shape (1, len(word_to_ix)) whose entry for each word is the
        number of times that word occurs in ``sentence``.

    Raises:
        KeyError: if a word is missing from ``word_to_ix``.
    """
    counts = torch.zeros(1, len(word_to_ix))
    for token in sentence:
        counts[0, word_to_ix[token]] += 1
    return counts


def make_target(label, label_to_ix):
    """Return the class index of ``label`` as a one-element LongTensor.

    Raises:
        KeyError: if ``label`` is not a key of ``label_to_ix``.
    """
    class_index = label_to_ix[label]
    return torch.LongTensor([class_index])


model = BoWClassifier(NUM_LABELS, VOCAB_SIZE) #define model

# An nn.Module keeps track of the parameters of every sub-module assigned to
# it in __init__ (here: self.linear = nn.Linear(...)), so model.parameters()
# yields the linear layer's weight matrix A first and its bias b second.
for param in model.parameters():
    print(param)

##########Now Training the data############
print("TRAINING...")
label_to_ix = {"Bad": 0, "Good": 1} #define categories
# Run on test data before we train, just to see a before-and-after
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)
# Print the weight-matrix column corresponding to "hate" (before training)
print(next(model.parameters())[:, word_to_ix["hate"]])


loss_function = nn.NLLLoss() #define loss function
optimizer = optim.SGD(model.parameters(), lr=0.1) #define optimizer

#Pass over training set each epoch, updating after every single instance
for epoch in range(100):
    for instance, label in data:
        # Gradients accumulate across backward() calls, so clear them
        # before processing each instance.
        model.zero_grad()

        #make vocab vector and define targets
        bow_vec = make_bow_vector(instance, word_to_ix)
        target = make_target(label, label_to_ix)

        log_probs = model(bow_vec) #run through model

        loss = loss_function(log_probs, target) #compute loss
        loss.backward() #compute gradients
        optimizer.step() #update parameters from new gradients

# `loss` here is the loss of the last training instance of the last epoch only.
print("Final Loss: ", loss)
#################Finally Test Data#######################
print("TESTING...")
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

print("Classifying: hate you sorry")
# Look the weight matrix up once; each column holds one word's weight per label.
weights = next(model.parameters())
word1 = weights[:, word_to_ix["sorry"]]
word2 = weights[:, word_to_ix["hate"]]
word3 = weights[:, word_to_ix["you"]]
# NOTE(review): this figure is the mean of the three words' learned weights for
# label index 0 ("Bad"). It ignores the bias and is not a probability; the
# log-probabilities printed in the TESTING section are the model's real output.
print('Confidence: ', (word1[0].item()+word2[0].item()+word3[0].item())/3)

{'i': 0, 'love': 1, 'you': 2, 'she': 3, 'likes': 4, 'baseball': 5, 'he': 6, 'loves': 7, 'me': 8, 'sorry': 9, 'for': 10, 'that': 11, 'this': 12, 'is': 13, 'awful': 14, 'hate': 15, 'soccer': 16}
Parameter containing:
tensor([[-0.0288,  0.1272, -0.1876, -0.2333,  0.0304,  0.0265,  0.1044, -0.2144,
         -0.1282, -0.1028,  0.1903,  0.1953,  0.2172, -0.1737, -0.0310, -0.2144,
          0.1312],
        [-0.2393,  0.1496,  0.2219, -0.0713, -0.0452,  0.1456, -0.0363, -0.0342,
          0.2089,  0.1294, -0.0661, -0.1787,  0.1558,  0.1872, -0.0813,  0.1612,
         -0.1555]], requires_grad=True)
Parameter containing:
tensor([0.1117, 0.1758], requires_grad=True)
TRAINING...
tensor([[-0.4975, -0.9366]])
tensor([[-1.3733, -0.2920]])
tensor([-0.2144,  0.1612], grad_fn=<SelectBackward0>)
Final Loss:  tensor(0.0534, grad_fn=<NllLossBackward0>)
TESTING...
tensor([[-2.1001, -0.1306]])
tensor([[-0.0275, -3.6079]])
Classifying: hate you sorry
Confidence:  0.5812122573455175


The confidence of Bag of Words correctly identifying the statement as negative is lower, at a value of about 58% (0.581 in the output above). From the tutorials Simon followed, this is to be expected because BOW is commonly used as a preface or placeholder model when quickly putting together a NN. CNN and likely RNN below are more advanced and respected models. 
## RNN

In [3]:
#Imports:
#General Imports
import os
import numpy as np 
import pandas as pd 
import time
import random
import spacy

#visualization imports
import seaborn as sns
import matplotlib.pyplot as plt

#All Imports for Pytorch
import torch
import torch.nn.functional as F
import torchtext
torch.backends.cudnn.deterministic = True

#All Imports for Tensorflow
#import tensorflow as tf
#import tensorflow_hub as hub
#import tensorflow_text as text
#from official.nlp import optimization  # to create AdamW optimizer
#tf.get_logger().setLevel('ERROR')

#Dependencies, Downloads, and other necessary installations
#!conda install spacy
#!pip install wordcloud
class RNN(torch.nn.Module):
    """Sequence classifier: embedding -> LSTM -> linear layer on the final hidden state."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(num_embeddings=input_dim,
                                            embedding_dim=embedding_dim)
        # An LSTM is used here rather than a plain torch.nn.RNN with a ReLU
        # nonlinearity.
        self.rnn = torch.nn.LSTM(input_size=embedding_dim,
                                 hidden_size=hidden_dim)
        self.fc = torch.nn.Linear(in_features=hidden_dim,
                                  out_features=output_dim)

    def forward(self, text):
        """Map token indices [sentence length, batch size] to scores [batch size, output dim]."""
        # [sentence length, batch size] -> [sentence length, batch size, embedding dim]
        embedded = self.embedding(text)

        # Only the final hidden state is needed; the per-step outputs and the
        # cell state are discarded. final_hidden: [1, batch size, hidden dim]
        _, (final_hidden, _) = self.rnn(embedded)

        # Drop the leading axis -> [batch size, hidden dim], then classify.
        return self.fc(final_hidden.squeeze(0))

def predict_sentiment(model, sentence):
    """Return the model's softmax probability for class index 0 on ``sentence``.

    The sentence is tokenised with the module-level ``nlp`` tokenizer, mapped
    to indices through ``TEXT.vocab.stoi`` and fed to ``model`` as a batch of
    one on ``DEVICE``. The model is switched to eval mode and left there.

    Args:
        model: network mapping a [sentence length, 1] index tensor to
            [1, num classes] scores.
        sentence: raw text to classify.

    Returns:
        Python float: probability assigned to class index 0.
    """
    model.eval()
    token_ids = [TEXT.vocab.stoi[tok.text] for tok in nlp.tokenizer(sentence)]
    # [sentence length] -> [sentence length, 1]: a batch holding one sentence
    batch = torch.LongTensor(token_ids).to(DEVICE).unsqueeze(1)
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        probabilities = torch.nn.functional.softmax(model(batch), dim=1)
    return probabilities[0][0].item()

def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader`` in percent.

    Args:
        model: callable mapping a features batch to scores of shape
            [batch size, num classes].
        data_loader: iterable yielding ``(features, targets)`` pairs, where
            ``targets`` holds one class index per example.
        device: device the batches are moved to before the forward pass.

    Returns:
        0-dim float tensor on ``device`` with the percentage of examples whose
        highest-scoring class equals the target. An empty ``data_loader``
        yields 0.0 instead of failing.
    """
    # Keep the running count as a tensor so the return type is a tensor even
    # when the loader yields no batches.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)

            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()

    if num_examples == 0:
        # No examples seen: avoid dividing by zero.
        return torch.tensor(0.0, device=device)
    return correct_pred.float() / num_examples * 100




In [4]:
!wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
!gunzip -f movie_data.csv.gz 




--2022-01-26 11:42:59--  https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2022-01-26 11:43:00--  https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘movie_data.csv.gz’


2022-01-26 11:43:02 (13.7 MB/s) - ‘movie_data.csv.gz’ saved [2652

In [5]:
# Reproducibility
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

# Raw reviews as a DataFrame
df = pd.read_csv('movie_data.csv')

# Optimisation settings
NUM_EPOCHS = 8
BATCH_SIZE = 128
LEARNING_RATE = 0.005

# Model size. Embedding/hidden sizes of 128/256 made the model far too large
# to train feasibly, so much smaller values are used.
VOCABULARY_SIZE = 20000
EMBEDDING_DIM = 16
HIDDEN_DIM = 24
NUM_CLASSES = 2

# NOTE(review): 'cuda:1' addresses a second GPU; confirm this is intended on
# machines where CUDA is available but only one device is present.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:1')
else:
    DEVICE = torch.device('cpu')

### Defining the feature processing

TEXT = torchtext.legacy.data.Field(
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='en_core_web_sm'
)

### Defining the label processing

LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

# presumably the CSV's columns are the review text followed by its label, in
# this order; verify against movie_data.csv
fields = [('TEXT_COLUMN_NAME', TEXT), ('LABEL_COLUMN_NAME', LABEL)]

dataset = torchtext.legacy.data.TabularDataset(
    path='movie_data.csv', format='csv',
    skip_header=True, fields=fields)

### Train / test / validation split

torch.manual_seed(RANDOM_SEED)
# random.seed() returns None, so it cannot be passed as random_state directly.
# Seed the global generator first, then hand its state over explicitly.
random.seed(RANDOM_SEED)
train_data, test_data = dataset.split(
    split_ratio=[0.8, 0.2],
    random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Test: {len(test_data)}')

random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(
    split_ratio=[0.85, 0.15],
    random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Validation: {len(valid_data)}')
print(vars(train_data.examples[0]))

### Vocabularies, built from the training portion only

TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10]) # itos = integer-to-string
print(LABEL.vocab.stoi)

### Batch iterators

train_loader, valid_loader, test_loader = \
    torchtext.legacy.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=BATCH_SIZE,
         sort_within_batch=False,
         sort_key=lambda x: len(x.TEXT_COLUMN_NAME),
         device=DEVICE
    )

# No training happens in this cell: the architecture is re-created and
# previously saved weights are loaded into it.
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=len(TEXT.vocab),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES # could use 1 for binary classification
)

model = model.to(DEVICE)
# Use the shared LEARNING_RATE constant rather than repeating its value.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# map_location remaps the stored tensors onto the device in use, so weights
# saved on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load("./bestRNNweights", map_location=DEVICE))

import spacy


# Blank English pipeline; predict_sentiment only uses its tokenizer.
nlp = spacy.blank("en")


Num Train: 40000
Num Test: 10000
Num Train: 34000
Num Validation: 6000
{'TEXT_COLUMN_NAME': ['For', 'some', 'unknown', 'reason', ',', '7', 'years', 'ago', ',', 'I', 'watched', 'this', 'movie', 'with', 'my', 'mother', 'and', 'sister', '.', 'I', 'do', "n't", 'think', 'I', "'ve", 'ever', 'laughed', 'as', 'hard', 'with', 'them', 'before', '.', 'This', 'movie', 'was', 'sooooo', 'bad', '.', 'How', 'sequels', 'were', 'produced', 'is', 'beyond', 'me', '.', 'Its', 'been', 'awhile', 'since', 'I', 'last', 'saw', 'this', '"', 'movie', '"', ',', 'but', 'the', 'one', 'impression', 'that', 'it', 'has', 'stuck', 'with', 'me', 'over', 'the', 'years', 'has', 'been', ',', '"', 'They', 'must', 'have', 'found', 'the', 'script', 'in', 'a', 'dumpster', 'in', 'the', 'backlot', 'of', 'a', 'cheap', 'movie', 'studio', ',', 'made', 'into', 'a', '"', 'movie', '"', ',', 'and', 'decided', 'that', 'it', 'did', "n't", 'suck', 'enough', ',', 'and', 'made', 'it', 'worse', '.', 'I', "'m", 'pretty', 'sure', 'that', 'they'

## Ultimate RNN Transfer Learning Result

In [9]:
# predict_sentiment returns the probability of class index 0, so its
# complement is reported as the confidence in a negative review.
# NOTE(review): which label index 0 stands for depends on LABEL.vocab.stoi; confirm.
# NOTE(review): this input contains the extra words "is Bad Mean...", unlike
# the 'sorry hate you' text given to the CNN and BOW models; confirm intended.
review_text = "sorry hate you is Bad Mean..."
val = 1 - predict_sentiment(model, review_text)
print(f'The RNN transfer learning model is:{val} confident that the review above would be negative!')

The RNN transfer learning model is:0.9627192541956902 confident that the review above would be negative!


# Conclusion
The final order of confidence in the final classification was Transfer Learning RNN > CNN > BOW. We recognize that the CNN and BOW models here were trained using FAR less data than the transfer learning RNN. Thus our group hopes to do more analysis in a future week to make a fairer comparison. From the above, we did confirm that CNN did better on a very small dataset than BOW!