# Tweet sentiment with Deep Neural Network   
 
## Outline
- [Import libraries](#1)
- [Importing the data](#2)
- [Defining classes](#3)
- [Training](#4)
- [Evaluation  ](#5)
- [Testing with your own input](#6)


In [None]:
import os 
import random as rnd

import trax
from trax.fastmath import numpy as np #fastnp
trax.fastmath.use_backend('jax')

from trax import layers as tl
from trax.supervised import training

# import Layer from the utils.py file
#from utils import Layer, load_tweets, process_tweet
#from utils import 


<a name="2"></a>
# Importing the data

In [None]:
import string
import re
import os
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples 

In [None]:
def load_tweets():
    '''
    Read the labelled tweets from the NLTK twitter_samples corpus.

    Output:
        A (positive, negative) pair: the strings read from
        'positive_tweets.json' followed by the strings read from
        'negative_tweets.json'
    '''
    positive, negative = (
        twitter_samples.strings(file_id)
        for file_id in ('positive_tweets.json', 'negative_tweets.json')
    )
    return positive, negative

In [None]:
all_positive_tweets, all_negative_tweets = load_tweets()

print(f"The number of positive tweets: {len(all_positive_tweets)}")
print(f"The number of negative tweets: {len(all_negative_tweets)}")

# Split: the first 4000 tweets of each class are used for training,
# everything after them is held out for validation.
train_pos, val_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, val_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positives are placed before negatives; the label arrays below rely on that order.
train_x = train_pos + train_neg
val_x = val_pos + val_neg

# Set labels: 1 for every positive tweet, 0 for every negative tweet
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
val_y = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

In [None]:
stopwords_english = stopwords.words('english')
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Set copy of the stop words: membership is tested once per token, and a set
# lookup avoids scanning the whole list every time.
_STOPWORDS_SET = frozenset(stopwords_english)

# Built once and reused instead of being re-created for every tweet.
_TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)


def process_tweet(tweet):
    '''
    Clean a tweet, split it into tokens, drop stop words and punctuation,
    and stem what is left.

    Input: 
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    
    '''
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # the pattern stops at the first whitespace so that only the link itself
    # is removed and any words that follow it on the same line are kept
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tweet_tokens = _TWEET_TOKENIZER.tokenize(tweet)
    # NOTE: string.punctuation is a str, so the second condition is a
    # substring test against the punctuation characters (kept as it was).
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in _STOPWORDS_SET and word not in string.punctuation
    ]
    return tweets_clean

In [None]:
# Try out function that processes tweets
print("original tweet at training position 0")
print(train_pos[0])

print("Tweet at training position 0 after processing:")
# Bare expression (not printed explicitly): the result is only shown when the
# environment echoes the last expression of a cell, as a notebook does.
process_tweet(train_pos[0])

<a name="2.2"></a>
## Building the vocabulary

In [None]:
# The three special tokens take the first three IDs.
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}

# Note that we build vocab using training data only
for tweet in train_x:
    for word in process_tweet(tweet):
        # setdefault keeps the ID of a word that is already known and gives
        # a new word the next free ID (the current size of the dictionary)
        Vocab.setdefault(word, len(Vocab))

print("Total words in vocab are",len(Vocab))
display(Vocab)

<a name="2.3"></a>
## Converting a tweet to a tensor

##### Example
Input a tweet:
```CPP
'@happypuppy, is Maria happy?'
```

The `tweet_to_tensor` function will first convert the tweet into a list of tokens (including only relevant words)
```CPP
['maria', 'happi']
```

Then it will convert each word into its unique integer

```CPP
[2, 56]
```
- Notice that the word "maria" is not in the vocabulary, so it is assigned the unique integer associated with the `__UNK__` token, because it is considered "unknown."



In [None]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    '''
    Convert a tweet into the list of vocabulary IDs of its processed words.

    Input: 
        tweet - A string containing a tweet
        vocab_dict - The words dictionary (word -> integer ID)
        unk_token - The special string for unknown tokens; it must be a key
                    of vocab_dict
        verbose - Print info during runtime
    Output:
        tensor_l - A python list with one integer ID per processed word;
                   words missing from vocab_dict get the ID of unk_token
        
    '''
    # Only the words kept by process_tweet are converted
    words = process_tweet(tweet)
    if verbose:
        print("List of words from the processed tweet:")
        print(words)

    # ID used for every word that is not in the dictionary
    unk_ID = vocab_dict[unk_token]
    if verbose:
        print(f"The unique integer ID for the unk_token is {unk_ID}")

    return [vocab_dict.get(word, unk_ID) for word in words]

In [None]:
# Show a validation tweet next to its integer encoding. Words that were not
# seen while building Vocab are encoded with the ID of the '__UNK__' token.
print("Actual tweet is\n", val_pos[0])
print("\nTensor of tweet:\n", tweet_to_tensor(val_pos[0], vocab_dict=Vocab))

<a name="2.4"></a>
## Creating a batch generator

- This generator returns the data in a format (tensors) that you could directly use in your model.
- It returns a triple: the inputs, targets, and loss weights:
  - Inputs is a tensor that contains the batch of tweets we put into the model.

In [None]:
def _take_examples(data, index_lines, index, n_to_take, loop, shuffle, vocab_dict):
    # Convert the next n_to_take tweets of one class into lists of word IDs.
    # Returns (tensors, new_index), or (None, index) when the data ran out
    # and loop is False.
    tensors = []
    for _ in range(n_to_take):
        if index >= len(data):
            if not loop:
                return None, index
            # start over from the beginning, in a fresh order if requested
            index = 0
            if shuffle:
                rnd.shuffle(index_lines)
        tweet = data[index_lines[index]]
        tensors.append(tweet_to_tensor(tweet, vocab_dict))
        index += 1
    return tensors, index


def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    '''
    Yield balanced, zero-padded batches of tweets converted to word IDs.

    Every batch holds batch_size // 2 positive examples followed by
    batch_size // 2 negative examples.

    Input: 
        data_pos - Set of positive examples
        data_neg - Set of negative examples
        batch_size - number of samples per batch. Must be even
        loop - True to start over when a set is exhausted, False to stop
        vocab_dict - The words dictionary
        shuffle - Shuffle the data order (shuffled again on every restart)
    Yield:
        inputs - Array with one row per tweet; rows shorter than the longest
                 tweet of the batch are right-padded with 0
        targets - 1 for each positive example, 0 for each negative example
        example_weights - An array of ones with the same shape as targets

    With loop=False the generator ends as soon as either set cannot fill its
    half of a batch; that incomplete batch is not yielded.
        
    '''     
    # make sure the batch size is an even number
    # to allow an equal number of positive and negative samples
    assert batch_size % 2 == 0, "batch_size must be even"

    # Number of positive examples in each batch is half of the batch size
    # same with number of negative examples in each batch
    n_to_take = batch_size // 2

    # Use pos_index to walk through the data_pos array
    # same with neg_index and data_neg
    pos_index = 0
    neg_index = 0

    # The examples are visited through these index lists so that shuffling
    # never modifies the caller's data
    pos_index_lines = list(range(len(data_pos)))
    neg_index_lines = list(range(len(data_neg)))

    if shuffle:
        rnd.shuffle(pos_index_lines)
        rnd.shuffle(neg_index_lines)

    while True:
        # Positive
        pos_tensors, pos_index = _take_examples(
            data_pos, pos_index_lines, pos_index, n_to_take, loop, shuffle, vocab_dict)
        if pos_tensors is None:
            break

        # Negative
        neg_tensors, neg_index = _take_examples(
            data_neg, neg_index_lines, neg_index, n_to_take, loop, shuffle, vocab_dict)
        if neg_tensors is None:
            break

        # pos_index and neg_index were already advanced once per example
        # while the batch was collected. They must not be moved forward by
        # n_to_take again here, otherwise every other half-batch of the data
        # would be skipped.
        batch = pos_tensors + neg_tensors

        # pad all shorter tweets to have this length
        max_len = max([len(t) for t in batch])
        tensor_pad_l = [tensor + [0] * (max_len - len(tensor)) for tensor in batch]

        inputs = np.array(tensor_pad_l)
        targets = np.array([1] * n_to_take + [0] * n_to_take)

        # every example counts equally
        example_weights = np.ones_like(targets)

        yield inputs, targets, example_weights

In [None]:
rnd.seed(30)


def train_generator(batch_size, shuffle = False):
    '''Endless batch generator over the training tweets.'''
    return data_generator(
        train_pos, train_neg, batch_size,
        loop=True, vocab_dict=Vocab, shuffle=shuffle)


def val_generator(batch_size, shuffle = False):
    '''Endless batch generator over the validation tweets.'''
    return data_generator(
        val_pos, val_neg, batch_size,
        loop=True, vocab_dict=Vocab, shuffle=shuffle)


def test_generator(batch_size, shuffle = False):
    '''Batch generator over the validation tweets that does not loop.'''
    return data_generator(
        val_pos, val_neg, batch_size,
        loop=False, vocab_dict=Vocab, shuffle=shuffle)


# inspect one shuffled training batch of four tweets
inputs, targets, example_weights = next(train_generator(4, shuffle=True))

print(f'Inputs: {inputs}')
print(f'Targets: {targets}')
print(f'Example Weights: {example_weights}')

In [None]:
# Test the train_generator: pull one unshuffled batch and look at its pieces

tmp_data_gen = train_generator(batch_size = 4)
tmp_inputs, tmp_targets, tmp_example_weights = next(tmp_data_gen)

print(f"The inputs shape is {tmp_inputs.shape}")
print(f"The targets shape is {tmp_targets.shape}")
print(f"The example weights shape is {tmp_example_weights.shape}")

# one line per example: padded IDs, label and weight side by side
for row, label, weight in zip(tmp_inputs, tmp_targets, tmp_example_weights):
    print(f"input tensor: {row}; target {label}; example weights {weight}")

## Trax test

In [None]:
# Lazily pair every training tweet with its label. zip returns a one-shot
# iterator, so each pair can be consumed only once.
train_tuples = zip(train_x, train_y)

In [None]:
# Look at the first three (tweet, label) pairs. This consumes them: whatever
# reads train_tuples afterwards starts at the fourth pair.
print(next(train_tuples))
print(next(train_tuples))
print(next(train_tuples))

In [None]:
# Pipeline with a single active stage: element 0 of every incoming tuple (the
# tweet text) is converted to a list of vocabulary IDs. The label in element 1
# is not carried through.
tensor_pipeline = trax.data.Serial(
    #lambda g: filter(lambda x: x[1] < 1.1, g),
    lambda g: map(lambda tweet: tweet_to_tensor(tweet[0], vocab_dict=Vocab), g),
    #lambda g: map(lambda tensor : tensor + [0] * , g),
)

In [None]:
# Apply the pipeline to the (tweet, label) iterator.
train_stream = tensor_pipeline(train_tuples)

In [None]:
# Pull two encoded tweets out of the stream to check the pipeline output.
print(next(train_stream))
print(next(train_stream))

<a name="3.3"></a>
## Model


In [None]:
def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='train'):
    '''
    Build the sentiment model as a serial stack of four layers:
    embedding -> mean over axis 1 -> dense -> log-softmax.

    Input:
        vocab_size - size of the embedding table; the default is the size of
                     Vocab at the moment this function is defined
        embedding_dim - number of features of each embedding vector
        output_dim - number of units of the dense output layer
        mode - accepted but not used by this function
    Output:
        model - a tl.Serial combinator holding the four layers
    '''
    # trax.layers.combinators.Serial
    return tl.Serial(
        tl.Embedding(vocab_size=vocab_size, d_feature=embedding_dim),
        tl.Mean(axis=1),
        tl.Dense(n_units=output_dim),
        tl.LogSoftmax(),
    )

In [None]:
# Build a throwaway model with the default sizes, only for inspection.
tmp_model = classifier()

In [None]:
# Show the combinator type and the layers the model is made of.
print(type(tmp_model))
display(tmp_model)

<a name="4.1"></a>
## Training the model


In [None]:
from trax.supervised import training

batch_size = 16
# Seed the random module before the shuffling generators below are consumed,
# so the order of the batches is reproducible.
rnd.seed(271)

# Training task: shuffled training batches, cross-entropy loss, Adam optimizer.
train_task = training.TrainTask(
    labeled_data=train_generator(batch_size=batch_size, shuffle=True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10,
)

# Evaluation task: shuffled validation batches, reporting loss and accuracy.
eval_task = training.EvalTask(
    labeled_data=val_generator(batch_size=batch_size, shuffle=True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
    #n_eval_batches=20  # For less variance in eval numbers.
)

# Fresh, untrained model that the training loop will update.
model = classifier()

In [None]:
import shutil

output_dir = '~/model/'
output_dir_expand = os.path.expanduser(output_dir)
# Remove the output of any previous run so training starts from scratch.
# shutil.rmtree replaces the notebook-only `!rm -rf` shell escape, which is
# not valid Python; ignore_errors=True keeps a missing directory from
# raising, like `rm -rf`.
shutil.rmtree(output_dir_expand, ignore_errors=True)
print(output_dir_expand)

In [None]:
def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    '''
    Create a training loop for the model and run it.

    Input: 
        classifier - the model to train
        train_task - Training task
        eval_task - Evaluation task
        n_steps - number of steps the loop is run for
        output_dir - folder to save your files
    Output:
        loop - the training.Loop instance after it has been run
    '''
    loop = training.Loop(
        classifier,
        train_task,
        eval_tasks=[eval_task],
        output_dir=output_dir,
    )
    loop.run(n_steps=n_steps)
    return loop

In [None]:
# Train for 100 steps, writing to the expanded output directory.
training_loop = train_model(model, train_task, eval_task, 100, output_dir_expand)

##### Expected output (Approximately)
```CPP
Step      1: train CrossEntropyLoss |  0.88939196
Step      1: eval  CrossEntropyLoss |  0.68833977
Step      1: eval          Accuracy |  0.50000000
Step     10: train CrossEntropyLoss |  0.61036736
Step     10: eval  CrossEntropyLoss |  0.52182281
Step     10: eval          Accuracy |  0.68750000
Step     20: train CrossEntropyLoss |  0.34137666
Step     20: eval  CrossEntropyLoss |  0.20654774
Step     20: eval          Accuracy |  1.00000000
Step     30: train CrossEntropyLoss |  0.20208922
Step     30: eval  CrossEntropyLoss |  0.21594886
Step     30: eval          Accuracy |  0.93750000
Step     40: train CrossEntropyLoss |  0.19611198
Step     40: eval  CrossEntropyLoss |  0.17582777
Step     40: eval          Accuracy |  1.00000000
Step     50: train CrossEntropyLoss |  0.11203773
Step     50: eval  CrossEntropyLoss |  0.07589275
Step     50: eval          Accuracy |  1.00000000
Step     60: train CrossEntropyLoss |  0.09375446
Step     60: eval  CrossEntropyLoss |  0.09290724
Step     60: eval          Accuracy |  1.00000000
Step     70: train CrossEntropyLoss |  0.08785903
Step     70: eval  CrossEntropyLoss |  0.09610598
Step     70: eval          Accuracy |  1.00000000
Step     80: train CrossEntropyLoss |  0.08858261
Step     80: eval  CrossEntropyLoss |  0.02319432
Step     80: eval          Accuracy |  1.00000000
Step     90: train CrossEntropyLoss |  0.05699894
Step     90: eval  CrossEntropyLoss |  0.01778970
Step     90: eval          Accuracy |  1.00000000
Step    100: train CrossEntropyLoss |  0.03663783
Step    100: eval  CrossEntropyLoss |  0.00210550
Step    100: eval          Accuracy |  1.00000000
```

<a name="4.2"></a>
## Practice Making a prediction


In [None]:
# Take one unshuffled training batch of 16 tweets and describe its parts.
tmp_train_generator = train_generator(16)
tmp_batch = next(tmp_train_generator)
tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch

# The generator yields three items per batch, so the message names all three.
print(f"The batch is a tuple of length {len(tmp_batch)}: position 0 contains the tweets, position 1 contains the targets, and position 2 contains the example weights.")
print(f"The shape of the tweet tensors is {tmp_inputs.shape} (num of examples, length of tweet tensors)")
print(f"The shape of the labels is {tmp_targets.shape}, which is the batch size.")
print(f"The shape of the example_weights is {tmp_example_weights.shape}, which is the same as inputs/targets size.")

In [None]:
# Run the trained model on the batch of encoded tweets.
tmp_pred = training_loop.eval_model(tmp_inputs)
print(f"The prediction shape is {tmp_pred.shape}, num of tensor_tweets as rows")
# The last layer of the model is LogSoftmax, so the columns hold log
# probabilities rather than probabilities.
print("Column 0 is the log probability of a negative sentiment (class 0)")
print("Column 1 is the log probability of a positive sentiment (class 1)")
print()
print("View the prediction array")
tmp_pred

In [None]:
# turn log probabilities into category predictions:
# a tweet is predicted positive when column 1 is larger than column 0
tmp_is_positive = tmp_pred[:,1] > tmp_pred[:,0]
for i, p in enumerate(tmp_is_positive):
    print(f"Neg log prob {tmp_pred[i,0]:.4f}\tPos log prob {tmp_pred[i,1]:.4f}\t is positive? {p}\t actual {tmp_targets[i]}")

Note that this prediction was made on a batch of training data.

<a name="5"></a>
# Evaluation  


In [None]:
def compute_accuracy(preds, y, y_weights):
    """
    Weighted accuracy of two-class predictions.

    Input: 
        preds: a tensor of shape (dim_batch, output_dim); an example is
            predicted positive when column 1 is larger than column 0
        y: a tensor of shape (dim_batch,) with the true labels (0 or 1)
        y_weights: a n.ndarray with a weight for each example
    Output: 
        accuracy: weighted_num_correct divided by sum_weights
        weighted_num_correct (np.float32): Sum of the weighted correct predictions
        sum_weights (np.float32): Sum of the weights
    """
    # 1 where the positive column wins, 0 otherwise
    predicted_label = (preds[:, 1] > preds[:, 0]).astype(np.int32)

    # 1.0 for every example whose predicted label equals the true label
    hits = (predicted_label == y).astype(np.float32)

    # the dot product adds up the weights of the correctly predicted examples
    weighted_num_correct = np.sum(np.dot(hits, y_weights))
    sum_weights = np.sum(y_weights)

    return weighted_num_correct / sum_weights, weighted_num_correct, sum_weights

In [None]:
# test your function
tmp_val_generator = val_generator(64)

# get one batch
tmp_batch = next(tmp_val_generator)

# Position 0 has the model inputs (tweets as tensors)
# position 1 has the targets (the actual labels)
# position 2 has the example weights
tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch

# feed the tweet tensors into the model to get a prediction
tmp_pred = training_loop.eval_model(tmp_inputs)

tmp_acc, tmp_num_correct, tmp_num_predictions = compute_accuracy(preds=tmp_pred, y=tmp_targets, y_weights=tmp_example_weights)

# The batch comes from val_generator, so the message says "validation".
print(f"Model's prediction accuracy on a single validation batch is: {100 * tmp_acc}%")
print(f"Weighted number of correct predictions {tmp_num_correct}; weighted number of total observations predicted {tmp_num_predictions}")

<a name="5.2"></a>
## Testing your model on Validation Data


In [None]:
def test_model(generator, model):
    '''
    Compute the weighted accuracy of a model over all batches of a generator.

    Input: 
        generator: an iterator instance that provides batches of inputs,
            targets and example weights (positions 0, 1 and 2 of each batch)
        model: a model instance, called with the inputs of each batch
    Output: 
        accuracy: total weighted number of correct predictions divided by
            the total weight over all batches
    '''
    total_num_correct = 0
    total_num_pred = 0

    for batch in generator:
        inputs, targets, example_weight = batch[0], batch[1], batch[2]
        pred = model(inputs)

        # the per-batch accuracy is not needed, only the two running sums
        _, batch_num_correct, batch_num_pred = compute_accuracy(pred, targets, example_weight)
        total_num_correct += batch_num_correct
        total_num_pred += batch_num_pred

    return total_num_correct / total_num_pred

In [None]:
# Rebind the global `model` to the evaluation model of the training loop;
# predict() below reads this global.
model = training_loop.eval_model
# test_generator does not loop, so the validation data is walked through once.
accuracy = test_model(test_generator(16), model)

print(f'The accuracy of your model on the validation set is {accuracy:.4f}', )

<a name="6"></a>
# Testing with your own input


In [None]:
# this is used to predict on your own sentence
def predict(sentence):
    '''
    Classify a single sentence with the global `model`.

    Input:
        sentence - a string
    Output:
        preds - 1 when the positive output is larger than the negative one,
                otherwise 0
        sentiment - 'positive' when preds is 1, otherwise "negative"
    '''
    token_ids = np.array(tweet_to_tensor(sentence, vocab_dict=Vocab))
    # add a leading batch dimension: the model is fed a batch of one sentence
    batch = token_ids[None, :]
    preds_probs = model(batch)

    preds = int(preds_probs[0, 1] > preds_probs[0, 0])
    sentiment = 'positive' if preds == 1 else "negative"

    return preds, sentiment

In [None]:
# try a positive sentence
sentence = "It's such a nice day, think i'll be taking Sid to Ramsgate fish and chips for lunch at Peter's fish factory and then the beach maybe"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

print()
# try a second sentence; the negative example is kept commented out below and
# the sentence that is actually used pairs a swear word with a positive word
#sentence = "I hated my day, it was the worst, I'm so sad."
sentence = "Thats fucking awesome"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")