In [1]:
import string

from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import re

import nltk

In [2]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("vader_lexicon")
"""
things to download on the fly if not using py ide:
stopwords
punkt
vader_lexicon

"""

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/datalore/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/datalore/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/datalore/nltk_data...


'\nthings to download on the fly if not using py ide:\nstopwords\npunkt\nvader_lexicon\n\n'

In [3]:
# nltk.download()

sentiment_name = "Sentiment"
text_col_name = "Text"
subjectivity_label_name = "subjectivity"
polarity_label_name = "polarity"
token_col_name = "Raw tokens"
tokenized_col_name = "Tokenized"
length_col_name = "Token length"
ref_sentiment_name = "NLTK ref sentiment"

columns_to_read = [text_col_name, subjectivity_label_name, polarity_label_name]

unk_word_name = "unknown word"
unknown_word_id = -1

truncate_length = 50

use_csv_col_as_idx = False
data_path = "biden_tweets_labeled.csv"

In [4]:
# tweets_csv = pd.read_csv(data_path)
# if tweets_csv.columns[0] == "Unnamed: 0":
if use_csv_col_as_idx:
    print(f"first column as index, reading csv")
    tweets_csv = pd.read_csv(data_path, index_col=[0])
else:
    print(f"first column is named, fall back to specify used_cols")
    tweets_csv = pd.read_csv(data_path, usecols=columns_to_read)

first column is named, fall back to specify used_cols


In [5]:
tweets_csv

Unnamed: 0,Text,subjectivity,polarity
0,@RT_com That’s the guy who is funding those fa...,1,1
1,Biden apparently just told JTaps that he's goi...,1,0
2,@Kingofgameplay1 @HeathMayo They've been given...,1,0
3,@conorjrogers @reedgalen They could not raise ...,1,1
4,Can`t Biden just fire the board members on the...,1,0
...,...,...,...
1761,"@KThomasDC @costareports That’s nice, but I ho...",0,2
1762,@livingdead1970 OMG. You are a sensitive soul...,1,1
1763,@bryceagen @itsJeffTiedrich @realDonaldTrump @...,1,1
1764,@Tomboliko @the_resistor @realDonaldTrump Hill...,1,0


In [6]:
len(tweets_csv)

1766

In [7]:
stopwords = nltk.corpus.stopwords.words("english")

overall_tokens = []

"""
punct to replace: 
’ to '
` to '


"""

def remove_at_tags(x: pd.Series):
    x[text_col_name]: str
    words = x[text_col_name].split()
    for idx in range(len(words)):
        words[idx] = words[idx].replace("’", "'")
        words[idx] = words[idx].replace("`", "'")
    
    words = [x if not re.match(r"https?:", x) else "website_name" for x in words]
    words_w_at_tags = [x for x in words if not re.match(r".*@.*", x)]

    result = ''
    for elem in words_w_at_tags:
        result += elem + ' '
    return result


def tweet_en_tokenize(x: pd.Series):
    global overall_tokens
    tokens = word_tokenize(x[text_col_name])
    tokens_w_stops = [x for x in tokens if x not in stopwords]
    overall_tokens += tokens_w_stops
    return tokens_w_stops

label_map_dict = {2:0.5,1:1,0:0}
def apply_self_mapping_of_label(x: pd.Series):
    return label_map_dict[x[polarity_label_name]]

tweets_csv[text_col_name] = tweets_csv.apply(remove_at_tags, axis=1)

tweets_csv[token_col_name] = tweets_csv.apply(tweet_en_tokenize, axis=1)

tweets_csv[polarity_label_name] = tweets_csv.apply(apply_self_mapping_of_label, axis=1)

tweets_csv[length_col_name] = tweets_csv.apply(lambda x: len(x[token_col_name]), axis=1)

tweets_csv = tweets_csv[tweets_csv[length_col_name] <= 50]

tweets_csv

Unnamed: 0,Text,subjectivity,polarity,Raw tokens,Token length
0,That's the guy who is funding those fake stori...,1,1.0,"[That, 's, guy, funding, fake, stories, Hunter...",9
1,Biden apparently just told JTaps that he's goi...,1,0.0,"[Biden, apparently, told, JTaps, 's, going, as...",22
2,They've been given 40 chances. And have blown ...,1,0.0,"[They, 've, given, 40, chances, ., And, blown,...",40
3,They could not raise the money to beat Biden b...,1,1.0,"[They, could, raise, money, beat, Biden, elect...",23
4,Can't Biden just fire the board members on the...,1,0.0,"[Ca, n't, Biden, fire, board, members, postal,...",11
...,...,...,...,...,...
1761,"That's nice, but I hope Biden doesn't think #M...",0,0.5,"[That, 's, nice, ,, I, hope, Biden, n't, think...",12
1762,OMG. You are a sensitive soul. For the record ...,1,1.0,"[OMG, ., You, sensitive, soul, ., For, record,...",38
1763,"No, IQ45 is trying to steal the election from ...",1,1.0,"[No, ,, IQ45, trying, steal, election, Biden, ...",25
1764,Hillary just didn't cheat enough last time. Th...,1,0.0,"[Hillary, n't, cheat, enough, last, time, ., T...",33


In [8]:
tweet_freq_dict = nltk.FreqDist(overall_tokens)
print(type(tweet_freq_dict))
tweet_freq_dict.tabulate(25)

<class 'nltk.probability.FreqDist'>
           .        Biden            ,            !           's website_name            I            ?          n't        Trump            #          Joe            :          The    President        would          ...            ;     election       people         like            &          100          amp           '' 
        2042         1660         1083          658          588          531          426          410          349          319          301          292          176          160          143          137          137          128          127          115          108          107          101          100           99 


In [9]:
vocab_to_int_encoding = {pair[1]:pair[0]+1 for pair in enumerate(tweet_freq_dict)}
print(len(vocab_to_int_encoding))
print(type(vocab_to_int_encoding))
vocab_to_int_encoding

7171
<class 'dict'>


{'.': 1,
 'Biden': 2,
 ',': 3,
 '!': 4,
 "'s": 5,
 'website_name': 6,
 'I': 7,
 '?': 8,
 "n't": 9,
 'Trump': 10,
 '#': 11,
 'Joe': 12,
 ':': 13,
 'The': 14,
 'President': 15,
 'would': 16,
 '...': 17,
 ';': 18,
 'election': 19,
 'people': 20,
 'like': 21,
 '&': 22,
 '100': 23,
 'amp': 24,
 "''": 25,
 '``': 26,
 'He': 27,
 'days': 28,
 'votes': 29,
 'It': 30,
 'know': 31,
 'Americans': 32,
 'You': 33,
 'president': 34,
 'BIDEN': 35,
 'get': 36,
 'going': 37,
 'first': 38,
 '“': 39,
 '”': 40,
 "'": 41,
 'wear': 42,
 'masks': 43,
 'says': 44,
 'We': 45,
 'biden': 46,
 'administration': 47,
 'office': 48,
 'one': 49,
 'fraud': 50,
 '-': 51,
 'ask': 52,
 'And': 53,
 'If': 54,
 'think': 55,
 'via': 56,
 'Fauci': 57,
 'vote': 58,
 'That': 59,
 'could': 60,
 'said': 61,
 ')': 62,
 'say': 63,
 'win': 64,
 'still': 65,
 'want': 66,
 "'m": 67,
 'President-elect': 68,
 "'re": 69,
 'got': 70,
 '(': 71,
 'voted': 72,
 'They': 73,
 'US': 74,
 'team': 75,
 'see': 76,
 'right': 77,
 'What': 78,
 'CNN':

In [10]:
truncate_to_unknown_corpus_length_limit = 5000
assert truncate_to_unknown_corpus_length_limit < len(vocab_to_int_encoding), "unknown truncation limit must be smaller than corpus length"
vocab_to_int_encoding["<unk>"] = truncate_to_unknown_corpus_length_limit + 1

def tokens_to_int(x: pd.Series):
    tokens = x[token_col_name]
    try:
        tokens_in_int = [vocab_to_int_encoding[token] for token in tokens]
        for idx in range(len(tokens_in_int)):
            if tokens_in_int[idx] >= truncate_to_unknown_corpus_length_limit:
                tokens_in_int[idx] = truncate_to_unknown_corpus_length_limit+1
    except KeyError:
        print(x)
        return -1
    return tokens_in_int

tweets_csv[tokenized_col_name] = tweets_csv.apply(tokens_to_int, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_csv[tokenized_col_name] = tweets_csv.apply(tokens_to_int, axis=1)


In [11]:
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(overall_tokens)
bigram_finder.ngram_fd.tabulate(10)

     ('Joe', 'Biden')            ('!', '!')       ('Biden', "'s")        ('.', 'Biden')        ('Biden', '.') ('.', 'website_name')            ('.', 'I')          ('&', 'amp')          ('amp', ';')       ('100', 'days') 
                  259                   184                   146                   126                   112                   107                   106                   100                   100                    84 


In [12]:
trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(overall_tokens)
trigram_finder.ngram_fd.tabulate(10)

                  ('&', 'amp', ';')                     ('!', '!', '!')      ('Americans', 'wear', 'masks')        ('ask', 'Americans', 'wear')            ('first', '100', 'days')           ('masks', 'first', '100')          ('wear', 'masks', 'first') ('President-elect', 'Joe', 'Biden')                   ('.', 'It', "'s")              ('Joe', 'Biden', "'s") 
                                100                                  83                                  46                                  44                                  39                                  34                                  33                                  32                                  29                                  26 


In [13]:
quadgram_finder = nltk.collocations.QuadgramCollocationFinder.from_words(overall_tokens)
quadgram_finder.ngram_fd.tabulate(10)

   ('ask', 'Americans', 'wear', 'masks')        ('masks', 'first', '100', 'days')                     ('!', '!', '!', '!')  ('Americans', 'wear', 'masks', 'first')        ('wear', 'masks', 'first', '100')    ('Biden', 'says', 'ask', 'Americans')     ('says', 'ask', 'Americans', 'wear')    ('Biden', 'ask', 'Americans', 'wear') ('Fauci', 'chief', 'medical', 'adviser')      ('Exclusive', ':', 'Biden', 'says') 
                                      41                                       34                                       34                                       33                                       33                                       20                                       19                                       18                                       18                                       15 


In [14]:
sia = SentimentIntensityAnalyzer()


def tweet_find_nltk_polarity(x: pd.Series):
    senti = sia.polarity_scores(x[text_col_name])
    return senti['compound']


tweets_csv[ref_sentiment_name] = tweets_csv.apply(tweet_find_nltk_polarity, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_csv[ref_sentiment_name] = tweets_csv.apply(tweet_find_nltk_polarity, axis=1)


In [15]:
# pad features
def pad_tokens(x: pd.Series):
    tokens = x[tokenized_col_name]
    padding = [0] * (50 - len(tokens))
    return padding + tokens

tweets_csv.loc[:, tokenized_col_name] = tweets_csv.apply(pad_tokens, axis=1)
tweets_csv

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,Text,subjectivity,polarity,Raw tokens,Token length,Tokenized,NLTK ref sentiment
0,That's the guy who is funding those fake stori...,1,1.0,"[That, 's, guy, funding, fake, stories, Hunter...",9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.4767
1,Biden apparently just told JTaps that he's goi...,1,0.0,"[Biden, apparently, told, JTaps, 's, going, as...",22,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.2732
2,They've been given 40 chances. And have blown ...,1,0.0,"[They, 've, given, 40, chances, ., And, blown,...",40,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73, 111, 1743, ...",0.8442
3,They could not raise the money to beat Biden b...,1,1.0,"[They, could, raise, money, beat, Biden, elect...",23,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.5367
4,Can't Biden just fire the board members on the...,1,0.0,"[Ca, n't, Biden, fire, board, members, postal,...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.2584
...,...,...,...,...,...,...,...
1761,"That's nice, but I hope Biden doesn't think #M...",0,0.5,"[That, 's, nice, ,, I, hope, Biden, n't, think...",12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.6956
1762,OMG. You are a sensitive soul. For the record ...,1,1.0,"[OMG, ., You, sensitive, soul, ., For, record,...",38,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1259, 1, ...",0.9001
1763,"No, IQ45 is trying to steal the election from ...",1,1.0,"[No, ,, IQ45, trying, steal, election, Biden, ...",25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.8074
1764,Hillary just didn't cheat enough last time. Th...,1,0.0,"[Hillary, n't, cheat, enough, last, time, ., T...",33,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.5659


In [16]:
# hyperparams for learning

batch_size = 16

lr=0.000125

train_on_gpu = False

In [17]:
tokens_full_series = tweets_csv[tokenized_col_name]
polarity_full_series = tweets_csv[polarity_label_name]
# tokens_full_series.to_list()

In [18]:

tokens_full_nparr = np.asarray(tokens_full_series.to_list(), dtype=int) 
np.random.shuffle(tokens_full_nparr)

train_valid_split_point = int(len(tokens_full_nparr)*0.8)
valid_test_split_point = int(len(tokens_full_nparr)*0.9)

train_tokens = tokens_full_nparr[ : train_valid_split_point]
valid_tokens = tokens_full_nparr[train_valid_split_point : valid_test_split_point]
test_tokens = tokens_full_nparr[valid_test_split_point : ]

polarity_full_nparr = np.asarray(polarity_full_series.to_list(), dtype=int) 
np.random.shuffle(polarity_full_nparr)

train_polarity = polarity_full_nparr[ : train_valid_split_point]
valid_polarity = polarity_full_nparr[train_valid_split_point : valid_test_split_point]
test_polarity = polarity_full_nparr[valid_test_split_point : ]



full_data = TensorDataset(torch.from_numpy(tokens_full_nparr), torch.from_numpy(polarity_full_nparr))
train_data = TensorDataset(torch.from_numpy(train_tokens), torch.from_numpy(train_polarity))
valid_data = TensorDataset(torch.from_numpy(valid_tokens), torch.from_numpy(valid_tokens))
test_data = TensorDataset(torch.from_numpy(test_tokens), torch.from_numpy(test_polarity))

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [19]:
class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        if n_layers == 1:
            drop_prob = 0

        self.lstm = nn.LSTM(input_size=embedding_dim, 
                            hidden_size=hidden_dim, 
                            num_layers=n_layers, 
                            dropout=drop_prob, 
                            batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        
    """
    Expected hidden[0] size (2, 14, 256), got [2, 16, 256]
    lstm_stack_size  batch_size  hidden_dim_size
    """
    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        # print(f">>> in forward, size of x: {x.size()}, size of hidden: {len(hidden)}, "
        #       f"size of hidden[0]: {hidden[0].size()}")
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        """ Initializes hidden state """
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [20]:

vocab_size = truncate_to_unknown_corpus_length_limit+2 # +1 for the 0 padding, +1 for unknown words
output_size = 1
embedding_dim = 100
hidden_dim = 128
n_layers = 1
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
net

SentimentLSTM(
  (embedding): Embedding(5002, 100)
  (lstm): LSTM(100, 128, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [21]:
# loss and optimization functions
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 6

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if train_on_gpu:
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1
        if len(inputs) < batch_size:
            print(f"last iter of training samples are not consistent with batch size, skip though")
        else:
        

            if train_on_gpu:
                inputs, labels = inputs.cuda(), labels.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            inputs = inputs.type(torch.LongTensor)
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs_in, labels_in in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if train_on_gpu:
                    inputs_in, labels_in = inputs_in.cuda(), labels_in.cuda()

                inputs_in = inputs_in.type(torch.LongTensor)
                output, val_h = net(inputs_in, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

last iter of training samples are not consistent with batch size, skip though
Epoch: 2/6... Step: 100... Loss: 0.622651... Val Loss: 0.675567
last iter of training samples are not consistent with batch size, skip though
Epoch: 3/6... Step: 200... Loss: 0.650788... Val Loss: 0.629772
last iter of training samples are not consistent with batch size, skip though
Epoch: 4/6... Step: 300... Loss: 0.501737... Val Loss: 0.522649
last iter of training samples are not consistent with batch size, skip though
Epoch: 5/6... Step: 400... Loss: 0.526326... Val Loss: 0.569132
last iter of training samples are not consistent with batch size, skip though
Epoch: 6/6... Step: 500... Loss: 0.590776... Val Loss: 0.616621
last iter of training samples are not consistent with batch size, skip though


In [22]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if train_on_gpu:
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    inputs = inputs.type(torch.LongTensor)
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.608
Test accuracy: 0.705


In [23]:
print(batch_size)
print(lr)
print(epochs)

16
0.000125
6
