# **RNN Classifier for Tweet Data**

This notebook trains the model for an RNN classifier of our data. Word vectorization and recurrent neural network setup followed from youtube.


First install old version of torchtext because new ones have issues with legacy tools.

In [1]:
# !conda install -c pytorch torcht ext==0.6.0
# !conda install pytorch::pytorch torchvision torchaudio -c pytorch

Import necessary libraries

In [2]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd

from fastprogress.fastprogress import master_bar, progress_bar

Settings for different parameters.

In [3]:
# select regularization type and strength
L1_REG = True
l1_lambda = 0.01

L2_REG = False
l2_lambda = 0.01

# select data balancing choice
choice = "deletion"


RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 10000
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2
DATA_PATH = "./tweets.csv"

## Setup and Format Data


Balance data by either duplication or deletion (to prevent overfitting)

In [4]:

df = pd.read_csv("./tweets.csv")

# Count the occurrences of each label
label_counts = df['label'].value_counts()

# Count minority labels
minority_label = label_counts.idxmin()
minority_count = label_counts.min()

# Find indices of the majority class to delete excess samples
majority_indices = df[df['label'] != minority_label].index
excess_majority_indices = majority_indices[minority_count:]

if choice == "deletion":
  # Delete excess samples from the majority class to balance the dataset
  balanced_df = df.drop(excess_majority_indices)
  balanced_df.to_csv("./tweetsDEL.csv", index=False)
  DATA_PATH = "./tweetsDEL.csv"
if choice == "duplication":
  duplicate_indices = pd.Series(minority_indices).sample(n=abs(label_counts.diff().values[0]), replace=True).values
  duplicated_samples = df.loc[duplicate_indices]
  balanced_df = df.append(duplicated_samples, ignore_index=True)
  balanced_df.to_csv("./tweetsDUP.csv", index=False)
  DATA_PATH = "./tweetsDUP.csv"

# Verify the new class distribution
print(balanced_df['label'].value_counts())

label
0    8816
1    8816
Name: count, dtype: int64


Define text and label formatters

In [5]:
import en_core_web_sm
nlp = en_core_web_sm.load()

TEXT = torchtext.data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
LABEL = torchtext.data.LabelField(dtype=torch.long)

Process/format data.

In [6]:
fields = [("text", TEXT), ("label", LABEL)]
dataset = torchtext.data.TabularDataset(
    path=DATA_PATH, format='csv',
    skip_header=True, fields=fields)

## Split Dataset into Test/Train/Validation sets

Test/Train split


In [7]:
train_data, test_data = dataset.split(
    split_ratio=[0.8, 0.2],
    random_state=random.seed(RANDOM_SEED))

Split train into training and validation sets.

In [8]:
train_data, valid_data = train_data.split(
    split_ratio=[0.8, 0.20],
    random_state=random.seed(RANDOM_SEED))

Show sizes of sets and example text data.

In [9]:
print(f'Test Size: {len(train_data)}')
print(f'Train Size: {len(train_data)}')
print(f'Validation Size: {len(valid_data)}')
print(vars(train_data.examples[2]))

Test Size: 11285
Train Size: 11285
Validation Size: 2821
{'text': ['#', '63millionbabies', ' ', 'who', 'have', 'been', 'killed', 'by', 'ROE', '\n', '#', 'RoeVWade', '\n', '#', 'SupremeCourt'], 'label': '1'}


## Build Vocabulary
Valid words are the top frequent VOCABULARY_SIZE words.

In [10]:
TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data, max_size = 2)
print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Label size: {len(LABEL.vocab)}')
print(TEXT.vocab.freqs.most_common(20)) # most common words

Vocabulary size: 10002
Label size: 2
[('.', 19380), ('the', 12468), ('to', 11163), (',', 10679), ('abortion', 7932), ('a', 6473), ('of', 6409), ('is', 6252), ('#', 6205), ('and', 6201), ('in', 4539), ('I', 4154), ('\n\n', 3974), ('that', 3902), ('for', 3884), ('!', 3678), ('\n', 3630), ('you', 2917), ('are', 2868), ('it', 2793)]


In [11]:
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10])
print(TEXT.vocab.stoi['the'])
print(LABEL.vocab.stoi)
LABEL.vocab.freqs

[('.', 19380), ('the', 12468), ('to', 11163), (',', 10679), ('abortion', 7932), ('a', 6473), ('of', 6409), ('is', 6252), ('#', 6205), ('and', 6201), ('in', 4539), ('I', 4154), ('\n\n', 3974), ('that', 3902), ('for', 3884), ('!', 3678), ('\n', 3630), ('you', 2917), ('are', 2868), ('it', 2793)]
['<unk>', '<pad>', '.', 'the', 'to', ',', 'abortion', 'a', 'of', 'is']
3
defaultdict(None, {'1': 0, '0': 1})


Counter({'1': 5694, '0': 5591})

## Data Loader


In [12]:
train_loader, valid_loader, test_loader = \
    torchtext.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=BATCH_SIZE,
         sort_within_batch=False,
         sort_key=lambda x: len(x.text),
         device=DEVICE
    )

In [13]:
print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break

print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break

print('\nTest:')
for batch in test_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break

Train
Text matrix size: torch.Size([67, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([11, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([10, 128])
Target vector size: torch.Size([128])


## Build Neural Network

In [14]:
class RNN(torch.nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # self.rnn = torch.nn.RNN(embedding_dim,
        #                        hidden_dim,
        #                        nonlinearity='relu')
        self.rnn = torch.nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)

        self.fc = torch.nn.Linear(hidden_dim, output_dim)


    def forward(self, text):
        # text dim: [sentence length, batch size]

        embedded = self.embedding(text)
        # embedded dim: [sentence length, batch size, embedding dim]

        output, (hidden, cell) = self.rnn(embedded)
        # output dim: [sentence length, batch size, hidden dim]
        # hidden dim: [1, batch size, hidden dim]

        hidden.squeeze_(0)
        # hidden dim: [batch size, hidden dim]

        output = self.fc(hidden)
        return output

In [15]:
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=len(TEXT.vocab),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES # could use 1 for binary classification
)

model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005) # use weight decay for l2 regularization

## Train Neural Network

In [16]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for i, (features, targets) in enumerate(data_loader):

            features = features.to(device)
            targets = targets.float().to(device)

            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100

In [17]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):

        text = batch_data.text.to(DEVICE)
        labels = batch_data.label.to(DEVICE)

        ### FORWARD AND BACK PROP

        logits = model(text)

        # regularization (optional)
        loss = F.cross_entropy(logits, labels) # cross entropy loss tends to be better for classification problems
        #if L1_REG:


        optimizer.zero_grad()

        loss.backward()

        # UPDATE MODEL PARAMETERS
        optimizer.step()

        # LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/015 | Batch 000/089 | Loss: 0.7045
Epoch: 001/015 | Batch 050/089 | Loss: 0.6932
training accuracy: 50.63%
valid accuracy: 50.09%
Time elapsed: 0.24 min
Epoch: 002/015 | Batch 000/089 | Loss: 0.6926
Epoch: 002/015 | Batch 050/089 | Loss: 0.6919
training accuracy: 50.73%
valid accuracy: 50.62%
Time elapsed: 0.49 min
Epoch: 003/015 | Batch 000/089 | Loss: 0.6921
Epoch: 003/015 | Batch 050/089 | Loss: 0.6944
training accuracy: 50.70%
valid accuracy: 49.88%
Time elapsed: 0.73 min
Epoch: 004/015 | Batch 000/089 | Loss: 0.6970
Epoch: 004/015 | Batch 050/089 | Loss: 0.6948
training accuracy: 49.75%
valid accuracy: 48.88%
Time elapsed: 0.98 min
Epoch: 005/015 | Batch 000/089 | Loss: 0.6898
Epoch: 005/015 | Batch 050/089 | Loss: 0.6916
training accuracy: 50.96%
valid accuracy: 51.26%
Time elapsed: 1.22 min
Epoch: 006/015 | Batch 000/089 | Loss: 0.6893
Epoch: 006/015 | Batch 050/089 | Loss: 0.6649
training accuracy: 62.02%
valid accuracy: 56.68%
Time elapsed: 1.47 min
Epoch: 007/015 |

Test on basic in

In [18]:
import spacy


nlp = spacy.blank("en")

def predict_side(model, sentence):

    model.eval()
    tokenized = [token.text for token in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(DEVICE)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.nn.functional.softmax(model(tensor), dim=1)
    return prediction[0][0].item()

print('Probability Pro-Life:')
print(predict_side(model, "women's rights are so important"))
print(predict_side(model, "God loves babies thank god "))
print(predict_side(model, "We have cats"))
print(predict_side(model, "Potatoes"))
print(predict_side(model, "According to all known laws of aviation, a bee should not be able to fly."))




Probability Pro-Life:
0.4370650053024292
0.9980461597442627
0.06932768225669861
0.5108727812767029
0.38892319798469543


In [62]:
print("{:85} {}".format("Prompt", "Output"))
for cat_count in range(10):
    prompt = f"God and the holy spirit love babies " + ("cats " * cat_count)
    result = predict_side(model, prompt)
    print("{:85} {}".format(prompt, result))

Prompt                                                                                Output
God and the holy spirit love babies                                                   0.9992449283599854
God and the holy spirit love babies cats                                              0.9941451549530029
God and the holy spirit love babies cats cats                                         0.9525774121284485
God and the holy spirit love babies cats cats cats                                    0.8094621300697327
God and the holy spirit love babies cats cats cats cats                               0.3293845057487488
God and the holy spirit love babies cats cats cats cats cats                          0.013379561714828014
God and the holy spirit love babies cats cats cats cats cats cats                     0.0036522522568702698
God and the holy spirit love babies cats cats cats cats cats cats cats                0.0005706199444830418
God and the holy spirit love babies cats cats cats cats cat

In [19]:
torch.save(model.state_dict(), 'RNN_deletion.pth')
