# **RNN Classifier for Tweet Data**

This notebook trains the model for an RNN classifier of our data. Word vectorization and recurrent neural network setup followed from youtube.


First install old version of torchtext because new ones have issues with legacy tools.

In [None]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6.0)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.16.0
    Uninstalling torchtext-0.16.0:
      Successfully uninstalled torchtext-0.16.0
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


Import necessary libraries

In [None]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd

Settings for different parameters.

In [None]:
# select regularization type and strength
L1_REG = True
l1_lambda = 0.01

L2_REG = False
l2_lambda = 0.01

# select data balancing choice
choice = "deletion"


RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 10000
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2
DATA_PATH = "/content/tweets.csv"

## Setup and Format Data


Balance data by either duplication or deletion (to prevent overfitting)

In [None]:

df = pd.read_csv("/content/tweets.csv")

# Count the occurrences of each label
label_counts = df['label'].value_counts()

# Count minority labels
minority_label = label_counts.idxmin()
minority_count = label_counts.min()

# Find indices of the majority class to delete excess samples
majority_indices = df[df['label'] != minority_label].index
excess_majority_indices = majority_indices[minority_count:]

if choice == "deletion":
  # Delete excess samples from the majority class to balance the dataset
  balanced_df = df.drop(excess_majority_indices)
  balanced_df.to_csv("/content/tweetsDEL.csv", index=False)
  DATA_PATH = "/content/tweetsDEL.csv"
if choice == "duplication":
  duplicate_indices = pd.Series(minority_indices).sample(n=abs(label_counts.diff().values[0]), replace=True).values
  duplicated_samples = df.loc[duplicate_indices]
  balanced_df = df.append(duplicated_samples, ignore_index=True)
  balanced_df.to_csv("/content/tweetsDUP.csv", index=False)
  DATA_PATH = "/content/tweetsDUP.csv"

# Verify the new class distribution
print(balanced_df['label'].value_counts())

0    6182
1    6182
Name: label, dtype: int64


Define text and label formatters

In [None]:
TEXT = torchtext.data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
LABEL = torchtext.data.LabelField(dtype=torch.long)

Process/format data.

In [None]:
fields = [("text", TEXT), ("label", LABEL)]
dataset = torchtext.data.TabularDataset(
    path=DATA_PATH, format='csv',
    skip_header=True, fields=fields)

## Split Dataset into Test/Train/Validation sets

Test/Train split


In [None]:
train_data, test_data = dataset.split(
    split_ratio=[0.8, 0.2],
    random_state=random.seed(RANDOM_SEED))

Split train into training and validation sets.

In [None]:
train_data, valid_data = train_data.split(
    split_ratio=[0.8, 0.20],
    random_state=random.seed(RANDOM_SEED))

Show sizes of sets and example text data.

In [None]:
print(f'Test Size: {len(train_data)}')
print(f'Train Size: {len(train_data)}')
print(f'Validation Size: {len(valid_data)}')
print(vars(train_data.examples[2]))

Test Size: 7913
Train Size: 7913
Validation Size: 1978
{'text': ['Praise', 'GOD', 'Praise', 'GOD', 'roe', 'vs', 'Wade', 'has', 'been', 'put', 'to', 'death', '.', 'This', 'is', 'a', 'great', 'great', 'day', '!', 'This', 'is', 'a', 'win', 'for', 'human', 'life', '✝', '️'], 'label': '1'}


## Build Vocabulary
Valid words are the top frequent VOCABULARY_SIZE words.

In [None]:
TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data, max_size = 2)
print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Label size: {len(LABEL.vocab)}')
print(TEXT.vocab.freqs.most_common(20)) # most common words

Vocabulary size: 10002
Label size: 2
[('.', 13858), ('the', 8770), ('to', 7973), (',', 7531), ('abortion', 5618), ('a', 4698), ('#', 4483), ('is', 4439), ('of', 4419), ('and', 4390), ('in', 3250), ('I', 2881), ('that', 2816), ('\n\n', 2778), ('for', 2776), ('\n', 2685), ('!', 2616), ('you', 2063), ('it', 1994), ('are', 1984)]


In [None]:
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10])
print(TEXT.vocab.stoi['the'])
print(LABEL.vocab.stoi)
LABEL.vocab.freqs

[('.', 13858), ('the', 8770), ('to', 7973), (',', 7531), ('abortion', 5618), ('a', 4698), ('#', 4483), ('is', 4439), ('of', 4419), ('and', 4390), ('in', 3250), ('I', 2881), ('that', 2816), ('\n\n', 2778), ('for', 2776), ('\n', 2685), ('!', 2616), ('you', 2063), ('it', 1994), ('are', 1984)]
['<unk>', '<pad>', '.', 'the', 'to', ',', 'abortion', 'a', '#', 'is']
3
defaultdict(None, {'0': 0, '1': 1})


Counter({'0': 4002, '1': 3911})

## Data Loader


In [None]:
train_loader, valid_loader, test_loader = \
    torchtext.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=BATCH_SIZE,
         sort_within_batch=False,
         sort_key=lambda x: len(x.text),
         device=DEVICE
    )

In [None]:
print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break

print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break

print('\nTest:')
for batch in test_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break

Train
Text matrix size: torch.Size([69, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([13, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([12, 128])
Target vector size: torch.Size([128])


## Build Neural Network

In [None]:
class RNN(torch.nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # self.rnn = torch.nn.RNN(embedding_dim,
        #                        hidden_dim,
        #                        nonlinearity='relu')
        self.rnn = torch.nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)

        self.fc = torch.nn.Linear(hidden_dim, output_dim)


    def forward(self, text):
        # text dim: [sentence length, batch size]

        embedded = self.embedding(text)
        # embedded dim: [sentence length, batch size, embedding dim]

        output, (hidden, cell) = self.rnn(embedded)
        # output dim: [sentence length, batch size, hidden dim]
        # hidden dim: [1, batch size, hidden dim]

        hidden.squeeze_(0)
        # hidden dim: [batch size, hidden dim]

        output = self.fc(hidden)
        return output

In [None]:
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=len(TEXT.vocab),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES # could use 1 for binary classification
)

model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005) # use weight decay for l2 regularization

## Train Neural Network

In [None]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for i, (features, targets) in enumerate(data_loader):

            features = features.to(device)
            targets = targets.float().to(device)

            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100

In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):

        text = batch_data.text.to(DEVICE)
        labels = batch_data.label.to(DEVICE)

        ### FORWARD AND BACK PROP

        logits = model(text)

        # regularization (optional)
        loss = F.cross_entropy(logits, labels) # cross entropy loss tends to be better for classification problems
        #if L1_REG:


        optimizer.zero_grad()

        loss.backward()

        # UPDATE MODEL PARAMETERS
        optimizer.step()

        # LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/015 | Batch 000/062 | Loss: 0.6875
Epoch: 001/015 | Batch 050/062 | Loss: 0.6935
training accuracy: 50.63%
valid accuracy: 48.89%
Time elapsed: 0.04 min
Epoch: 002/015 | Batch 000/062 | Loss: 0.6925
Epoch: 002/015 | Batch 050/062 | Loss: 0.6866
training accuracy: 50.28%
valid accuracy: 48.94%
Time elapsed: 0.06 min
Epoch: 003/015 | Batch 000/062 | Loss: 0.6957
Epoch: 003/015 | Batch 050/062 | Loss: 0.6907
training accuracy: 50.64%
valid accuracy: 49.14%
Time elapsed: 0.08 min
Epoch: 004/015 | Batch 000/062 | Loss: 0.6937
Epoch: 004/015 | Batch 050/062 | Loss: 0.6946
training accuracy: 50.64%
valid accuracy: 49.29%
Time elapsed: 0.09 min
Epoch: 005/015 | Batch 000/062 | Loss: 0.6880
Epoch: 005/015 | Batch 050/062 | Loss: 0.6900
training accuracy: 49.87%
valid accuracy: 49.80%
Time elapsed: 0.11 min
Epoch: 006/015 | Batch 000/062 | Loss: 0.6893
Epoch: 006/015 | Batch 050/062 | Loss: 0.6926
training accuracy: 50.78%
valid accuracy: 49.24%
Time elapsed: 0.13 min
Epoch: 007/015 |

Test on basic in

In [None]:
import spacy


nlp = spacy.blank("en")

def predict_side(model, sentence):

    model.eval()
    tokenized = [token.text for token in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(DEVICE)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.nn.functional.softmax(model(tensor), dim=1)
    return prediction[0][0].item()

print('Probability Pro-Life:')
print(predict_side(model, "women's rights are so important"))
print(predict_side(model, "God loves babies thank god "))
print(predict_side(model, "We have cats"))
print(predict_side(model, "Potatoes"))
print(predict_side(model, "According to all known laws of aviation, a bee should not be able to fly."))




Probability Pro-Life:
0.6158949732780457
0.0028248722665011883
0.8079994320869446
0.39335471391677856
0.9499889016151428


In [None]:
torch.save(model.state_dict(), 'RNN_deletion.pth')
