In [1]:
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [2]:
# Torch loads
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Non-Torch Loads
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta

#Cleaning Loads
import regex as re
import emoji
import nltk

In [3]:
train = pd.read_csv("nlp-getting-started/train.csv")
train = train.drop(labels = ["keyword", "location", "id"], axis = 1)
print(train.head())
(train.shape)

                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1
2  All residents asked to 'shelter in place' are ...       1
3  13,000 people receive #wildfires evacuation or...       1
4  Just got sent this photo from Ruby #Alaska as ...       1


(7613, 2)

# EDA
The dataset has 57% non-disaster tweets, and 43% Disaster tweets.  There are 31924 unique words.  This will drive my tuning the vectorization of the model.

In [4]:
train['target'].describe()

count    7613.00000
mean        0.42966
std         0.49506
min         0.00000
25%         0.00000
50%         0.00000
75%         1.00000
max         1.00000
Name: target, dtype: float64

In [5]:
unique_words = set()
for text in train["text"]:
    unique_words.update(text.split())
print(len(unique_words))

31924


# Cleaning
Standard tweet cleaning.  Cleaning found at:
https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python

# Tokenizatin
I tolkenized the tweets in preparation to convert to tensors for embedding.


In [6]:
for tweet in train['text']:
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = emoji.replace_emoji(tweet, '') #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    #Tolkenize the Text
    
    tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    tweet = tknzr.tokenize(tweet)

In [7]:
# Split the data into train and test sets
train_dat = train.sample(frac = .8)
test_dat = train.drop(train_dat.index)

# Vectorizing
I used pytorchs internal vectorizer to vectorize the text.  I


In [8]:
# Dataset Importing from https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader

#tokenize the text for tensor load
def tokenize(text):
    # Your tokenizer logic here
    return [vocab[token] for token in text.split()]

class CustomTextDataset(Dataset):
    def __init__(self, data,vocab, transform=None, target_transform=None):
        self.data = data
        self.vocab = vocab
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tweet = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]
        tokenized_tweet = [self.vocab.get(token, 0) for token in tweet.split()]  # Tokenize and handle unknown tokens
        tweet_tensor = torch.tensor(tokenized_tweet, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.float)
        
        if self.transform:
            tweet_tensor = self.transform(tweet_tensor)
        if self.target_transform:
            label_tensor = self.target_transform(label_tensor)

        return tweet_tensor, label_tensor

def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        text_list.append(_text.clone().detach())
    return pad_sequence(text_list, batch_first=True), torch.tensor(label_list, dtype=torch.float)

vocab = {'Emergency': 1, 'Non-Emergency':2}

# Creating training and validation datasets
training_set = CustomTextDataset(train_dat, vocab)
validation_set = CustomTextDataset(test_dat, vocab)

# Creating data loaders
training_loader = torch.utils.data.DataLoader(training_set, batch_size=32, shuffle=True, collate_fn=collate_batch)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=32, shuffle=False, collate_fn=collate_batch)


# Model approach
I'm choosing to use a RNN were I only select the last value as the output, so many in and 1 out.  

In [9]:
# Initial Setup

class Tweets(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, nonlinearity):
        super(Tweets, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn_layer = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers,
                                nonlinearity=nonlinearity, bias=True, bidirectional=False)
        self.output_layer = nn.Linear(embedding_dim, 1)

    def forward(self, input_text):
        # Assuming input_text is already a LongTensor that has been prepared outside the model
        embedded = self.embedding(input_text)
        rnn_output, _ = self.rnn_layer(embedded)
        output = rnn_output[:, -1, :]  # Extract the last timestep output
        final_output = self.output_layer(output)
        return final_output.squeeze()  # Squeeze to remove the extra dimension


In [10]:


def train_one_epoch(epoch_index, model,training_loader, loss_fn, optimizer, tb_writer):
    running_loss = 0.
    last_loss = 0.
    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs_cuda = inputs.to('cuda')
        labels_cuda = labels.to('cuda')

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs_cuda)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels_cuda)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 10 == 9:
            last_loss = running_loss / 1000 # loss per batch
            #print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

# test execution
#model = Tweets(vocab_size=30000, embedding_dim =1024).to('cuda')
#loss_fn = nn.BCEWithLogitsLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001)
#tb_writer = SummaryWriter('logs')
#train_one_epoch(1,model, training_loader, loss_fn, optimizer, tb_writer)
#tb_writer.close()

output_data = pd.DataFrame()

Ok, .68 validation loss.  Thats not good.  I'm going to run a optomizer to try and find the optimal variables.


In [11]:

# Initializing in a separate cell so we can easily add more epochs to the same run
def run_model(vocab_size, embedding_dim,
              hidden_size, num_layers, nonlinearity, output_data):

    
    model = Tweets(vocab_size=vocab_size, embedding_dim = embedding_dim,
                   hidden_size = hidden_size, 
                   num_layers= num_layers, nonlinearity = nonlinearity).to('cuda')
    
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
    
    epoch_number = 0
    
    EPOCHS = 10
    
    best_vloss = 1_000_000.
    
    
    
    for epoch in range(EPOCHS):
        print('EPOCH {}:'.format(epoch_number + 1))
    
        # Make sure gradient tracking is on, and do a pass over the data
        model.train(True)
        avg_loss = train_one_epoch(epoch_number,model, training_loader, loss_fn, optimizer, writer)
    
    
        running_vloss = 0.0
        # Set the model to evaluation mode, disabling dropout and using population
        # statistics for batch normalization.
        model.eval()
    
        # Disable gradient computation and reduce memory consumption.
        with torch.no_grad():
            for i, vdata in enumerate(validation_loader):
                vinputs, vlabels = vdata
                vinputs_cuda = vinputs.to('cuda')
                vlabels_cuda = vlabels.to('cuda')
                voutputs = model(vinputs_cuda)
                
                voutputs = model(vinputs_cuda)
                vloss = loss_fn(voutputs, vlabels_cuda)
                running_vloss += vloss
    
        avg_vloss = running_vloss / (i + 1)
        print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
        
        # Log the running loss averaged per batch
        # for both training and validation
        writer.add_scalars('Training vs. Validation Loss',
                        { 'Training' : avg_loss, 'Validation' : avg_vloss },
                        epoch_number + 1)
        writer.flush()
        
        # Output results for charting
        new_row = {
            "vocab_size": vocab_size,
            "embedding_dim": embedding_dim,
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "loss_fn": str(loss_fn),
            "optimizer": str(optimizer),
            "run": run,
            "epoch": epoch,
            "train_loss": str(avg_loss),
            "val_loss": str(avg_vloss),
            "nonlinearity": nonlinearity
        }
        output_data = pd.concat([output_data, pd.Series(new_row)], ignore_index=True)
        # Track best performance, and save the model's state
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = 'model_{}_{}'.format(timestamp, epoch_number)
            torch.save(model.state_dict(), model_path)
    
        epoch_number += 1
    print(f'Best Validation Loss {best_vloss}')
    torch.cuda.empty_cache()
    return model

In [12]:
run = 1
embedding_dim = 2048
hidden_size = 2048
num_layers = 1
vocab_size = 30000
nonlinearity = 'tanh'
model = run_model(vocab_size=vocab_size, 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data)

EPOCH 1:
LOSS train 0.0071466386914253235 valid 0.682991623878479
EPOCH 2:
LOSS train 0.006806004345417022 valid 0.6834301352500916
EPOCH 3:
LOSS train 0.006853686988353729 valid 0.6841446757316589
EPOCH 4:
LOSS train 0.007271568119525909 valid 0.7044641375541687
EPOCH 5:
LOSS train 0.006895983159542084 valid 0.6880950927734375
EPOCH 6:
LOSS train 0.006478989899158478 valid 0.695449709892273
EPOCH 7:
LOSS train 0.007269836008548736 valid 0.692440390586853
EPOCH 8:
LOSS train 0.006934127986431122 valid 0.6838700771331787
EPOCH 9:
LOSS train 0.006902705550193787 valid 0.6969790458679199
EPOCH 10:
LOSS train 0.0068677057027816775 valid 0.683781623840332
Best Validation Loss 0.682991623878479


This is some very aggressive overfitting.  After research, the embedding DIM is usually between 1 and 300.  I'll try shrinking that.  I'll also shrink the hidden size to 128

In [13]:
# Initializing in a separate cell so we can easily add more epochs to the same run

run = run+1

embedding_dim = 128
hidden_size = 128
num_layers = 1
vocab_size = 30000
nonlinearity = 'tanh'

model = run_model(vocab_size=vocab_size, 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data)

EPOCH 1:
LOSS train 0.006731974005699158 valid 0.6850181818008423
EPOCH 2:
LOSS train 0.006910143792629242 valid 0.6863357424736023
EPOCH 3:
LOSS train 0.006921864449977875 valid 0.6838652491569519
EPOCH 4:
LOSS train 0.006709359169006348 valid 0.6842689514160156
EPOCH 5:
LOSS train 0.006797512531280517 valid 0.683239221572876
EPOCH 6:
LOSS train 0.0068481147289276125 valid 0.6831526756286621
EPOCH 7:
LOSS train 0.006721559584140778 valid 0.6865381002426147
EPOCH 8:
LOSS train 0.0067502037286758425 valid 0.6832866668701172
EPOCH 9:
LOSS train 0.006849016070365906 valid 0.6831194758415222
EPOCH 10:
LOSS train 0.006732468247413635 valid 0.6854448318481445
Best Validation Loss 0.6831194758415222


No Joy.  I'm going to switch from tanh to something else.  I'll try ReLU

In [14]:
# Initializing in a separate cell so we can easily add more epochs to the same run

run = run+1

embedding_dim = 128
hidden_size = 128
num_layers = 1
vocab_size = 30000
nonlinearity = 'relu'

model = run_model(vocab_size=vocab_size, 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data)

EPOCH 1:
LOSS train 0.006747014284133911 valid 0.6831364631652832
EPOCH 2:
LOSS train 0.006818036496639252 valid 0.6831680536270142
EPOCH 3:
LOSS train 0.006836028575897217 valid 0.6836048364639282
EPOCH 4:
LOSS train 0.006842085242271424 valid 0.6834427714347839
EPOCH 5:
LOSS train 0.006796495378017425 valid 0.6835532188415527
EPOCH 6:
LOSS train 0.006833365619182586 valid 0.6834239959716797
EPOCH 7:
LOSS train 0.006805239319801331 valid 0.6836296916007996
EPOCH 8:
LOSS train 0.006739068329334259 valid 0.6833828687667847
EPOCH 9:
LOSS train 0.006790339648723602 valid 0.6836575269699097
EPOCH 10:
LOSS train 0.006837983071804047 valid 0.684663712978363
Best Validation Loss 0.6831364631652832


Not alot of learning going due to overfitting.  I was hoping that RelU would fix this.  I'm shrinking the vocab size and trying again.

In [15]:
# Initializing in a separate cell so we can easily add more epochs to the same run

run = run+1

embedding_dim = 128
hidden_size = 128
num_layers = 1
vocab_size = 10000
nonlinearity = 'relu'

model = run_model(vocab_size=vocab_size, 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data)

EPOCH 1:
LOSS train 0.006715954422950744 valid 0.6846389174461365
EPOCH 2:
LOSS train 0.006837572038173675 valid 0.6835161447525024
EPOCH 3:
LOSS train 0.006747735857963562 valid 0.6856094598770142
EPOCH 4:
LOSS train 0.006821855366230011 valid 0.6836403608322144
EPOCH 5:
LOSS train 0.006831944942474365 valid 0.6838232278823853
EPOCH 6:
LOSS train 0.00688395345211029 valid 0.6835362315177917
EPOCH 7:
LOSS train 0.006676080346107483 valid 0.6843067407608032
EPOCH 8:
LOSS train 0.006884010314941407 valid 0.6846345663070679
EPOCH 9:
LOSS train 0.00694333279132843 valid 0.6834124326705933
EPOCH 10:
LOSS train 0.0068670839071273805 valid 0.6848644018173218
Best Validation Loss 0.6834124326705933


A bit counter intuitive, but I'll try to increase the number of hidden layers.

In [16]:
# Initializing in a separate cell so we can easily add more epochs to the same run

run = run+1

embedding_dim = 128
hidden_size = 128
num_layers = 2
vocab_size = 10000
nonlinearity = 'relu'

model = run_model(vocab_size=vocab_size, 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data)

EPOCH 1:
LOSS train 0.006926189303398132 valid 0.6837879419326782
EPOCH 2:
LOSS train 0.006769457638263703 valid 0.6836056113243103
EPOCH 3:
LOSS train 0.0068824166655540465 valid 0.6839834451675415
EPOCH 4:
LOSS train 0.006783278703689576 valid 0.6831353902816772
EPOCH 5:
LOSS train 0.006722920477390289 valid 0.6835536360740662
EPOCH 6:
LOSS train 0.006877526760101318 valid 0.6835469007492065
EPOCH 7:
LOSS train 0.00692775696516037 valid 0.6835615634918213
EPOCH 8:
LOSS train 0.006800408601760864 valid 0.6832003593444824
EPOCH 9:
LOSS train 0.006871785044670105 valid 0.6832415461540222
EPOCH 10:
LOSS train 0.006781141102313995 valid 0.6832296252250671
Best Validation Loss 0.6831353902816772


Going back, one of the advantages of the RNN is that is recursive.  I'll change that setting in the model setup and try again.

In [20]:
class Tweets(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, nonlinearity):
        super(Tweets, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn_layer = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers,
                                nonlinearity=nonlinearity, bias=True, bidirectional=True)
        self.output_layer = nn.Linear(embedding_dim, 1)

    def forward(self, input_text):
        # Assuming input_text is already a LongTensor that has been prepared outside the model
        embedded = self.embedding(input_text)
        rnn_output, _ = self.rnn_layer(embedded)
        output = rnn_output[:, -1, :]  # Extract the last timestep output
        final_output = self.output_layer(output)
        return final_output.squeeze()  # Squeeze to remove the extra dimension

In [21]:
embedding_dim = 128
hidden_size = 128
num_layers = 1
vocab_size = 10000
nonlinearity = 'relu'

model = run_model(vocab_size=vocab_size, 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data)

EPOCH 1:


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x256 and 128x1)