In [63]:
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [64]:
# Torch loads
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils

import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Non-Torch Loads
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta

#Cleaning Loads
import regex as re
import emoji
import nltk

In [65]:
train = pd.read_csv("nlp-getting-started/train.csv", header = 0)
train = train.drop(labels = ["keyword", "location", "id"], axis = 1)
print(train.head())
print(train.shape)
print(train.columns)

                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1
2  All residents asked to 'shelter in place' are ...       1
3  13,000 people receive #wildfires evacuation or...       1
4  Just got sent this photo from Ruby #Alaska as ...       1
(7613, 2)
Index(['text', 'target'], dtype='object')


# EDA
The dataset has 57% non-disaster tweets, and 43% Disaster tweets.  There are 31924 unique words.  This will drive my tuning the vectorization of the model.

In [66]:
train['target'].describe()

count    7613.00000
mean        0.42966
std         0.49506
min         0.00000
25%         0.00000
50%         0.00000
75%         1.00000
max         1.00000
Name: target, dtype: float64

In [67]:
unique_words = set()
for text in train["text"]:
    unique_words.update(text.split())
print(len(unique_words))

31924


# Cleaning
Standard tweet cleaning.  Cleaning found at:
https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python

# Tokenizatin
I tolkenized the tweets in preparation to convert to tensors for embedding.


In [68]:
vocab = nltk.lm.Vocabulary()
output = pd.DataFrame()
def cleaning(line):
    tweet = line['text']
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = emoji.replace_emoji(tweet, '') #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = re.sub(r'[^a-z]', ' ', tweet) # Strip all symbols and replace with +
    #tweet = re.sub(r'[\w{3,}]+', '', tweet) #get rid of all words <= 2 characters
    #Tolkenize the Text
    tweet = tweet.lower()
    vocab.update(tweet.split(" "))
    #tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    #tweet = tknzr.tokenize(tweet)
    return tweet 
train['cleaned_text'] = train.apply(cleaning, axis = 1)
train['text'] = train['cleaned_text']
train = train.drop(labels = ['cleaned_text'], axis = 1)


In [69]:
#print(list(vocab))

In [70]:
# Split the data into train and test sets
train_dat = train.sample(frac = .9)
test_dat = train.drop(train_dat.index)

In [71]:
vocab_dict = {}
for item in list(vocab):
    vocab_dict[item] = vocab[item]
print(len(vocab_dict))

14665


# Vectorizing
I used pytorchs internal vectorizer to vectorize the text.  I


In [72]:
# Dataset Importing from https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader

#tokenize the text for tensor load
def tokenize(text):
    # Your tokenizer logic here
    return [vocab[token] for token in text.split()]

class CustomTextDataset(Dataset):
    def __init__(self, data,vocab, transform=None, target_transform=None):
        self.data = data
        self.vocab = vocab
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tweet = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]
        tokenized_tweet = [self.vocab.get(token, 0) for token in tweet.split()]  # Tokenize and handle unknown tokens
        tweet_tensor  = torch.tensor(tokenized_tweet, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.float)
        
        if self.transform:
            tweet_tensor = self.transform(tweet_tensor)
        if self.target_transform:
            label_tensor = self.target_transform(label_tensor)

        return tweet_tensor, label_tensor

def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        text_list.append(_text.clone().detach())
    return pad_sequence(text_list, batch_first=True), torch.tensor(label_list, dtype=torch.float)

# Creating training and validation datasets
training_set = CustomTextDataset(train_dat, vocab_dict)
validation_set = CustomTextDataset(test_dat, vocab_dict)
#test set = CustomTextDataset(val_dat, vocab)

# Creating data loaders
training_loader = torch.utils.data.DataLoader(training_set, batch_size=32, shuffle=True, collate_fn=collate_batch)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=32, shuffle=False, collate_fn=collate_batch)


# Model approach
I'm choosing to use a RNN were I only select the last value as the output, so many in and 1 out.  

In [73]:
# Initial Setup

class Tweets(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, nonlinearity):
        super(Tweets, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn_layer = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers,
                                nonlinearity=nonlinearity, bias=True, bidirectional=False)
        self.dropout = nn.Dropout(p=.75)
        #self.relu = nn.ReLU()
        self.linear = nn.Linear(hidden_size, 1)
        
        #self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, input_text):
        # Assuming input_text is already a LongTensor that has been prepared outside the model
        embedded = self.embedding(input_text)
        rnn_output, _ = self.rnn_layer(embedded)
        dropout = self.dropout(rnn_output)
        output = dropout[:, :, :]  # Extract the last timestep output
        linear = self.linear(output)
        final_output = linear.mean(dim=1)
        #print(final_output.shape)
        return final_output.squeeze()  # Squeeze to remove the extra dimension


In [74]:


def train_one_epoch(epoch_index, model,training_loader, loss_fn, optimizer, tb_writer):
    running_loss = 0.
    last_loss = 0.
    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs_cuda = inputs.to('cuda')
        labels_cuda = labels.to('cuda')

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs_cuda)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels_cuda)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 10 == 9:
            last_loss = running_loss / 1000 # loss per batch
            #print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

# test execution
#model = Tweets(vocab_size=30000, embedding_dim =1024).to('cuda')
#loss_fn = nn.BCEWithLogitsLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001)
#tb_writer = SummaryWriter('logs')
#train_one_epoch(1,model, training_loader, loss_fn, optimizer, tb_writer)
#tb_writer.close()

output_data = pd.DataFrame()

Ok, .68 validation loss.  Thats not good.  I'm going to run a optomizer to try and find the optimal variables.


In [75]:

# Initializing in a separate cell so we can easily add more epochs to the same run
def run_model(vocab_size, embedding_dim,
              hidden_size, num_layers, nonlinearity, output_data, epochs = 10):

    
    model = Tweets(vocab_size=vocab_size, embedding_dim = embedding_dim,
                   hidden_size = hidden_size, 
                   num_layers= num_layers, nonlinearity = nonlinearity).to('cuda')
    
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
    
    epoch_number = 0
    
    best_vloss = 1_000_000.
    
    
    
    for epoch in range(epochs):
        print('EPOCH {}:'.format(epoch_number + 1))
    
        # Make sure gradient tracking is on, and do a pass over the data
        model.train(True)
        avg_loss = train_one_epoch(epoch_number,model, training_loader, loss_fn, optimizer, writer)
    
    
        running_vloss = 0.0
        # Set the model to evaluation mode, disabling dropout and using population
        # statistics for batch normalization.
        model.eval()
    
        # Disable gradient computation and reduce memory consumption.
        with torch.no_grad():
            for i, vdata in enumerate(validation_loader):
                vinputs, vlabels = vdata
                vinputs_cuda = vinputs.to('cuda')
                vlabels_cuda = vlabels.to('cuda')
                voutputs = model(vinputs_cuda)
                
                voutputs = model(vinputs_cuda)
                vloss = loss_fn(voutputs, vlabels_cuda)
                running_vloss += vloss
    
        avg_vloss = running_vloss / (i + 1)
        print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
        
        # Log the running loss averaged per batch
        # for both training and validation
        writer.add_scalars('Training vs. Validation Loss',
                        { 'Training' : avg_loss, 'Validation' : avg_vloss },
                        epoch_number + 1)
        writer.flush()
        
        # Output results for charting
        new_row = {
            "vocab_size": vocab_size,
            "embedding_dim": embedding_dim,
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "loss_fn": str(loss_fn),
            "optimizer": str(optimizer),
            "run": run,
            "epoch": epoch,
            "train_loss": str(avg_loss),
            "val_loss": str(avg_vloss),
            "nonlinearity": nonlinearity
        }
        output_data = pd.concat([output_data, pd.Series(new_row)], ignore_index=True)
        # Track best performance, and save the model's state
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = 'model_{}_{}'.format(timestamp, epoch_number)
            #torch.save(model.state_dict(), model_path)
    
        epoch_number += 1
    print(f'Best Validation Loss {best_vloss}')
    torch.cuda.empty_cache()
    return model

In [77]:
run = 1
embedding_dim = 64
hidden_size = 64
num_layers = 1
vocab_size = 64
nonlinearity = 'tanh'
model = run_model(vocab_size = len(vocab_dict), 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data,
                  epochs = 20)

EPOCH 1:
LOSS train 0.006737415671348571 valid 0.6773178577423096
EPOCH 2:
LOSS train 0.0066872814297676084 valid 0.6698739528656006
EPOCH 3:
LOSS train 0.006597165942192077 valid 0.6575964689254761
EPOCH 4:
LOSS train 0.006508150577545166 valid 0.6427116990089417
EPOCH 5:
LOSS train 0.006447979867458343 valid 0.6270498037338257
EPOCH 6:
LOSS train 0.006156076610088349 valid 0.6130244731903076
EPOCH 7:
LOSS train 0.005974364697933197 valid 0.6045231819152832
EPOCH 8:
LOSS train 0.00631960329413414 valid 0.5952804088592529
EPOCH 9:
LOSS train 0.006017493426799774 valid 0.592774510383606
EPOCH 10:
LOSS train 0.005771614491939545 valid 0.588695764541626
EPOCH 11:
LOSS train 0.005776144176721573 valid 0.5893983840942383
EPOCH 12:
LOSS train 0.00584258359670639 valid 0.595373809337616
EPOCH 13:
LOSS train 0.005733406633138657 valid 0.5871719717979431
EPOCH 14:
LOSS train 0.005535050421953201 valid 0.604561448097229
EPOCH 15:
LOSS train 0.00584706711769104 valid 0.5810458660125732
EPOCH 16:


This is some very aggressive overfitting.  After research, the embedding DIM is usually between 1 and 300.  I'll try shrinking that.  I'll also shrink the hidden size to 128

In [81]:
# Initializing in a separate cell so we can easily add more epochs to the same run

run = run+1

embedding_dim = 128
hidden_size = 128
num_layers = 1
vocab_size = 30000
nonlinearity = 'tanh'

model = run_model(vocab_size = len(vocab_dict), 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data,
                  epochs = 20)

EPOCH 1:
LOSS train 0.0066006632447242735 valid 0.6659048199653625
EPOCH 2:
LOSS train 0.00641293329000473 valid 0.651095986366272
EPOCH 3:
LOSS train 0.006378297686576843 valid 0.6264338493347168
EPOCH 4:
LOSS train 0.006140410900115967 valid 0.6097590327262878
EPOCH 5:
LOSS train 0.006155564665794372 valid 0.601556658744812
EPOCH 6:
LOSS train 0.006025470733642578 valid 0.5853719711303711
EPOCH 7:
LOSS train 0.005687238365411759 valid 0.5822876691818237
EPOCH 8:
LOSS train 0.005852213084697723 valid 0.5750299692153931
EPOCH 9:
LOSS train 0.005674470394849777 valid 0.5734764933586121
EPOCH 10:
LOSS train 0.0059062220454216005 valid 0.5759503245353699
EPOCH 11:
LOSS train 0.005705447643995285 valid 0.5779459476470947
EPOCH 12:
LOSS train 0.005604698300361633 valid 0.5784727931022644
EPOCH 13:
LOSS train 0.006115310966968536 valid 0.5824260115623474
EPOCH 14:
LOSS train 0.005391271531581879 valid 0.5769169330596924
EPOCH 15:
LOSS train 0.005829308271408081 valid 0.5875175595283508
EPOCH

No Joy.  I'm going to switch from tanh to something else.  I'll try ReLU

In [82]:
# Initializing in a separate cell so we can easily add more epochs to the same run

run = run+1

embedding_dim = 32
hidden_size = 32
num_layers = 1
nonlinearity = 'relu'

model = run_model(vocab_size = len(vocab_dict), 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data,
                  epochs = 20)

EPOCH 1:
LOSS train 0.006768856287002563 valid 0.6833497285842896
EPOCH 2:
LOSS train 0.006859190702438354 valid 0.680534839630127
EPOCH 3:
LOSS train 0.00674616813659668 valid 0.6781796216964722
EPOCH 4:
LOSS train 0.006859686195850373 valid 0.6732800006866455
EPOCH 5:
LOSS train 0.006767591953277588 valid 0.6681431531906128
EPOCH 6:
LOSS train 0.006643198132514953 valid 0.6611102819442749
EPOCH 7:
LOSS train 0.0063378349542617795 valid 0.65326327085495
EPOCH 8:
LOSS train 0.006465935587882996 valid 0.6417407989501953
EPOCH 9:
LOSS train 0.006419484615325928 valid 0.631295382976532
EPOCH 10:
LOSS train 0.006036305785179138 valid 0.6238367557525635
EPOCH 11:
LOSS train 0.005908311903476715 valid 0.6109274625778198
EPOCH 12:
LOSS train 0.006298208296298981 valid 0.6023268103599548
EPOCH 13:
LOSS train 0.0063184854388237 valid 0.5930426716804504
EPOCH 14:
LOSS train 0.005994602084159851 valid 0.5899097323417664
EPOCH 15:
LOSS train 0.0056389237940311435 valid 0.5910115242004395
EPOCH 16:

Not alot of learning going due to overfitting.  I was hoping that RelU would fix this.  I'm shrinking the vocab size and trying again.

In [83]:
# Initializing in a separate cell so we can easily add more epochs to the same run

run = run+1

embedding_dim = 32
hidden_size = 32
num_layers = 2
nonlinearity = 'relu'

model = run_model(vocab_size = len(vocab_dict), 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data,
                  epochs = 20)

EPOCH 1:
LOSS train 0.006884817123413086 valid 0.6820456385612488
EPOCH 2:
LOSS train 0.006775200486183166 valid 0.6784233450889587
EPOCH 3:
LOSS train 0.006733012616634369 valid 0.6708706617355347
EPOCH 4:
LOSS train 0.006508873879909515 valid 0.6584452390670776
EPOCH 5:
LOSS train 0.0064526451230049135 valid 0.6398960947990417
EPOCH 6:
LOSS train 0.0062542703151702884 valid 0.620219349861145
EPOCH 7:
LOSS train 0.0060516902208328244 valid 0.6071058511734009
EPOCH 8:
LOSS train 0.0063573973774909975 valid 0.6013250946998596
EPOCH 9:
LOSS train 0.005788935840129852 valid 0.5949277877807617
EPOCH 10:
LOSS train 0.005978150069713592 valid 0.5881658792495728
EPOCH 11:
LOSS train 0.005724472910165787 valid 0.5850135684013367
EPOCH 12:
LOSS train 0.005747364342212677 valid 0.5825825333595276
EPOCH 13:
LOSS train 0.005679448515176773 valid 0.5801957249641418
EPOCH 14:
LOSS train 0.006018897324800492 valid 0.5766094923019409
EPOCH 15:
LOSS train 0.005630888521671295 valid 0.5783380270004272
E

A bit counter intuitive, but I'll try to increase the number of hidden layers.

In [84]:
# Initializing in a separate cell so we can easily add more epochs to the same run

run = run+1

embedding_dim = 32
hidden_size = 32
num_layers = 2
vocab_size = 10000
nonlinearity = 'relu'

model = run_model(vocab_size = len(vocab_dict), 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data,
                  epochs = 20)

EPOCH 1:
LOSS train 0.006865632236003876 valid 0.6830936670303345
EPOCH 2:
LOSS train 0.006851979553699494 valid 0.6799517869949341
EPOCH 3:
LOSS train 0.006662387192249298 valid 0.6714770793914795
EPOCH 4:
LOSS train 0.006688854575157165 valid 0.6555174589157104
EPOCH 5:
LOSS train 0.006489135205745697 valid 0.6390665769577026
EPOCH 6:
LOSS train 0.006297772765159607 valid 0.6252726912498474
EPOCH 7:
LOSS train 0.00639112389087677 valid 0.6120244860649109
EPOCH 8:
LOSS train 0.005783014297485352 valid 0.6005529165267944
EPOCH 9:
LOSS train 0.005871547162532806 valid 0.5932407379150391
EPOCH 10:
LOSS train 0.005842251688241959 valid 0.5905552506446838
EPOCH 11:
LOSS train 0.005498228996992111 valid 0.5869067907333374
EPOCH 12:
LOSS train 0.005487593919038773 valid 0.582530677318573
EPOCH 13:
LOSS train 0.005773815870285034 valid 0.5813212990760803
EPOCH 14:
LOSS train 0.005673999160528183 valid 0.579251766204834
EPOCH 15:
LOSS train 0.005907477021217346 valid 0.5778605937957764
EPOCH 1

Going back, one of the advantages of the RNN is that is recursive.  I'll change that setting in the model setup and try again.

In [85]:
class Tweets(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, nonlinearity):
        super(Tweets, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn_layer = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers,
                                nonlinearity=nonlinearity, bias=True, bidirectional=True)
        self.output_layer = nn.Linear(embedding_dim, 1)

    def forward(self, input_text):
        # Assuming input_text is already a LongTensor that has been prepared outside the model
        embedded = self.embedding(input_text)
        rnn_output, _ = self.rnn_layer(embedded)
        output = rnn_output[:, -1, :]  # Extract the last timestep output
        final_output = self.output_layer(output)
        return final_output.squeeze()  # Squeeze to remove the extra dimension

In [86]:
embedding_dim = 32
hidden_size = 32
num_layers = 1
vocab_size = 10000
nonlinearity = 'relu'

model = run_model(vocab_size = len(vocab_dict), 
                  embedding_dim = embedding_dim,
                  hidden_size = hidden_size, 
                  num_layers= num_layers, 
                  nonlinearity = nonlinearity,
                  output_data = output_data,
                  epochs = 20)

EPOCH 1:


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x64 and 32x1)