In [1]:
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [2]:
# Torch loads
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils

import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torcheval.metrics.functional import binary_accuracy

# Non-Torch Loads
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta

#Cleaning Loads
import regex as re
import emoji
import nltk
from nltk.tokenize import word_tokenize
import requests

#Visualization
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

  _torch_pytree._register_pytree_node(


In [3]:
train_path = "nlp-getting-started/train.csv"
test_path = "nlp-getting-started/test.csv"
train = pd.read_csv(train_path, header = 0)
train = train.drop(labels = ["keyword", "location", "id"], axis = 1)
test = pd.read_csv(test_path, header = 0)
test = test.drop(labels = ["keyword", "location"], axis = 1)
print("train")
print(train.head())
print("test")
print(test.head())

train
                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1
2  All residents asked to 'shelter in place' are ...       1
3  13,000 people receive #wildfires evacuation or...       1
4  Just got sent this photo from Ruby #Alaska as ...       1
test
   id                                               text
0   0                 Just happened a terrible car crash
1   2  Heard about #earthquake is different cities, s...
2   3  there is a forest fire at spot pond, geese are...
3   9           Apocalypse lighting. #Spokane #wildfires
4  11      Typhoon Soudelor kills 28 in China and Taiwan


# EDA
The dataset has 57% non-disaster tweets, and 43% Disaster tweets.  There are 31924 unique words.  This will drive my tuning the vectorization of the model.

In [4]:
train['target'].describe()

count    7613.00000
mean        0.42966
std         0.49506
min         0.00000
25%         0.00000
50%         0.00000
75%         1.00000
max         1.00000
Name: target, dtype: float64

In [5]:
unique_words = set()
for text in train["text"]:
    unique_words.update(text.split())
print(len(unique_words))

31924


# Cleaning
Standard tweet cleaning.  Cleaning found at:
https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python

# Tokenizatin
I tolkenized the tweets in preparation to convert to tensors for embedding.


In [6]:
# Thank you Chatgpt for this
def download_file_from_github(url):
    """Download a file from a GitHub URL and return its contents as a list of lines."""
    response = requests.get(url)
    if response.status_code == 200:
        lines = response.text.splitlines()  # Split the content into lines
        return lines  # You could change this to `set(lines)` if you need a set instead of a list
    else:
        raise Exception(f"Failed to download file: {response.status_code}")

In [7]:
vocab = nltk.lm.Vocabulary()
output = pd.DataFrame()
stop_words = download_file_from_github("https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/raw/stop-words-english1.txt")
contractions = download_file_from_github("https://gist.githubusercontent.com/J3RN/ed7b420a6ea1d5bd6d06/raw/acda66b325a2b4d7282fb602a7551912cdc81e74/contractions.txt")
def cleaning(line):
    tweet = line['text']
    tweet = tweet.lower()
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = emoji.replace_emoji(tweet, '') #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = re.sub(r'[^a-z]', ' ', tweet) # Strip all symbols and replace with +
    tweet = re.sub(r'\b\w{1,2}\b+', '', tweet) #get rid of all words <= 2 characters
    #Tolkenize the Text
    word_tokens = word_tokenize(tweet)
    word_tokens = [w for w in word_tokens if not w in stop_words]
    word_tokens = [w for w in word_tokens if not w in contractions]
    #tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    #tweet = tknzr.tokenize(tweet)
    vocab.update(word_tokens)
    return word_tokens 
train['cleaned_text'] = train.apply(cleaning, axis = 1)
train['text'] = train['cleaned_text']
train = train.drop(labels = ['cleaned_text'], axis = 1)
test['cleaned_text'] = test.apply(cleaning, axis = 1)
test['text'] = test['cleaned_text']
test = test.drop(labels = ['cleaned_text'], axis = 1)

In [8]:
print(train.head(20))
zero_length_lists = train[train['text'].apply(lambda x: len(x) == 0)]
print("Rows processed with 0 remaining words")
print(zero_length_lists)

                                                 text  target
0         [deeds, reason, earthquake, allah, forgive]       1
1                 [forest, fire, ronge, sask, canada]       1
2   [residents, asked, shelter, place, notified, o...       1
3   [people, receive, wildfires, evacuation, order...       1
4   [photo, ruby, alaska, smoke, wildfires, pours,...       1
5   [rockyfire, update, california, hwy, closed, d...       1
6   [flood, disaster, heavy, rain, flash, flooding...       1
7                            [top, hill, fire, woods]       1
8   [emergency, evacuation, happening, building, s...       1
9                     [afraid, tornado, coming, area]       1
10                         [people, died, heat, wave]       1
11  [haha, south, tampa, flooded, hah, wait, live,...       1
12  [raining, flooding, florida, tampabay, tampa, ...       1
13              [flood, bago, myanmar, arrived, bago]       1
14  [damage, school, bus, multi, car, crash, break...       1
15      

# Additional EDA.  
Thats intresting.  All the items that are all stopwords are going to be no an emergency.

In [9]:
train = train[train['text'].apply(lambda x: len(x) != 0)]

In [10]:
# Split the data into train and test sets
train_dat = train.sample(frac = .9)
test_dat = train.drop(train_dat.index)

In [11]:
vocab_dict = {}
for item in list(vocab):
    vocab_dict[item] = vocab[item]
print(len(vocab_dict))

16548


# Vectorizing
I used pytorchs internal vectorizer to vectorize the text.  I


In [12]:
# Dataset Importing from https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader

#tokenize the text for tensor load
def tokenize(text):
    # Your tokenizer logic here
    return [vocab[token] for token in text.split()]

class CustomTextDataset(Dataset):
    def __init__(self, data,vocab, transform=None, target_transform=None):
        self.data = data
        self.vocab = vocab
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tweet = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]
        tokenized_tweet = [self.vocab.get(token, 0) for token in tweet]  # Tokenize and handle unknown tokens
        tweet_tensor  = torch.tensor(tokenized_tweet, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.float)
        
        if self.transform:
            tweet_tensor = self.transform(tweet_tensor)
        if self.target_transform:
            label_tensor = self.target_transform(label_tensor)

        return tweet_tensor, label_tensor

def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        text_list.append(_text.clone().detach())
    return pad_sequence(text_list, batch_first=True), torch.tensor(label_list, dtype=torch.float)

# Creating training and validation datasets
training_set = CustomTextDataset(train_dat, vocab_dict)
validation_set = CustomTextDataset(test_dat, vocab_dict)
#test set = CustomTextDataset(val_dat, vocab)

# Creating data loaders
training_loader = torch.utils.data.DataLoader(training_set, batch_size=32, shuffle=True, collate_fn=collate_batch)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=32, shuffle=False, collate_fn=collate_batch)


# Model approach
I'm choosing to use a RNN were I only select the last value as the output, so many in and 1 out.  

In [25]:
# Initial Setup
best_model_params = []
class Tweets_RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, nonlinearity):
        super(Tweets_RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn_layer = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers,
                                nonlinearity=nonlinearity, bias=True, bidirectional=True)
        # Add TanH Layer
        
        self.dropout = nn.Dropout(p=.75)
        #self.relu = nn.ReLU()
        self.linear = nn.Linear(hidden_size * 2, 1)
        # Step Down linear layers?
        #self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, input_text):
        # Assuming input_text is already a LongTensor that has been prepared outside the model
        embedded = self.embedding(input_text)
        rnn_output, _ = self.rnn_layer(embedded)
        dropout = self.dropout(rnn_output)
        output = dropout[:, :, :]
        linear = self.linear(output)
        final_output = linear.mean(dim=1)
        #print(final_output.shape)
        return final_output.squeeze()  # Squeeze to remove the extra dimension


In [26]:


def train_one_epoch(epoch_index, model,training_loader, loss_fn, optimizer, tb_writer):
    running_loss = 0.
    last_loss = 0.
    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs_cuda = inputs.to('cuda')
        labels_cuda = labels.to('cuda')

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs_cuda)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels_cuda)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 10 == 9:
            last_loss = running_loss / 1000 # loss per batch
            #print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0

    return last_loss

# test execution
#model = Tweets(vocab_size=30000, embedding_dim =1024).to('cuda')
#loss_fn = nn.BCEWithLogitsLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001)
#tb_writer = SummaryWriter('logs')
#train_one_epoch(1,model, training_loader, loss_fn, optimizer, tb_writer)
#tb_writer.close()

output_data = pd.DataFrame()

Ok, .68 validation loss.  Thats not good.  I'm going to run a optomizer to try and find the optimal variables.


In [72]:
best_global_vloss = 1

# Initializing in a separate cell so we can easily add more epochs to the same run
def run_model(output_data, model, epochs = 20):
    
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
    
    epoch_number = 0
    
    best_vloss = 1.
    best_vacc = 0
    best_vacc_count = 0
    
    for epoch in range(epochs):
        print('EPOCH {}:'.format(epoch_number + 1))
    
        # Make sure gradient tracking is on, and do a pass over the data
        model.train(True)
        avg_loss = train_one_epoch(epoch_number,model, training_loader, loss_fn, optimizer, writer)
        
    
        running_vloss = 0.0
        running_vacc = 0.0
        # Set the model to evaluation mode, disabling dropout and using population
        # statistics for batch normalization.
        model.eval()
        # Disable gradient computation and reduce memory consumption.
        with torch.no_grad():
            for i, vdata in enumerate(validation_loader):
                vinputs, vlabels = vdata
                vinputs_cuda = vinputs.to('cuda')
                vlabels_cuda = vlabels.to('cuda')
                voutputs = model(vinputs_cuda)
                vloss = loss_fn(voutputs, vlabels_cuda)
                vacc = binary_accuracy(voutputs, vlabels_cuda)
                running_vloss += vloss
                running_vacc += vacc.item()
                
        avg_vloss = running_vloss / (i + 1)
        avg_vacc = running_vacc / (i + 1)
        print('LOSS train {} valid {} Binary Acc {}'.format(avg_loss, avg_vloss, avg_vacc))
        
        # Log the running loss averaged per batch
        # for both training and validation
        writer.add_scalars('Training vs. Validation Loss',
                        { 'Training' : avg_loss, 'Validation' : avg_vloss},
                        epoch_number + 1)
        writer.flush()
        new_row = {
            "run": run_num,
            "train_loss": str(avg_loss),
            "val_loss": str(avg_vloss),
            "val_acc": avg_vacc
        }
        output_data = pd.concat([output_data, pd.Series(new_row)], ignore_index=True)
        # Track best performance, and save the model's state
        global best_global_vloss
        global best_model_params
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
        if avg_vloss < best_global_vloss:
            model_path = 'best_model.model'
            best_global_vloss = avg_vloss
            torch.save(model.state_dict(), model_path)
        #Calculate Best VAccuracy
        if best_vacc < avg_vacc:
            best_vacc = avg_vacc
            best_vacc_count = 0
        else: 
            best_vacc_count += 1

        #Cutoff the model if the VACC goest down after 2 cycles
        #if best_vacc_count == 3:
            #break
        
        epoch_number += 1
    print(f'Best Validation Loss {best_vloss}  Best Validation Accuracy {best_vacc}')
    torch.cuda.empty_cache()
    return model

In [None]:

run_num = 1
embedding_dim = 64
hidden_size = 64
num_layers = 2
nonlinearity = 'relu'

model = Tweets_RNN(vocab_size=len(vocab_dict), embedding_dim = embedding_dim,
                   hidden_size = hidden_size, num_layers= num_layers,
                   nonlinearity = nonlinearity
                  ).to('cuda')

run = run_model(  output_data = output_data,
                  epochs = 40, model = model)

EPOCH 1:
LOSS train 0.006784742295742035 valid 0.6564794778823853 Binary Acc 0.6143229169150194
EPOCH 2:
LOSS train 0.006382430672645569 valid 0.6174231767654419 Binary Acc 0.618593749900659
EPOCH 3:
LOSS train 0.0060724313855171205 valid 0.5975783467292786 Binary Acc 0.6764062494039536
EPOCH 4:
LOSS train 0.005368231505155563 valid 0.585707426071167 Binary Acc 0.6563541665673256
EPOCH 5:
LOSS train 0.005397511601448059 valid 0.563495397567749 Binary Acc 0.6753124992052714
EPOCH 6:
LOSS train 0.005509853363037109 valid 0.5580012798309326 Binary Acc 0.7085937509934107
EPOCH 7:
LOSS train 0.0053938995599746705 valid 0.5517502427101135 Binary Acc 0.7072916676600774
EPOCH 8:
LOSS train 0.00611026644706726 valid 0.5468348264694214 Binary Acc 0.7111979176600774
EPOCH 9:
LOSS train 0.005376096844673156 valid 0.5433369874954224 Binary Acc 0.7020833343267441
EPOCH 10:
LOSS train 0.005261266320943833 valid 0.5419866442680359 Binary Acc 0.7085937509934107
EPOCH 11:
LOSS train 0.005532274186611175

In [None]:
class Tweets_LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, bias, num_layers, dropout):
        super(Tweets_LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm_layer = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, 
                                  num_layers=num_layers,bias = True,
                                  bidirectional=True, dropout = dropout)
        # Add TanH Layer
        
        self.dropout = nn.Dropout(p=.75)
        #self.relu = nn.ReLU()
        self.linear = nn.Linear(hidden_size * 2, 1)
        # Step Down linear layers?
        #self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, input_text):
        # Assuming input_text is already a LongTensor that has been prepared outside the model
        embedded = self.embedding(input_text)
        lstm_output, _ = self.lstm_layer(embedded)
        dropout = self.dropout(lstm_output)
        output = dropout[:, :, :]
        linear = self.linear(output)
        final_output = linear.mean(dim=1)
        #print(final_output.shape)
        return final_output.squeeze()  # Squeeze to remove the extra dimension

In [88]:

run_num = 2
embedding_dim = 64
hidden_size = 64
num_layers = 2

model = Tweets_LSTM(vocab_size=len(vocab_dict), 
                    embedding_dim = embedding_dim,
                    hidden_size = hidden_size, 
                    num_layers= num_layers, 
                    dropout = 0.5, bias = True
                  ).to('cuda')

run = run_model(  output_data = output_data,
                  epochs = 40, model = model)

EPOCH 1:
LOSS train 0.006653462767601013 valid 0.6654738783836365 Binary Acc 0.5691145832339922
EPOCH 2:
LOSS train 0.006608706593513489 valid 0.6449795961380005 Binary Acc 0.595156249900659
EPOCH 3:
LOSS train 0.006077652394771576 valid 0.6134889125823975 Binary Acc 0.6280729162196318
EPOCH 4:
LOSS train 0.005860358119010926 valid 0.5942034721374512 Binary Acc 0.650937500099341
EPOCH 5:
LOSS train 0.005917282462120056 valid 0.5878579020500183 Binary Acc 0.6868229160706202
EPOCH 6:
LOSS train 0.006153228044509887 valid 0.5774403810501099 Binary Acc 0.6656250009934107
EPOCH 7:
LOSS train 0.005667667090892792 valid 0.5714709758758545 Binary Acc 0.6734375009934107
EPOCH 8:
LOSS train 0.005991019546985626 valid 0.5699287056922913 Binary Acc 0.6838541676600774
EPOCH 9:
LOSS train 0.005684522300958633 valid 0.5655550956726074 Binary Acc 0.6812500009934107
EPOCH 10:
LOSS train 0.005284210145473481 valid 0.5634011030197144 Binary Acc 0.6851562509934107
EPOCH 11:
LOSS train 0.005533837646245957

In [89]:
class Tweets_GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, bias, num_layers, dropout):
        super(Tweets_GRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.GRU_layer = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, 
                                  num_layers=num_layers,bias = True,
                                  bidirectional=True, dropout = dropout)
        # Add TanH Layer
        
        self.dropout = nn.Dropout(p=.75)
        #self.relu = nn.ReLU()
        self.linear = nn.Linear(hidden_size * 2, 1)
        # Step Down linear layers?
        #self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, input_text):
        # Assuming input_text is already a LongTensor that has been prepared outside the model
        embedded = self.embedding(input_text)
        gru_output, _ = self.GRU_layer(embedded)
        dropout = self.dropout(gru_output)
        output = dropout[:, :, :]
        linear = self.linear(output)
        final_output = linear.mean(dim=1)
        #print(final_output.shape)
        return final_output.squeeze()  # Squeeze to remove the extra dimension

In [90]:
run_num = 3
#embedding_dim = 32
#hidden_size = 32
#num_layers = 1

model = Tweets_GRU(vocab_size=len(vocab_dict), 
                    embedding_dim = embedding_dim,
                    hidden_size = hidden_size, 
                    num_layers= num_layers, 
                    dropout = 0.5, bias = True
                  ).to('cuda')

run = run_model(  output_data = output_data,
                  epochs = 40, model = model)

EPOCH 1:
LOSS train 0.00673525458574295 valid 0.6602927446365356 Binary Acc 0.5691145832339922
EPOCH 2:
LOSS train 0.006640859603881836 valid 0.6548405885696411 Binary Acc 0.602968749900659
EPOCH 3:
LOSS train 0.006240485608577729 valid 0.632247805595398 Binary Acc 0.602968749900659
EPOCH 4:
LOSS train 0.006185948431491852 valid 0.6161451935768127 Binary Acc 0.6405208334326744
EPOCH 5:
LOSS train 0.006211955487728119 valid 0.6016430854797363 Binary Acc 0.654843750099341
EPOCH 6:
LOSS train 0.006010456323623657 valid 0.5885050892829895 Binary Acc 0.670468750099341
EPOCH 7:
LOSS train 0.005710723817348481 valid 0.5827634334564209 Binary Acc 0.6773437509934107
EPOCH 8:
LOSS train 0.0059095139503479 valid 0.584218442440033 Binary Acc 0.6916666676600774
EPOCH 9:
LOSS train 0.006189924955368042 valid 0.5701404809951782 Binary Acc 0.6734375009934107
EPOCH 10:
LOSS train 0.005406829923391342 valid 0.5701776742935181 Binary Acc 0.6760416676600774
EPOCH 11:
LOSS train 0.005909460335969925 valid 

In [80]:
test = pd.read_csv(test_path, header = 0)
model = Tweets(vocab_size = best_model_params["vocab_size"], 
                                 embedding_dim = best_model_params["embedding_dim"],
                                 hidden_size = best_model_params["hidden_size"], 
                                 num_layers = best_model_params["num_layers"],
                                 nonlinearity = best_model_params["nonlinearity"])
model.load_state_dict(torch.load('best_model.model'))
model.eval()  # Set the model to evaluation mode


NameError: name 'Tweets' is not defined