In [2]:
import pandas as pd 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import re 

import torch
import torch.nn as nn
import torch.optim as optim
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


# Disaster Tweets Analysis 

https://github.com/himansharma21/NLP

We will attempt to create a model to predict if a given tweet was written about a natural disaster. 


https://www.kaggle.com/c/nlp-getting-started/overview

In [3]:
data = pd.read_csv(
    'train.csv',          
)

data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Data Cleaning

Natural language data — particularly tweets — often requires extensive preprocessing due to its informal and inconsistent structure.  
In this case, our cleaning steps will include:

- Converting all text to **lowercase** to ensure uniformity.
- **Removing mentions** (e.g., `@username`), which typically do not carry meaningful information for our modeling task.
- **Eliminating extra whitespace** to standardize the formatting.

These cleaning steps help simplify the text, reduce noise, and ensure that the model focuses on the most relevant linguistic content.


In [4]:
url_pattern = r"http\S+|www\S+|https\S+"
mention_pattern = r"@\w+"

def clean_text(text):
    text = text.lower()
    text = re.sub(url_pattern, "", text)
    text = re.sub(mention_pattern, "", text)
    text = re.sub(r"\s+", " ", text) 
    return text.strip()

data["cleaned_text"] = data["text"].astype(str).apply(clean_text)

In [43]:
data.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this #earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby #alaska as ...


### Cleaned Text

At this stage, we have successfully generated a clean version of the text data.  
As a result, the original raw text column is no longer needed and can be safely removed.  
Additionally, the **keyword** and **location** columns are not required for our modeling purposes, so we will drop these columns as


In [44]:
data_clean = data.drop(columns=["keyword","location","id","text"])
data_clean.head()

Unnamed: 0,target,cleaned_text
0,1,our deeds are the reason of this #earthquake m...
1,1,forest fire near la ronge sask. canada
2,1,all residents asked to 'shelter in place' are ...
3,1,"13,000 people receive #wildfires evacuation or..."
4,1,just got sent this photo from ruby #alaska as ...


# Training

### Train-Test Split

To prepare for model training, we will divide the dataset into an **80% training set** and a **20% test set**.  


In [5]:
from nltk.tokenize import TweetTokenizer
from collections import Counter


X_train, X_test = train_test_split(
    data, 
    test_size=0.2,       
    random_state=42,      
    stratify=data['target'] 
)

### Tokenizer

To process the natural language text for modeling, we require a tokenizer that can appropriately segment the text into individual tokens.  
Given the nature of our data we will use the **`TweetTokenizer`** from the **NLTK** library, which is specifically designed to handle the unique characteristics of tweets, such as hashtags, mentions, and emoticons.

`TweetTokenizer` is a rule based tokenizer which handles tweets well. It has built in support for emojis and common twitter slang. Despite not being pretrained, `TweetTokenizer` is recognized as an effective for tweet processing.

https://www.nltk.org/api/nltk.tokenize.casual.html


In [22]:
def tokenize(X_train,  max_len = 100):
    tokenizer = TweetTokenizer()

    tokenized_texts = [tokenizer.tokenize(text) for text in X_train['cleaned_text']]

    all_tokens = [token for tokens in tokenized_texts for token in tokens]
    vocab = {token: idx + 1 for idx, (token, _) in enumerate(Counter(all_tokens).most_common())}  # +1 because 0 will be padding

    encoded_texts = [[vocab[token] for token in tokens if token in vocab] for tokens in tokenized_texts]

    padded_texts = np.zeros((len(encoded_texts), max_len), dtype=int)
    for i, seq in enumerate(encoded_texts):
        length = min(len(seq), max_len)
        padded_texts[i, :length] = seq[:length]

    X_tensor = torch.tensor(padded_texts, dtype=torch.long)
    y_tensor = torch.tensor(X_train['target'].values, dtype=torch.float32)

    dataset = TensorDataset(X_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

    return vocab, dataloader


vocab, dataloader = tokenize(X_train)


### LSTM Model Overview

We use an LSTM for this task because tweets are sequential text data, and LSTMs are effective at capturing temporal relationships and dependencies between words, which improves classification performance on natural language.


In [23]:

class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=64, output_dim=1):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        out = self.fc(hidden[-1])
        return self.sigmoid(out).squeeze()
    


def train_model(model, epochs, optimizer, criterion):

    for epoch in range(epochs):
        model.train()
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    return model 
    


def evaluate(model, X_test, vocab, max_len = 100, batch_size=32):
    model.eval()
    tokenizer = TweetTokenizer()
    
    tokenized_texts = [tokenizer.tokenize(text) for text in X_test['cleaned_text']]
    encoded_texts = [[vocab[token] for token in tokens if token in vocab] for tokens in tokenized_texts]
    
    padded_texts = np.zeros((len(encoded_texts), max_len), dtype=int)
    for i, seq in enumerate(encoded_texts):
        length = min(len(seq), max_len)
        padded_texts[i, :length] = seq[:length]

    X_tensor = torch.tensor(padded_texts, dtype=torch.long)
    y_tensor = torch.tensor(X_test['target'].values, dtype=torch.float32)


    test_dataset = TensorDataset(X_tensor, y_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            preds = (outputs > 0.5).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    return accuracy



### Initial Model Performance

Our initial model shows a poor performance of 42.9%. We will attempt hyperparamter tuning to increase our model accuracy.

In [24]:
model = LSTM(vocab_size = len(vocab), hidden_dim=64)


criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

model = train_model(model=model,epochs=5, criterion=criterion, optimizer=optimizer)


acc = evaluate(model, X_test, vocab)
print("Model accuracy: {}".format(acc))

Model accuracy: 0.4294156270518713


### Hyperparameter Tuning

Changing the optimizer from **SGD** to **Adam** resulted in an increase in test accuracy. This suggests that, for this specific model and dataset, **Adam** may be better suited.

In [25]:
vocab, dataloader = tokenize(X_train)

model = LSTM(vocab_size = len(vocab), hidden_dim=64)


criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

model = train_model(model=model,epochs=1, criterion=criterion, optimizer=optimizer)


acc = evaluate(model, X_test, vocab)
print("Model accuracy: {}".format(acc))

Model accuracy: 0.5705843729481287


# Conclusion

Our hyperparameter tuning efforts resulted in a modest improvement in model performance.  
To achieve further gains, future work would likely need to focus on either modifying the dataset — such as through additional preprocessing or feature engineering — or making fundamental changes to the model architecture itself to better capture the underlying patterns in the data.
