# Natural Language Processing with Disaster Tweets

Detect disaters from people's tweets

In [1]:
import torch
import os
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import string
import random
import pandas as pd
from nltk.tokenize import RegexpTokenizer

regexp = RegexpTokenizer("[\w']+")
pd.set_option('display.max_colwidth', None)


### Load Data : 

In [2]:
train_df = pd.read_csv("../data/train.csv")
never_seen_df = pd.read_csv("../data/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [4]:
never_seen_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


### Preprocessing : 

In [5]:


acronyms_dict = pd.read_json("../data/acronyms.json", typ = 'series')
acronyms_list = list(acronyms_dict.keys())
def convert_acronyms(text):
    words = []
    for word in regexp.tokenize(text):
        if word in acronyms_list:
            words = words + acronyms_dict[word].split()
        else:
            words = words + word.split()
    
    text_converted = " ".join(words)
    return text_converted


contractions_dict = pd.read_json("../data/contractions.json", typ = 'series')
contractions_list = list(contractions_dict.keys())
# Function to convert contractions in a text
def convert_contractions(text):
    words = []
    for word in regexp.tokenize(text):
        if word in contractions_list:
            words = words + contractions_dict[word].split()
        else:
            words = words + word.split()
    
    text_converted = " ".join(words)
    return text_converted




def transform(text):
    # Removing other unicode characters
    def remove_http(text):
        http = "https?://\S+|www\.\S+" # matching strings beginning with http (but not just "http")
        pattern = r"({})".format(http) # creating pattern
        return re.sub(pattern, "", text)
    def remove_punctuation(text):
        punct_str = string.punctuation
        punct_str = punct_str.replace("'", "") # discarding apostrophe from the string to keep the contractions intact
        return text.translate(str.maketrans("", "", punct_str))
    
    # remove leading space
    text = text.strip()
    # convert to lowercase
    text = text.lower()
    # delete back to new line
    text = re.sub("\n", "", text)
    text = remove_http(text)
    text = text.replace("  ", "")
    # text = remove_punctuation(text)
    text = convert_acronyms(text)
    text = convert_contractions(text)
    
    return text



In [6]:
text_example = train_df["text"][0] + " dob" + " ain't"
print(f"raw text : {text_example}\npreprocessed text : {transform(text_example)}")

raw text : Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all dob ain't
preprocessed text : our deeds are the reason of this earthquake may allah forgive us all date of birth are not


In [7]:
## preprocessing applied
train_df["text"] = train_df.text.apply(lambda x : transform(x))
never_seen_df["text"] = never_seen_df.text.apply(lambda x : transform(x))
# Get tweets and targets
tweets, targets = list(train_df["text"]), list(train_df["target"])

## model and dataset

In [8]:
class TextClassificationDataset(Dataset):
    
    def __init__(self, texts, labels, tokenizer, max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length
            
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}
    
    
class BERTClassifier(nn.Module):
    
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
        
    def forward(self, input_ids, attention_mask):
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = outputs.pooler_output
            x = self.dropout(pooled_output)
            logits = self.fc(x)
            return logits
        
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)


def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            proba = torch.max(nn.Softmax(dim=1)(outputs), dim=1).values.tolist()[0]
    return f"{round(proba, 2)}% Disaster" if preds.item() == 1 else f"{round(proba, 2)}% No Disater"

def predict_batch(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
    return preds

## Training : 

In [9]:
# Set up parameters
bert_model_name = '../data/bert_large_uncased/'
num_classes = 2
max_length = 128
batch_size = 25
num_epochs = 3
learning_rate =  1e-5 #
test_ratio=0.2
device = torch.device("cuda:0")  

# split the dataset to train and validation : 
train_texts, val_texts, train_labels, val_labels = train_test_split(tweets, targets, test_size=test_ratio, random_state=42)
print(f"{len(val_labels)=}")

len(val_labels)=1523


In [10]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# prepare dataset :
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# load Bert model
model = BERTClassifier(bert_model_name, num_classes).to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Some weights of the model checkpoint at ../data/bert_large_uncased/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

Epoch 1/3
Validation Accuracy: 0.8194
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       874
           1       0.79      0.79      0.79       649

    accuracy                           0.82      1523
   macro avg       0.82      0.82      0.82      1523
weighted avg       0.82      0.82      0.82      1523

Epoch 2/3
Validation Accuracy: 0.8391
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       874
           1       0.87      0.73      0.79       649

    accuracy                           0.84      1523
   macro avg       0.85      0.83      0.83      1523
weighted avg       0.84      0.84      0.84      1523

Epoch 3/3
Validation Accuracy: 0.8372
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       874
           1       0.85      0.76      0.80       649

    accuracy                           0.84      1523
   macro avg  

In [12]:

texts =random.sample(list(never_seen_df['text']), k=20)

for text in texts : 
    prediction = predict_sentiment(text, model, tokenizer, device, max_length=128)
    print(f"{prediction=}  {text=}")


prediction='0.9% No Disater'  text='leaving back to sf friday have not packed one single thing 911 emergency'
prediction='0.68% Disaster'  text='thought it was a drought'
prediction='0.59% No Disater'  text='tarekfatah you are burning in enemity of pakistan i m sure you will burn more and more'
prediction='0.98% Disaster'  text='american weapons and support are fueling a bloody air war in yemen'
prediction='0.97% Disaster'  text='to whom we shld ask tht from where this bldy pak terrorist has entered in our country'
prediction='0.97% No Disater'  text='i liked a youtube video from centraluploadoh oh'
prediction='0.99% Disaster'  text='six palestinians kidnapped in west bank hebron home demolished international middle east media center'
prediction='0.97% Disaster'  text='globalwarming u s forest service says spending more than half of budget on fires û_united states the agen'
prediction='0.92% No Disater'  text="ashwilliams1 continues to be the best guest on iloveggletters this week's ep

## Submission : 

In [13]:
text = list(never_seen_df['text'])

preds = predict_batch(text, model, tokenizer, device)
preditions = preds.tolist()
sample_submission = pd.read_csv("../data/sample_submission.csv")
sample_submission["target"] = preditions
os.remove("../data/submission.csv")
sample_submission.to_csv("../data/submission.csv", index=False)
