In [1]:
# Keras
import tensorflow as tf
# Non-Keras Loads
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta

#Cleaning Loads
import regex as re
import emoji
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import requests
from collections import Counter

#Visualization
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\isaac\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
train_path = "nlp-getting-started/train.csv"
test_path = "nlp-getting-started/test.csv"
train = pd.read_csv(train_path, header = 0)
train = train.drop(labels = ["keyword", "location", "id"], axis = 1)
test = pd.read_csv(test_path, header = 0)
test = test.drop(labels = ["keyword", "location"], axis = 1)
print("train")
print(train.head())
print("test")
print(test.head())

train
                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1
2  All residents asked to 'shelter in place' are ...       1
3  13,000 people receive #wildfires evacuation or...       1
4  Just got sent this photo from Ruby #Alaska as ...       1
test
   id                                               text
0   0                 Just happened a terrible car crash
1   2  Heard about #earthquake is different cities, s...
2   3  there is a forest fire at spot pond, geese are...
3   9           Apocalypse lighting. #Spokane #wildfires
4  11      Typhoon Soudelor kills 28 in China and Taiwan


# EDA
The dataset has 57% non-disaster tweets, and 43% Disaster tweets.  There are 31924 unique words.  This will drive my tuning the vectorization of the model.

In [6]:
train['target'].describe()

count    7613.00000
mean        0.42966
std         0.49506
min         0.00000
25%         0.00000
50%         0.00000
75%         1.00000
max         1.00000
Name: target, dtype: float64

In [7]:
unique_words = set()
for text in train["text"]:
    unique_words.update(text.split())
print(len(unique_words))

31924


Cleaning

Standard tweet cleaning. Cleaning found at: https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python
Tokenizatin

I tolkenized the tweets in preparation to convert to tensors for embedding.


In [8]:
# Thank you Chatgpt for this
def download_file_from_github(url):
    """Download a file from a GitHub URL and return its contents as a list of lines."""
    response = requests.get(url)
    if response.status_code == 200:
        lines = response.text.splitlines()  # Split the content into lines
        return lines  # You could change this to `set(lines)` if you need a set instead of a list
    else:
        raise Exception(f"Failed to download file: {response.status_code}")

In [54]:
output = pd.DataFrame()
stop_words = download_file_from_github("https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/raw/stop-words-english1.txt")
contractions = download_file_from_github("https://gist.githubusercontent.com/J3RN/ed7b420a6ea1d5bd6d06/raw/acda66b325a2b4d7282fb602a7551912cdc81e74/contractions.txt")
def cleaning(line):
    tweet = line['text']
    tweet = tweet.lower()
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = emoji.replace_emoji(tweet, '') #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = re.sub(r'[^a-z]', ' ', tweet) # Strip all symbols and replace with +
    tweet = re.sub(r'\b\w{1,2}\b+', '', tweet) #get rid of all words <= 2 characters
    #Tolkenize the Text
    word_tokens = word_tokenize(tweet)
    word_tokens = [w for w in word_tokens if not w in stop_words]
    word_tokens = [w for w in word_tokens if not w in contractions]
    
    #tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    #tweet = tknzr.tokenize(tweet)
    return word_tokens 

train['cleaned_text'] = train.apply(cleaning, axis = 1)
test['cleaned_text'] = test.apply(cleaning, axis = 1)


# Return text to string for processing later
train['cleaned_text'] = train['cleaned_text'].apply(" ".join)
test['cleaned_text'] = test['cleaned_text'].apply(" ".join)


In [55]:
print(train.head(20))
zero_length_lists = train[train['cleaned_text'].apply(lambda x: len(x) == 0)]
print("Rows processed with 0 remaining words")
print(zero_length_lists)

                                                 text  target  \
0   Our Deeds are the Reason of this #earthquake M...       1   
1              Forest fire near La Ronge Sask. Canada       1   
2   All residents asked to 'shelter in place' are ...       1   
3   13,000 people receive #wildfires evacuation or...       1   
4   Just got sent this photo from Ruby #Alaska as ...       1   
5   #RockyFire Update => California Hwy. 20 closed...       1   
6   #flood #disaster Heavy rain causes flash flood...       1   
7   I'm on top of the hill and I can see a fire in...       1   
8   There's an emergency evacuation happening now ...       1   
9   I'm afraid that the tornado is coming to our a...       1   
10        Three people died from the heat wave so far       1   
11  Haha South Tampa is getting flooded hah- WAIT ...       1   
12  #raining #flooding #Florida #TampaBay #Tampa 1...       1   
13            #Flood in Bago Myanmar #We arrived Bago       1   
14  Damage to school bus 

# Additional EDA.  
Thats intresting.  All the items that are all stopwords are going to be no an emergency.

In [56]:
# Split the data into train and test sets
train_dat = train[['cleaned_text','target']].sample(frac = .9)
test_dat = train[['cleaned_text','target']].drop(train_dat.index)

Vectorizing

I used Keras internal vectorizer to vectorize the text. I

In [57]:
train_tf_data = tf.data.Dataset.from_tensor_slices(
    (train_dat['cleaned_text'].values, 
     train_dat['target'].values))
test_tf_data = tf.data.Dataset.from_tensor_slices(
    (test_dat['cleaned_text'].values, 
     test_dat['target'].values))

In [58]:
# https://www.tensorflow.org/text/tutorials/text_classification_rnn

VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_tf_data.map(lambda text, label: text))
vocab = np.array(encoder.get_vocabulary())
print(vocab[:20])
#encoded_example = encoder(train_tf_data)[:3].numpy()
#print(encoded_example)

['' '[UNK]' 'amp' 'fire' 'news' 'people' 'don' 'video' 'disaster'
 'emergency' 'police' 'body' 'california' 'time' 'storm' 'day' 'crash'
 'burning' 'man' 'suicide']


In [59]:
# https://www.tensorflow.org/text/tutorials/text_classification_rnn

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])


In [60]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])


In [64]:
history = model.fit(train_tf_data, epochs=10,
                    validation_data=test_tf_data,
                    validation_steps=30)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)


Epoch 1/10


TypeError: 'NoneType' object is not callable