<style>
    
    .rendered_html p {
        font-size: 16px;
    }
</style>


This data set is from Kaggle's competition "Natural Language Processing with Disaster Tweets" from https://www.kaggle.com/competitions/nlp-getting-started.

In [108]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import random
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [109]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/kunli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
pd.options.display.max_colwidth = 100


## Read data and explore

In [2]:
df = pd.read_csv("train.csv")

In [24]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [38]:
# check disaster and non-disaster counts 

df.groupby('target').count()

Unnamed: 0_level_0,id,keyword,location,text,hashtag
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4342,4323,2884,4342,885
1,3271,3229,2196,3271,858


#### I think hashtag # is important, espeically the texts after #, so I extracted the words after # in each tweet

In [33]:
df['hashtag'] = df['text'].str.extractall(r'#(\w+)').groupby(level=0).agg(' '.join)

In [34]:
# separate positive and negative tweets to analyze

positive_tweet = df[df['target'] == 1]
negative_tweet = df[df['target'] == 0]

In [37]:
missing_values_by_group = df.groupby('target')['hashtag'].apply(lambda x: x.isnull().sum())
missing_values_by_group

target
0    3457
1    2413
Name: hashtag, dtype: int64

## Pre-processing

In [87]:
# clean symbols. reference: https://python.plainenglish.io/nlp-twitter-sentiment-analysis-using-python-ml-4b4a8fc1e2b 

def clean_text(txt):
    txt = txt.lower()
    txt = contractions.fix(txt)
    txt = re.sub(r"RT[\s]+", "", txt)
    txt = txt.replace("\n", " ")
    txt = re.sub(" +", " ", txt)
    txt = re.sub(r"https?:\/\/\S+", "", txt)
    txt = re.sub(r"(@[A-Za-z0–9_]+)|[^\w\s]|#", "", txt)
    #txt = emoji.replace_emoji(txt, replace='')
    tokens = txt.split()
    filtered = [w for w in tokens if not w in stopwords.words('english')]
    txt.join(filtered)
    txt.strip()
    return txt

In [91]:
df['clean'] = df['text'].apply(clean_text)

In [92]:
# remove stopwords, stemming

df

Unnamed: 0,id,keyword,location,text,target,hashtag,clean
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,earthquake,our deeds are the reason of this earthquake may allah forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1,,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1,,all residents asked to shelter in place are being notified by officers no other evacuation or sh...
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,wildfires,13000 people receive wildfires evacuation orders in california
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,Alaska wildfires,just got sent this photo from ruby alaska as smoke from wildfires pours into a school
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1,,two giant cranes holding a bridge collapse into nearby homes
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part ...,1,,the out of control wild fires in california even in the northern part of the state very troubling
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1,,m194 0104 utc5km s of volcano hawaii
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffer...,1,,police investigating after an ebike collided with a car in little portugal ebike rider suffered ...


In [93]:
# create a new column where numbers are replaced with the word 'quantity'. Because disater tweet always have numbers of people killed, items destroyed. 

def replace_numbers_with_quantity(text):
    # Use regular expression to match numbers
    pattern = r'\b\d+\b'
    replaced_text = re.sub(pattern, 'quantity', text)
    return replaced_text

# Test the function
df['no_number'] = df['clean'].apply(replace_numbers_with_quantity)


## Modeling

Reference https://www.kaggle.com/code/artemzapara/twitter-feeds-classification-with-glove-embeddings

In [100]:
# get word embeddings from GloVe's twitter vector https://nlp.stanford.edu/projects/glove/

path_to_glove_file = 'glove.twitter.27B.200d.txt'

embeddings_index = {}

f = open(path_to_glove_file, 'r', encoding='utf8')

for line in f:
    splitLine = line.split(' ')
    word = splitLine[0]                                  # the first entry is the word
    coefs = np.asarray(splitLine[1:], dtype='float32')   # these are the vectors representing word embeddings
    embeddings_index[word] = coefs
print("Glove data loaded! In total:",len(embeddings_index)," words.")

embeddings_index['wildfire']

Glove data loaded! In total: 1193514  words.


array([-0.074481 ,  0.46586  , -0.21778  ,  0.0045365, -0.95808  ,
       -0.35728  , -0.47167  , -0.55515  , -0.084384 , -0.32704  ,
       -0.040345 , -0.77329  , -0.51059  ,  0.26916  , -0.1164   ,
       -0.25205  ,  0.5357   , -0.04781  ,  0.044159 ,  0.67384  ,
       -0.60659  , -0.19984  , -0.052206 , -0.080398 , -0.47106  ,
        0.97058  , -0.23209  , -0.43689  ,  0.22923  ,  0.70862  ,
       -0.26816  ,  0.055069 ,  0.088242 ,  0.11365  , -0.3306   ,
       -0.49205  ,  0.090598 ,  0.026713 , -0.2069   ,  0.32853  ,
        0.4126   ,  0.052834 , -1.0666   , -0.076318 , -0.42735  ,
        0.51737  ,  0.2213   ,  0.52193  , -0.48834  , -0.45973  ,
        0.55082  ,  0.21589  , -0.0018647, -0.065373 , -0.93301  ,
        0.39697  , -0.44072  ,  0.90262  ,  0.010177 ,  0.60577  ,
       -0.28581  ,  1.0762   , -0.52111  ,  0.099238 , -0.015993 ,
        0.050768 ,  0.32227  ,  0.0712   , -0.45601  , -0.40253  ,
       -0.40376  ,  0.16402  ,  0.69729  , -0.15296  ,  0.0264

In [101]:
# reference: https://www.kaggle.com/code/artemzapara/twitter-feeds-classification-with-glove-embeddings

def train_val_split(df, text_col, target_col, validation_split):
    """
    This function generates the training and validation splits from an input dataframe
    
    Parameters:
        dataframe: pandas dataframe with columns "text" and "target" (binary)
        validation_split: should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the validation split
        text_col, target_col: column names for the text that needs classified and the target as the labels
    
    Returns:
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset
        train_labels: list of labels (0 or 1) in the training dataset
        val_labels: list of labels (0 or 1) in the validation dataset      
    """
       
    text = df[text_col].values.tolist()                         # input text as list
    targets = df[target_col].values.tolist()                    # targets
    
#   Preparing the training/validation datasets
    
    seed = random.randint(1,50)   # random integer in a range (1, 50)
    rng = np.random.RandomState(seed)
    rng.shuffle(text)
    rng = np.random.RandomState(seed)
    rng.shuffle(targets)

    num_validation_samples = int(validation_split * len(text))

    train_samples = text[:-num_validation_samples]
    val_samples = text[-num_validation_samples:]
    train_labels = targets[:-num_validation_samples]
    val_labels = targets[-num_validation_samples:]
    
    print(f"Total size of the dataset: {df.shape[0]}.")
    print(f"Training dataset: {len(train_samples)}.")
    print(f"Validation dataset: {len(val_samples)}.")
    
    return train_samples, val_samples, train_labels, val_labels

In [104]:
train_samples, val_samples, train_labels, val_labels = train_val_split(df, 'no_number', 'target', 0.15)

Total size of the dataset: 7613.
Training dataset: 6472.
Validation dataset: 1141.


In [110]:
def make_embedding_matrix(train_samples, val_samples, embeddings_index):
    """
    This function computes the embedding matrix that will be used in the embedding layer
    
    Parameters:
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset
        embeddings_index: Python dictionary with word embeddings
    
    Returns:
        embedding_matrix: embedding matrix with the dimensions (num_tokens, embedding_dim), where num_tokens is the vocabulary of the input data, and emdebbing_dim is the number of components in the GloVe vectors (can be 50,100,200,300)
        vectorizer: TextVectorization layer      
    """
    
    vectorizer = TextVectorization(max_tokens=30000, output_sequence_length=50)
    text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
    vectorizer.adapt(text_ds)
    
    voc = vectorizer.get_vocabulary()
    word_index = dict(zip(voc, range(len(voc))))
      
    num_tokens = len(voc)
    
    hits = 0
    misses = 0

#   creating an embedding matrix
    embedding_dim = len(embeddings_index['the'])
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
#     print("Converted %d words (%d misses)" % (hits, misses))
    print(f"Converted {hits} words ({misses} misses).")

    return embedding_matrix, vectorizer

In [111]:
embedding_matrix, vectorizer = make_embedding_matrix(train_samples, val_samples, embeddings_index)

Converted 11277 words (2797 misses).


In [116]:
def initialize_nn(embedding_matrix):
    """
    This function initializes Keras model for binary text classification
    
    Parameters:
        embedding matrix with the dimensions (num_tokens, embedding_dim), where num_tokens is the vocabulary size of the input data, and emdebbing_dim is the number of components in the GloVe vectors
    
    Returns:
        model: Keras model    
    """
    
    num_tokens = embedding_matrix.shape[0]
    embedding_dim = embedding_matrix.shape[1]
    
    embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable=False,                # we are not going to train the embedding vectors
    )
    
#   Here we define the architecture of the Keras model. 
    int_sequences_input = keras.Input(shape=(None,), dtype="int64")
    x = embedding_layer(int_sequences_input) 
    x = layers.Dropout(0.5)(x)
    x = layers.LSTM(128,return_sequences=True)(x)
    x = layers.Conv1D(128, 3, activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(64, activation="relu")(x)
    preds = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(int_sequences_input, preds)
    
    return model

In [117]:
initial_model = initialize_nn(embedding_matrix)
initial_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 200)         2814800   
                                                                 
 dropout (Dropout)           (None, None, 200)         0         
                                                                 
 lstm (LSTM)                 (None, None, 128)         168448    
                                                                 
 conv1d (Conv1D)             (None, None, 128)         49280     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                             

In [120]:
def train_nn(model, train_samples, val_samples, train_labels, val_labels, vectorizer, stop = False):
    """
    This function fits the training data using validation data to calculate metrics.
    
    Parameters:
        model: preinitialized Keras model
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset
        train_labels: list of labels (0 or 1) in the training dataset
        val_labels: list of labels (0 or 1) in the validation dataset
        vectorizer: TextVectorization layer
        stop (Boolean): flag for Early Stopping (aborting training when a monitored metric has stopped improving)
    
    Returns:
        model: trained Keras model
        history: callback that can be used to track the learning process
    """
    
    print('')
    print("Training the model...")
    
    model.compile(loss="binary_crossentropy", 
              optimizer="adam", 
              metrics=["acc"])
    
    x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
    x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()
    
    y_train = np.asarray(train_labels).astype('float32').reshape((-1,1))
    y_val = np.asarray(val_labels).astype('float32').reshape((-1,1))
    
    if stop:
        early_stopping = EarlyStopping(monitor='val_loss', patience=1)
        history = model.fit(x_train, y_train, batch_size=32, epochs=40, validation_data=(x_val, y_val), callbacks=[early_stopping], verbose=1)
    else:
        history = model.fit(x_train, y_train, batch_size=32, epochs=40, validation_data=(x_val, y_val), verbose=1)
        
    return model, history

In [121]:
model, history = train_nn(initial_model, train_samples, val_samples, train_labels, val_labels, vectorizer, stop=False)


Training the model...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


### Prediction

In [123]:
def suggest_nn(df, model):
    """
    This function generates (binary) targets from a dataframe with column "text" using trained Keras model
    
    Parameters:
        df: pandas dataframe with column "text"
        model: Keras model (trained)
    
    Output:
        predictions: list of suggested targets corresponding to string entries from the column "text"
    """
    
    string_input = keras.Input(shape=(1,), dtype="string")
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)
    
    probabilities = end_to_end_model.predict(df["no_number"])
    
    predictions = [1 if i > 0.5 else 0 for i in probabilities]
    
    return predictions

In [124]:
test = pd.read_csv('test.csv')

In [125]:
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn
3259,10865,,,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it wa...
3260,10868,,,Green Line derailment in Chicago http://t.co/UtbXLcBIuY
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3


In [126]:
test['clean'] = test['text'].apply(clean_text)

In [127]:
test['no_number'] = test['clean'].apply(replace_numbers_with_quantity)

In [128]:
predictions = suggest_nn(test, model)





In [129]:
submission_data = {"Id": test.id.tolist(), "target": predictions}

submission_df = pd.DataFrame(submission_data)

submission_df.to_csv('submission.csv', index=False)