# Natural Language Processing with Disaster Tweets

In [1]:
import re                                                 # regular Expression
import string                                             # used for its string properties
import numpy as np                                        # Numpy for Linear Algebra
import pandas as pd                                       # Pandas for DataFrames
from matplotlib import pyplot as plt                      # Matplotlib for plots
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

from wordcloud import WordCloud                           # Wordclouds

# Scikit-Learn
from sklearn.model_selection import train_test_split

# Keras
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.layers import Dense, Input, Dropout

# Bert Transformers
from transformers import TFBertModel, BertModel
from transformers import BertTokenizer

# spaCy
import spacy
nlp = spacy.load('en_core_web_sm')  # other options: en_core_web_md, en_core_web_lg

## Load Data

- There are 4 shared features: *id*, *keyword*, *location*, *text*
- There is one feature to predict: *target*
- There are 7613 entries for *training data* and 3263 for *test data*.

In [2]:
# load train and test data
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

# shuffle and reindex
# NOTE: sample() returns a new frame, so the result has to be bound back to the
# module-level names. Rebinding a loop variable (`for df in [...]: df = ...`)
# leaves both frames in their original order.
train_df = train_df.sample(frac = 1).reset_index(drop = True)
test_df = test_df.sample(frac = 1).reset_index(drop = True)

# print basic properties
print(f"Training Data has {train_df.shape[0]} rows and {train_df.shape[1]} columns: ({list(train_df.columns)})")
print(f"Testing Data has {test_df.shape[0]} rows and {test_df.shape[1]} columns: ({list(test_df.columns)})")

## Exploratory Data Analysis (EDA)

### Target feature

- This feature breaks down into two **unbalanced** categories (use *accuracy* metric with caution):
    - 1 (4358 records, 56%)
    - 0 (3255 records, 44%)
- It is suggested from [here](https://www.kaggle.com/wrrosa/keras-bert-using-tfhub-modified-train-data) that some targets have not been correctly categorized and must be fixed.

In [3]:
# fix mis-classified targets
# The values below are tweet ids (the `id` column) taken from the referenced kernel,
# not row positions, so rows are selected by matching on `id` rather than on the
# index label.
error_ids = [328, 443, 513, 2619, 3640, 3900, 4342, 5781, 6552, 6554, 6570, 6701, 6702, 6729, 6861, 7226]
train_df.loc[train_df['id'].isin(error_ids), 'target'] = 0

# plot categories frequencies
train_df['target'].plot(kind = 'hist', title = 'Target categories frequencies')
plt.show()

### Keyword feature
Keywords, where does this come from? How is it constructed? We don't know the answer to these two vital questions, but it seems like a good feature to work on.
- There are 61 (<1%) missing entries on this column.
- Spaces have been replaced by *%20* and must be fixed.
- There are only 221 unique values out of the 7491 entries (about ~3%).
- Duplicate entries appear in different forms (e.g. wrecked, wreckage and wreck) and must be unified.
- Some keywords (like *wound*, *volcano*, *violent storm* and ...) are all related to disaster tweets, this feature can prove useful in our final prediction/validation.

In [4]:
# count null entries
null_entries = train_df.loc[train_df['keyword'].isnull()]
null_entries_count = len(null_entries)
print(f'Number of missing entries: {null_entries_count} (<{round(null_entries_count / train_df.shape[0] * 100)}%)')

# count unique entries
# nunique() ignores NaN, whereas unique() would report the missing value as one more keyword
unique_keywords_count = train_df['keyword'].nunique()
print(f'Number of unique keywords: {unique_keywords_count} (~{round(unique_keywords_count/train_df.shape[0]*100)}%)')

In [5]:
# Three preprocessing steps must be done on the keyword column before further investigation:
#    1. Manually fix a few column entries: wildfire
#    2. Fix spacing formats
#    3. Lemmatize to make all keywords uniform

# lemmatize text
def lemmatize_text(text):
    """Return `text` with every spaCy token replaced by its lemma, joined by single spaces."""
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)

# Clean the keyword column of both frames in place; missing keywords stay missing.
for df in [train_df, test_df]:
    keyword_not_na = df['keyword'].notnull()

    # Fix column entries
    df.loc[df['keyword'] == 'wildfire', 'keyword'] = 'wild fire'

    # Fix space encoding
    keywords = df.loc[keyword_not_na, 'keyword'].apply(lambda x: x.replace('%20', ' '))

    # Lemmatize
    # Keywords repeat heavily, so spaCy is run once per distinct keyword and the
    # result is mapped back onto the rows instead of re-parsing every row.
    lemma_of = {keyword: lemmatize_text(keyword) for keyword in keywords.unique()}
    df.loc[keyword_not_na, 'keyword'] = keywords.map(lemma_of)

In [6]:
# count tweets per (keyword, target) and keep the 25 most frequent keywords
n_keywords = 25

# rows: keyword, columns: target (0 / 1), values: number of tweets;
# a keyword that never occurs with one of the targets gets a 0 for it
kt_counts = train_df.groupby(by = ['keyword', 'target'])['text'].count().unstack(fill_value = 0)
# make sure both target columns exist even if one class is absent from the data
kt_counts = kt_counts.reindex(columns = [0, 1], fill_value = 0)

# rank keywords by their total number of tweets and keep the top ones
top_keywords = kt_counts.sum(axis = 1).nlargest(n_keywords).index
kt_agg = kt_counts.loc[top_keywords]

# prepare data to be plot on stacked-bar plot (one entry per keyword)
kt_agg_X = kt_agg.index.tolist()
kt_agg_Y0 = kt_agg[0].values
kt_agg_Y1 = kt_agg[1].values

# plot the stack-bar
fig = plt.figure(figsize = (20, 5))
plt.bar(x = kt_agg_X, height = kt_agg_Y0, color = 'b', label = 'Non-Disaster')
# `bottom` puts the disaster counts on top of the non-disaster ones instead of over them
plt.bar(x = kt_agg_X, height = kt_agg_Y1, bottom = kt_agg_Y0, color = 'g', label = 'Disaster')
plt.title("Keywords/Disastrous state count")
plt.xticks(rotation = 20)
plt.legend()
plt.show()

### Location feature

I honestly don't like this feature, because I don't know where it comes from and how it is constructed. Is it the location of the user making the tweet? If so, then it won't be of any help. (Imagine you are in USA and tweeting about a disaster in Germany!)

Lmao; it also has all kinds of weird entries in it which makes matters worse, for instance: *Live On Webcam*, *milky way*, *Twitter Lockout in progress*, *Your Sister's Bedroom*.

In [7]:
# drop the `location` column from both frames (in place)
for frame in (train_df, test_df):
    frame.drop('location', axis = 1, inplace = True)

### Text feature
This is the most important feature of the dataset. Having a quick look at it, I figured:
- Extracting **hashtags** sounds like a good idea.
- **Mentions** must be removed as usernames aren't necessarily meaningful.
- **Links** can be scraped. This can potentially be a good source of extra information.
- Removing **emojis** and **non-ASCII characters** is helpful.
- Words with too few characters must be removed (careful with this one)
- **HTML Tags** must be removed.
- Fix **abbreviations** in the text. (dictionary of abbreviations taken from [here](https://www.kaggle.com/rftexas/text-only-kfold-bert))

Alternative preprocessings:
- **Lemmatize** all words (reduces dimensionality of the data)
- Remove all **stopwords** (also reduces dimensionality)
- Remove **punctuation** (depends on the model you are using)

NOTE: Which preprocessing steps you apply depends purely on the model you are developing. Whether Dense, LSTM or Embedding layers are used, the preprocessing methods should change accordingly.

In [8]:
# Extract hashtags in form of #<str> (the text after '#' must have at least 2 characters)
def extract_hashtags(text):
    """Return the lower-cased hashtags of `text` without the leading '#'.

    Tags whose text after '#' is shorter than 2 characters are ignored.
    Returns NaN when no usable hashtag is found, including the case where
    hashtags exist but all of them are too short, so callers never receive
    an empty list.
    """
    hashtags = [tag.lower() for tag in re.findall(r'#(\w{2,})', text)]
    return hashtags if hashtags else np.nan

# Extract links
def extract_links(text):
    """Return the list of http(s) URLs found in `text`, or NaN when there is none."""
    links = re.findall(r'\bhttps?://\S+', text)
    if not links:
        return np.nan
    return links

# Add the extracted `hashtags` and `links` columns to both frames
for frame in (train_df, test_df):
    tweets = frame['text']
    frame['hashtags'] = tweets.apply(extract_hashtags)
    frame['links'] = tweets.apply(extract_links)

In [9]:
# Abbreviation -> expansion lookup table.
# Keys must be lower-case: the lookup lower-cases each word before searching,
# so a key containing upper-case letters could never match.
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hwy" : "highway",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w/e" : "whatever",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

# Change an abbreviation to its true word
def fix_abbrev(text):
    """Expand every whitespace-separated word of `text` that is a known abbreviation.

    Words are looked up lower-cased in `abbreviations`; unknown words are kept
    unchanged. The result is re-joined with single spaces.
    """
    expanded = []
    for word in text.split():
        expanded.append(abbreviations.get(word.lower(), word))
    return ' '.join(expanded)

# Replace sad smiley faces (e.g. ":(", ":-/", "=\") with the word "sad"
def transcription_sad(text):
    """Return `text` with each sad-face emoticon replaced by the word 'sad'."""
    return re.sub(r'[8:=;][\'\-]?[(\\/]', 'sad', text)

# Replace the heart emoticon "<3" with the word "love"
def transcription_heart(text):
    """Return `text` with every '<3' replaced by 'love'."""
    return text.replace('<3', 'love')

# Remove URLs
def remove_urls(text):
    """Return `text` without http(s):// links and without 'www.' addresses."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

# Remove HTML
def remove_html(text):
    """Return `text` with every '<...>' span (shortest match, within one line) deleted."""
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

# Converts text to lowercase
def to_lower(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

# Remove words containing numbers
def remove_numbers(text):
    """Return `text` with every run of word characters that contains a digit deleted."""
    digit_word = re.compile(r'\w*\d\w*')
    return digit_word.sub('', text)

# Remove text in brackets
def remove_brackets(text):
    """Return `text` with every '[...]' span (shortest match) deleted, brackets included."""
    bracketed = re.compile(r'\[.*?\]')
    return bracketed.sub('', text)

# Remove mentions
def remove_mentions(text):
    """Return `text` with every '@' and the word characters directly after it deleted."""
    mention = re.compile(r'@\w*')
    return mention.sub('', text)

# Remove hashtags
def remove_hashtags(text):
    """Return `text` with every '#' and the word characters directly after it deleted."""
    hashtag = re.compile(r'#\w*')
    return hashtag.sub('', text)

# Remove emojis
def remove_emojis(text):
    """Return `text` without characters from the four code-point ranges listed below."""
    emoji_class = (
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "]+"
    )
    return re.sub(emoji_class, '', text, flags = re.UNICODE)

# Remove non-ASCII characters
def remove_non_ascii(text):
    """Return `text` with every character that cannot be encoded as ASCII dropped."""
    ascii_bytes = text.encode("ascii", errors="ignore")
    return ascii_bytes.decode()

# Remove stopwords
def remove_stopwords(text):
    """Return the spaCy tokens of `text` that are not flagged as stop words, joined by spaces."""
    kept = (token.text for token in nlp(text) if not token.is_stop)
    return ' '.join(kept)

# Remove punctuation
def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted."""
    # a translation table that maps each punctuation character to None deletes it
    delete_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(delete_punctuation)

# Remove white space (Extra step, just in case)
def remove_whitespace(text):
    """Return `text` with leading/trailing whitespace removed and inner runs reduced to one space."""
    words = text.split()
    return ' '.join(words)

# Filter out words with too few characters (`max_short_len` characters or fewer; 2 by default)
def filter_words(text, max_short_len = 2):
    """Return `text` without its short words.

    Parameters:
        text: string that is split on whitespace.
        max_short_len: longest word length that is still dropped; only words
            with more characters than this are kept.

    Returns:
        The kept words joined by single spaces.
    """
    return ' '.join([word for word in text.split() if len(word) > max_short_len])

In [10]:
# Build the `clean_text` column of both frames from the raw `text` column.
for df in [train_df, test_df]:

    text = df['text']

    # Convert to lowercase
    text = text.apply(to_lower)

    # Remove HTML tags
    # Done before the entities are decoded, so that a decoded '<' ... '>' pair
    # (e.g. "<3 ... ->") is not mistaken for a tag and deleted with its content.
    text = text.apply(remove_html)

    # Replace symbols
    # NOTE: Series.replace() only swaps values that match as a whole; replacing
    # inside each string needs the .str accessor.
    text = text.str.replace(r'&amp;?', ' and ', regex = True)
    text = text.str.replace('&lt;', '<', regex = False)
    text = text.str.replace('&gt;', '>', regex = False)

    # Manual Lemmatize
    text = text.str.replace('won\'t', 'will not', regex = False)
    text = text.str.replace('can\'t', 'cannot', regex = False)
    text = text.str.replace('i\'m', 'i am', regex = False)
    text = text.str.replace('ain\'t', 'is not', regex = False)

    # Remove mentions and links (hashtags too?)
#     text = text.apply(remove_hashtags)
    text = text.apply(remove_mentions)
    text = text.apply(remove_urls)

    # Fix abbreviations
    text = text.apply(fix_abbrev)

    # Remove texts within brackets
#     text = text.apply(remove_brackets)

    # Fix emojies
    text = text.apply(transcription_sad)   # Sad emojies
    text = text.apply(transcription_heart) # Heart emoji
    text = text.apply(remove_emojis)       # General emojies

    # Remove non-ASCII characters
    text = text.apply(remove_non_ascii)

    # Remove words contaning numbers
    text = text.apply(remove_numbers)

    # Remove stopwords
#     text = text.apply(remove_stopwords)

    # Remove punctuations
    text = text.apply(remove_punctuation)

    # Lemmatize text
#     text = text.apply(lemmatize_text)

    # Remove words with few characters
#     text = text.apply(filter_words)

    # Collapse the gaps left behind by the removals above; this also turns a
    # whitespace-only entry into '' so the check below catches it
    text = text.apply(remove_whitespace)

    # Fill entry if turns out empty
    text = text.apply(lambda x: x if x != '' else '?')

    df['clean_text'] = text

In [11]:
fig, axs = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))

# one panel per class: (target value, panel title); disaster tweets on the left
panels = [
    (1, "Disaster Tweets (Clean text)"),
    (0, "Non-Disaster Tweets (Clean text)"),
]

for ax, (target_value, title) in zip(axs, panels):
    # all cleaned tweets of this class joined into one corpus
    corpus = " ".join(train_df.loc[train_df['target'] == target_value, 'clean_text'])
    word_cloud = WordCloud()
    word_cloud.generate(corpus)
    ax.imshow(word_cloud)
    ax.set_title(title)
    ax.axis('off')

plt.show()

## Develop the Model

In [13]:
# encode data for BERT and create attention masks

TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")

def get_BERT_encode(data, maximum_len) :
    """Tokenize the `clean_text` column of `data` with TOKENIZER.

    Parameters:
        data: frame holding a `clean_text` column of strings.
        maximum_len: value passed to the tokenizer as `max_length`.

    Returns:
        Two numpy arrays built from the per-row `input_ids` and
        `attention_mask` lists, in row order.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values themselves: `data['clean_text'][i]` is a label lookup
    # and only works while the index is exactly 0..n-1.
    # NOTE(review): `pad_to_max_length` is reported as deprecated by newer transformers
    # releases in favour of `padding='max_length', truncation=True` - TODO migrate once
    # the installed version is confirmed.
    for tweet in data['clean_text']:
        encoded = TOKENIZER.encode_plus(tweet,
                                        add_special_tokens = True,
                                        max_length = maximum_len,
                                        pad_to_max_length = True,
                                        return_attention_mask = True)

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

# model inputs: [input ids, attention masks] for each data set (train is encoded first)
train_X = list(get_BERT_encode(train_df, 60))
test_X = list(get_BERT_encode(test_df, 60))

train_input_ids, train_attention_masks = train_X
test_input_ids, test_attention_masks = test_X

In [14]:
# define model architecture

bert_base = TFBertModel.from_pretrained('bert-base-uncased')

def get_BERT_model(model_layer, learning_rate, max_len = 60):
    """Build and compile a binary classifier on top of `model_layer`.

    Parameters:
        model_layer: callable applied to `[input_ids, attention_masks]`.
        learning_rate: learning rate given to the Adam optimizer.
        max_len: length of the two integer input sequences; must match the
            length the inputs were padded to (60 by default).

    Returns:
        The compiled Keras model (binary cross-entropy loss, accuracy metric).
    """
    input_ids = Input(shape = (max_len,), dtype = 'int32')
    attention_masks = Input(shape = (max_len,), dtype = 'int32')

    # second element of the layer output
    # (presumably BERT's pooled sentence representation - TODO confirm for the installed version)
    output = model_layer([input_ids, attention_masks])[1]
#     output = tf.keras.layers.Dense(64, activation = 'relu')(output)
    output = Dropout(0.2)(output)
    output = Dense(units = 1, activation = 'sigmoid')(output)

    model = Model(inputs = [input_ids,attention_masks],outputs = output)
    # `learning_rate` is the current keyword; `lr` is its deprecated alias
    model.compile(Adam(learning_rate = learning_rate), loss = 'binary_crossentropy', metrics = ['accuracy'])

    return model

model = get_BERT_model(bert_base, learning_rate = 1e-5)
model.summary()

### Training & Evaluation

In [None]:
# define training parameters
epochs = 5
batch_size = 16
validation_split = 0.1

# define training callbacks
checkpoint = ModelCheckpoint(
    filepath = 'model.h5',
    verbose = 1,
    save_best_only = True,
    save_weights_only = True
)
# The learning rate can only be reduced if
#   - patience is shorter than the early-stopping patience (3) and the number of epochs (5),
#   - min_lr lies below the starting learning rate (1e-5); a floor of 0.001 is above it.
reduce_lr = ReduceLROnPlateau(
    factor = 0.2,
    verbose = 1,
    patience = 1,
    min_lr = 1e-7
)
early_stopping = EarlyStopping(patience = 3)

# train the model
history = model.fit(
    x = train_X,
    y = train_df['target'],
    epochs = epochs,
    batch_size = batch_size,
    validation_split = validation_split,
    callbacks = [reduce_lr, checkpoint, early_stopping]  # All callbacks monitor val_loss by default
)

In [None]:
# plot all the metrics used for training the model
# Metrics and epoch count are read from the training history: the plotted metrics are
# those that also have a `val_` counterpart, and the number of recorded epochs can be
# smaller than the requested one when early stopping ends the run.
METRICS = [name for name in history.history if 'val_' + name in history.history]

# squeeze = False keeps `axs` two-dimensional even when there is a single metric
fig, axs = plt.subplots(1, len(METRICS), figsize = (18, 5), squeeze = False)

for ax, metric in zip(axs[0], METRICS):
    EPOCHS = len(history.history[metric])
    ax.set_title('Analysis of ' + metric)
    ax.plot(range(EPOCHS), history.history[metric], label = metric)
    ax.plot(range(EPOCHS), history.history['val_' + metric], label = 'val_' + metric)
    ax.legend()

### Making a Prediction
The final model used for prediction is the one with best *val_loss* metric. (The selection is taken care of by **ModelCheckpoint**).

In [None]:
# restore the weights saved by the checkpoint callback
model.load_weights('model.h5')

# predict, then round each output to 0 / 1 and flatten to one label per row
pred = model.predict(test_X, verbose = 2)
n_rows = pred.shape[0]
pred = np.round(pred).astype(int).reshape(n_rows)

# write the submission file: one (id, target) row per test tweet
submission = pd.DataFrame({'id': test_df['id'], 'target': pred})
submission.to_csv('./submission.csv', index = False)