In [None]:
import re                                                       # Regex for string extraction
import spacy                                                    # Spacy for text processing
import string                                                   # For using string functions and properties
import numpy as np                                              # Linear Algebra
import pandas as pd                                             # Pandas for DataFrame
from matplotlib import pyplot as plt                            # Matplotlib for plots
from sklearn.model_selection import train_test_split            # Used to split the data into training, validation and test sets
from wordcloud import WordCloud                                 # Wordclouds

# Keras packages
from keras.preprocessing.text import Tokenizer                  # Simplifies different tokenizing methods
from keras.utils.vis_utils import plot_model                    # For plotting model
from keras.models import Sequential                             # Sequential model
from keras.layers import Dense                                  # Dense layer

# Load spaCy's small English pipeline (larger alternatives: en_core_web_md, en_core_web_lg)
nlp = spacy.load('en_core_web_sm')

### Load Data
Let's load the data and analyse its primary properties:

In [None]:
# Read the training and test splits from the Kaggle input folder
train_df, test_df = map(pd.read_csv, (
    '../input/nlp-getting-started/train.csv',
    '../input/nlp-getting-started/test.csv',
))

# Keep both frames in one list so later steps can loop over them
combined = [train_df, test_df]

# Take a look at the first rows of the training data
train_df.head(5)

In [None]:
# Print the row count, column count and column names of each split
print(f"- Training Data has {train_df.shape[0]} rows and {train_df.shape[1]} columns: ({list(train_df.columns)})")
print(f"- Testing Data has {test_df.shape[0]} rows and {test_df.shape[1]} columns: ({list(test_df.columns)})")

## Exploratory Data Analysis (EDA)
In this step, we analyze different features of our datasets to identify abnormalities and understand their behavior. We will do the following:
- Check the distribution of our targets in both train & test data. (If they are not balanced, we might have to downsample the majority class, otherwise our model will be biased!)
- Investigate how helpful the **Keyword** feature is and how that can be used in the model.
- Investigate the **Location** feature and check if it is useful for our model.
- Go through the **Text** features and identify problems and possible fixes. Also look for possible features that can be extracted and analyzed later on.

### Target feature distribution

In [None]:
# Count how many training tweets fall into each target class
target_counts = train_df['target'].value_counts()

# One bar per class, drawn through the Axes object
fig, ax = plt.subplots(figsize=(20, 3))
ax.set_title('Target distribution')
ax.set_xlabel('Targets')
ax.set_ylabel('Counts')
ax.set_xticks([0, 1])
ax.bar(target_counts.index, target_counts.values)
plt.show()

Looking at the distribution, I say it's OK. It's not evenly balanced, and that can be something we might want to fix by downsampling the data later when optimizing the model. Downsampling may or may not be a good idea as we lose some of our data! (You have to test it to find out its impact on the model)

### Location feature

In [None]:
# Rows whose location is missing
null_entries = train_df[train_df['location'].isna()]
null_entries_count = null_entries.shape[0]
print(f'- There are {null_entries_count} missing entries on "location" column ({round(null_entries_count / train_df.shape[0] * 100, 1)}%)')

# Distinct location values
unique_locations = pd.unique(train_df['location'])
unique_locations_count = unique_locations.size
print(f'- Number of unique locations: {unique_locations_count} ({round(unique_locations_count/train_df.shape[0]*100, 2)}%)')

In [None]:
# The 20 most frequent locations (value_counts sorts by descending frequency)
location_freq = train_df['location'].value_counts()
top_20_locs = location_freq.head(20)

fig, ax = plt.subplots(figsize=(20, 3))
ax.set_title('20 Most used Locations')
ax.bar(top_20_locs.index, top_20_locs.values)
plt.xticks(rotation=30)
plt.show()

After careful investigation of the **location** column, we can take note that:
- Some countries, although the same, have different naming conventions (e.g. USA, United States)
- There are some formatting errors (eg. T E X A S)
- 33% of the locations are missing. That's a third of the data!
- Around 44% of the existing locations in the dataset are unique.

### Keyword feature

In [None]:
# Rows whose keyword is missing
null_entries = train_df[train_df['keyword'].isna()]
null_entries_count = null_entries.shape[0]
print(f'- There are {null_entries_count} missing entries on "keyword" column ({round(null_entries_count / train_df.shape[0] * 100, 2)}%)')

# Distinct keyword values (the variable names are reused from the location cell)
unique_locations = pd.unique(train_df['keyword'])
unique_locations_count = unique_locations.size
print(f'- Number of unique keyword: {unique_locations_count} ({round(unique_locations_count/train_df.shape[0]*100, 2)}%)')

In [None]:
# The 25 most frequent keywords (value_counts sorts by descending frequency)
keyword_freq = train_df['keyword'].value_counts()
keys_25_locs = keyword_freq.head(25)

fig, ax = plt.subplots(figsize=(20, 3))
ax.set_title('25 Most used Keywords')
ax.bar(keys_25_locs.index, keys_25_locs.values)
plt.xticks(rotation=30)
plt.show()

We understand the followings from a quick analysis of the **keywords** feature:
- Most keywords contain space which is represented by *%20* which should be fixed.
- Most keywords indicate disaster signs according to our graph. Good sign here!
- Approximately 0.8% of the entries are missing a *Keyword*, which is also a good sign!
- Only about 3% of the keywords are unique.

### Text feature

For the sake of practice, I am going to extract as much information as I can from the text, this includes extracting links, hashtags and mentions and anything that can be considered useful. Some of the tasks you might want to do:
- There are lots of #hashtags used that can be helpful, so extract and store them in a list for each tweet.
- There also are some links, however, since links are shortened (encoded), they are useless in their basic form. We could scrape each link for headlines and titles, but I'm going to remove them for simplicity.
- @mentions can be seen in some tweets, maybe they point to a certain user or something, extract the username and store it in a list for each tweet for later analysis.
- There are some emojis and non-ASCII characters that are better removed.
- Numbers can also be removed as they may be a little misleading.
- Some comments are fully written in uppercase, we may want to make them lowercase; Or there are words that don't have to be in upper form, like ALLAH.
- There are comments that are almost identical except their links, we can easily detect and remove the duplicates after extracting links (See text[110:120])
- Maybe finding the common words can be useful for identifying which class each tweet belongs to.
- There are some typos in some texts, they can be fixed with some python packages.
- We could extract emojis!

In [None]:
# Show the raw text of the first 50 training tweets
train_df['text'].head(50).values

#### Feature Extraction

We extract links, hashtags and mentions from the tweet texts and store them in the dataset as lists. A more careful analysis shows that the links have been shortened. You could scrape each link and search for topics and headlines, but since this is time consuming and there aren't many links in the dataset, I decided to use a simpler approach and remove them all.

I also found @mentions useless as they are usernames and usernames can be pretty meaningless and made of random characters. Let's not over-complicate the analysis and remove them as well.

But #hashtags are still very useful as they are used to tag topics and can be used to group tweets.

In [None]:
def extract_hashtag(text):
    """Return the lowercased hashtags found in ``text``, or ``None`` if there are none.

    The leading ``#`` is stripped from every tag. At least one word character
    must follow the ``#``, so a stray ``#`` (as in ``"we are # 1"``) is not
    reported as an empty-string hashtag.

    Args:
        text: the tweet text to scan.

    Returns:
        A list of hashtag names in order of appearance, or ``None`` when the
        text contains no hashtag.
    """
    tags = [tag.lower() for tag in re.findall(r'#(\w+)', text)]
    return tags or None

def extract_mention(text):
    """Return the lowercased @mentions found in ``text``, or ``None`` if there are none.

    The leading ``@`` is stripped from every mention. At least one word
    character must follow the ``@``, so a stray ``@`` (as in ``"meet @ noon"``)
    is not reported as an empty-string mention.

    Args:
        text: the tweet text to scan.

    Returns:
        A list of mentioned names in order of appearance, or ``None`` when the
        text contains no mention.
    """
    names = [name.lower() for name in re.findall(r'@(\w+)', text)]
    return names or None

def extract_links(text):
    """Collect every http(s) URL in ``text``; return ``None`` when none is present."""
    found = re.compile(r'\bhttps?://\S+').findall(text)
    if not found:
        return None
    return found

In [None]:
# Create three new features (#hashtags, @mentions and links) on both splits
for df in combined:
    tweets = df['text']
    df['hashtags'] = tweets.map(extract_hashtag)
    df['mentions'] = tweets.map(extract_mention)
    df['links'] = tweets.map(extract_links)

In [None]:
# Non-null entries per column of the training data
feature_counts = train_df.count()

fig, ax = plt.subplots(figsize=(20, 3))
ax.set_title('Features counts')
ax.set_xlabel('Feature')
ax.set_ylabel('Count')
ax.bar(feature_counts.index, feature_counts.values)
plt.show()

As you can see, many hashtags, mentions and links are missing already. They aren't very beneficial now, sadly.
That brings us to our first place, with no new features to work with. But that's ok because that was necessary to get us to the point where we can be confident in our decisions about removing them from our text data.

#### Wordclouds

In [None]:
# Word cloud for disaster tweets (first 100 rows with target == 1)
disaster_text = " ".join(train_df[train_df['target'] == 1]['text'].head(100))
word_cloud = WordCloud(max_font_size=50).generate(disaster_text)

plt.figure(figsize=(10, 8))
plt.imshow(word_cloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud for non-disaster tweets (first 100 rows with target == 0)
non_disaster_text = " ".join(train_df[train_df['target'] == 0]['text'].head(100))
word_cloud = WordCloud(max_font_size=50).generate(non_disaster_text)

plt.figure(figsize=(10, 8))
plt.imshow(word_cloud)
plt.axis('off')
plt.show()

## Data Cleaning

This is quite an important step. After randomly looking at our text data, I found it useful to do the following:
- Remove all links
- Remove all HTML tags
- Remove Emojis (Using their unicode)
- Remove all @mentions
- Remove all non-ASCII characters
- Lemmatize all words (This is a very important step, since lemmatization is a way to reduce the number of words in the text)
- Remove all stop words (Most stop words don't provide much information)
- Remove all punctuations (Careful with this one, since it might remove important information that can be used in Embeddings later on...)
- Convert all words to lower case (Dangerous for Embeddings! - "MIT" is a university in the US, but "mit" means "with" in German)
- Fix text case (Some comments are fully uppercase, it's a good practice to convert them into lowercase)

In [None]:
# Remove URLs
def remove_urls(text):
    """Strip http(s) URLs and bare ``www.`` addresses from ``text``."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

# Remove HTML
def remove_html(text):
    """Delete anything that looks like an HTML tag (shortest ``<...>`` spans)."""
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

# Converts text to lowercase
def to_lower(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

# Remove numbers
def remove_numbers(text):
    """Delete every run of digits from ``text``."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Remove mentions
def remove_mentions(text):
    """Delete every ``@`` together with the word characters that follow it."""
    mention_pattern = re.compile(r'@\w*')
    return mention_pattern.sub('', text)

# Remove emojis
def remove_emojis(text):
    """Delete characters that fall inside the listed emoji code-point ranges."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
    )
    emoji_pattern = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)

# Remove non-ASCII characters
def remove_non_ascii(text):
    """Keep only the characters that appear in ``string.printable``."""
    allowed = string.printable
    return ''.join(char for char in text if char in allowed)

# Lemmatize text
def lemmatize_text(text):
    """Replace each token of ``text`` with its spaCy lemma, joined by single spaces."""
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return ' '.join(lemmas)

# Remove stopwords
def remove_stopwords(text):
    """Drop the tokens spaCy flags as stop words and join the rest with single spaces."""
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)

# Remove punctuation
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a code point to None tells str.translate to drop that character
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

# Remove white space (Extra step, just in case)
def remove_whitespace(text):
    """Collapse whitespace runs to single spaces and trim both ends."""
    words = text.split()
    return ' '.join(words)

def clean_text(text, lower = True,html = True,urls = True, numbers = True, mentions = True, ascii = True,
               punctuations = True, stopwords = True, lemmatize = True):
    """Run the enabled cleaning steps over ``text`` and return the result.

    Every boolean flag switches one step on or off. The steps always run in
    this order: lowercase, HTML, URLs, numbers, mentions, non-ASCII,
    punctuation, stop words, lemmatization. Whitespace is normalized at the
    end regardless of the flags.
    """
    # (flag, step) pairs listed in execution order
    pipeline = (
        (lower, to_lower),
        (html, remove_html),
        (urls, remove_urls),
        (numbers, remove_numbers),
        (mentions, remove_mentions),
        (ascii, remove_non_ascii),
        (punctuations, remove_punctuation),
        (stopwords, remove_stopwords),
        (lemmatize, lemmatize_text),
    )

    for enabled, step in pipeline:
        if enabled:
            text = step(text)

    # Final tidy-up: steps above can leave double spaces behind
    return remove_whitespace(text)

In [None]:
# Store the cleaned version of every tweet next to the raw text, for both splits
for df in combined:
    df['cleaned_text'] = df['text'].map(clean_text)

train_df.head()

## Deep Learning Model

In [None]:
# Drop records with no text from the training data and replace empty texts in
# the test data with '?'.
# The cleaning step yields '' (not NaN) when nothing survives cleaning, so a
# plain dropna()/fillna() would never match anything; treat both NaN and ''
# as "no text".
empty_train = train_df['cleaned_text'].isna() | train_df['cleaned_text'].eq('')
train_df.drop(index = train_df.index[empty_train], inplace = True)

# Assign through .loc rather than calling fillna(inplace=True) on a selected
# column: the chained in-place form is deprecated in pandas and stops
# updating the frame under Copy-on-Write.
empty_test = test_df['cleaned_text'].isna() | test_df['cleaned_text'].eq('')
test_df.loc[empty_test, 'cleaned_text'] = '?'

# Shuffle
train_df = train_df.sample(frac = 1).reset_index(drop = True)

train_df.info()

### Encoding

In [None]:
# Fit the vocabulary on the training text only; '?' is the out-of-vocabulary token
tokenizer = Tokenizer(lower=True, oov_token='?')
tokenizer.fit_on_texts(train_df['cleaned_text'])

# Modes: 'binary', 'count', 'freq', 'tfidf'
train_encodes, test_encodes = (
    tokenizer.texts_to_matrix(frame['cleaned_text'], mode='count')
    for frame in (train_df, test_df)
)

print('Matrix shape:', train_encodes.shape)

### Building the Model

In [None]:
def build_model(x, loss, optimizer, metrics):
    """Build and compile a network with one hidden layer and a single sigmoid output.

    The input width is taken from ``x.shape[1]``; ``loss``, ``optimizer`` and
    ``metrics`` are passed straight to ``compile``.
    """
    n_features = x.shape[1]
    network = Sequential([
        Dense(units=64, input_shape=(n_features,), activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ])
    network.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    return network

### Training the Model

In [None]:
def train_model(model, x, y, epochs, batch_size, validation_split):
    """Fit ``model`` on ``x``/``y`` and return the ``history`` attribute of the fit result."""
    fit_kwargs = {
        'x': x,
        'y': y,
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_split': validation_split,
    }
    return model.fit(**fit_kwargs).history

### Plotting the Model

In [None]:
def plot_model_training(epochs, history, metrics):
    """Plot training and validation curves, one subplot per metric.

    Args:
        epochs: x-axis values, one per recorded epoch.
        history: mapping from metric name to its per-epoch values. For every
            requested metric it must contain both ``metric`` and
            ``'val_' + metric``.
        metrics: names of the metrics to plot. Nothing is drawn when empty.
    """
    if not metrics:
        return

    # squeeze=False always returns a 2-D array of Axes. With the default, a
    # single metric yields a bare Axes object that cannot be indexed.
    fig, axs = plt.subplots(1, len(metrics), figsize = (18, 5), squeeze = False)

    for ax, metric in zip(axs[0], metrics):
        val_metric = 'val_' + metric
        ax.set_title('Analysis of ' + metric)
        ax.plot(epochs, history[metric], label = metric)
        ax.plot(epochs, history[val_metric], label = val_metric)
        ax.legend()

### All in One

In [None]:
X_train, y_train = train_encodes, train_df['target']

# Build model
model = build_model(X_train, 'binary_crossentropy', 'adam', ['accuracy'])

# Plot model
plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=False)

# Train model
history = train_model(model, X_train, y_train, epochs=1, batch_size=64, validation_split=0.2)

# Plot training process (x axis: one index per recorded epoch)
epochs = list(range(len(history['loss'])))
plot_model_training(epochs, history, ['accuracy', 'loss'])

### Make a Prediction

In [None]:
X_test = test_encodes

# Round the model outputs to integer labels and flatten them to one value per row
probabilities = model.predict(X_test, verbose=2)
pred = np.round(probabilities).astype(int).reshape(len(probabilities))

# Create submission
submission = pd.DataFrame({'id': test_df['id'].tolist(), 'target': pred})
submission.to_csv('./submission.csv', index=False)

In [None]:
kaggle competitions submit -c nlp-getting-started -f submission.csv