In [28]:
#ignore all warnings
import warnings
warnings.filterwarnings(action='ignore')

#importing libraries
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline as PP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# What we're going to cover:


1. Downloading a text dataset
2. Visualizing text data
3. Converting text into numbers using tokenization
4. Turning our tokenized text into an embedding
5. Modelling a text dataset
6. Starting with a baseline (TF-IDF)
7. Building several deep learning text models
8. Dense, LSTM, GRU, Conv1D, Transfer learning
9. Comparing the performance of each of our models
10. Combining our models into an ensemble
11. Saving and loading a trained model
12. Find the most wrong predictions

In [3]:
#unzip data
# NOTE(review): this cell previously contained only the unfinished expression
# `pd.re` (presumably a truncated `pd.read_csv(...)` call). It performed no
# work and most likely raises AttributeError on a fresh kernel, which would
# stop a "Restart & Run All". The CSV files are read directly in the cells
# below, so no code is required here.
# TODO: if the data ships as an archive, add the extraction step here.

In [3]:
# Load the training data (includes the `target` label column used below)
# and preview its first rows.
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Load the test data and preview its first rows.
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# (number of rows, number of columns) of the training frame.
train_df.shape

(7613, 5)

In [10]:
# Only the training file is used for modelling here, so carve a validation
# split out of it.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df['text'].to_numpy(),    # inputs: the raw text column
    train_df['target'].to_numpy(),  # labels: the target column
    test_size=0.2,                  # 20% of the rows are held out for validation
    random_state=42,                # fixed seed so the split is reproducible
)

# Converting text to numbers
There are 2 ways to turn text to numbers
1. Tokenization
* Word-Level Tokenization
* Character Level Tokenization
* Sub-Word Tokenization
2. Embedding
* Use a layer (such as tf.keras.layers.Embedding) to create a new Embedding
* Or use pre-learned Embedding

In [11]:
# Build the layer that turns raw strings into sequences of integer token ids.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=None,                            # no cap on the vocabulary size
    standardize='lower_and_strip_punctuation',  # lowercase and drop punctuation
    split='whitespace',                         # split tokens on whitespace
    ngrams=None,                                # single tokens only, no n-grams
    output_mode='int',                          # emit one integer id per token
    output_sequence_length=None,                # no fixed output length
    pad_to_max_tokens=False,
)

In [14]:
# Fit the text vectorizer on the training sentences. Only the training split
# is passed in, so the validation sentences are not used here.
text_vectorizer.adapt(train_sentences)

In [18]:
# Take the first training sentence and display it next to its vectorized form
# (the vectorizer is called on a one-element list).
sample_sentence = train_sentences[0]
sample_sentence, text_vectorizer([sample_sentence])

('Courageous and honest analysis of need to use Atomic Bomb in 1945. #Hiroshima70 Japanese military refused surrender. https://t.co/VhmtyTptGR',
 <tf.Tensor: shape=(1, 18), dtype=int64, numpy=
 array([[17198,     8,  4962,  2917,     6,   156,     5,   438,   216,
           118,     4,  2377,  4973,  1177,   240,  4381,  4119, 14803]],
       dtype=int64)>)

# It is time to build models. Specifically, we'll be building the following:

- Model 0: Naive Bayes (baseline)
- Model 1: Feed-forward neural network (dense model)
- Model 2: LSTM model
- Model 3: GRU model
- Model 4: Bidirectional-LSTM model
- Model 5: 1D Convolutional Neural Network
- Model 6: TensorFlow Hub Pretrained Feature Extractor
- Model 7: Same as model 6 with 10% of training data

# Model 0: Baseline Model

In [24]:
# Baseline: TF-IDF features feeding a multinomial Naive Bayes classifier.
model_0 = PP(
    steps=[
        ('tdf', TfidfVectorizer()),  # turn raw text into TF-IDF features
        ('clf', MultinomialNB()),    # classifier trained on those features
    ]
)

# Train on the raw training sentences and their labels; the fit() call is the
# last expression, so its result is what the cell displays.
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tdf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [27]:
# Score the baseline pipeline on the validation split. The value is multiplied
# by 100 and rounded to two decimals only for the printed message;
# model_0_score itself keeps the raw value returned by score().
model_0_score = model_0.score(val_sentences,val_labels)
print(f'The Accuracy Score of Baseline model is {round(model_0_score * 100,2)}%')

The Accuracy Score of Baseline model is 79.97%


In [33]:
# Baseline predictions for the validation sentences, kept for the metric
# calculations that follow.
model_0_predictions = model_0.predict(val_sentences)

In [56]:
# Helper that gathers a model's accuracy, recall, precision and F1 in one dict.
def calculate_metrics(y_true, y_pred):
    """Compute the evaluation metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``'accuracy'``, ``'recall'``, ``'precision'`` and
        ``'f1_score'``, each rounded to two decimals.

    Note:
        ``'f1_score'`` is multiplied by 100 before rounding, while the other
        three values are left as returned by their scoring functions. The
        entries are therefore on different scales; keep that in mind when
        comparing or plotting them together.
    """
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1_scaled = f1_score(y_true, y_pred) * 100  # deliberately kept on the x100 scale

    return {
        'accuracy': round(accuracy, 2),
        'recall': round(recall, 2),
        'precision': round(precision, 2),
        'f1_score': round(f1_scaled, 2),
    }

In [57]:
# Metrics of the baseline predictions against the validation labels.
calculate_metrics(val_labels, model_0_predictions)

{'accuracy': 0.8, 'recall': 0.63, 'precision': 0.86, 'f1_score': 72.84}

# Model 1: Simple Dense Model

In [None]:
# Hyperparameters for the dense model
max_features = 10000   # vocabulary size the embedding table is sized for
embedding_dim = 16     # length of each learned token vector

# NOTE(review): the `text_vectorizer` defined earlier uses max_tokens=None and
# already emits ids above 10000 (17198 for the sample sentence). If this model
# is fed that vectorizer's output, such ids fall outside the
# (max_features + 1)-row embedding table. Confirm that the inputs come from a
# vectorizer capped at max_tokens=max_features, or size the table from the
# adapted vocabulary.

# Model: embedding -> dropout -> average over the sequence -> dropout -> 1 logit
model_1 = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(max_features + 1, embedding_dim),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    # Dense lives in tf.keras.layers (tf.keras.Dense does not exist).
    # No activation: the layer outputs a single raw logit per example.
    tf.keras.layers.Dense(1)
])

# Compile the model. The target is binary (0/1), so use binary cross-entropy
# computed from the raw logit instead of a regression loss, and track accuracy
# with a 0.0 threshold because the output is a logit rather than a probability.
model_1.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer = tf.keras.optimizers.Adam(),
                metrics = [tf.keras.metrics.BinaryAccuracy(threshold=0.0)],
                )