In [1]:
# https://github.com/mrdbourke/tensorflow-deep-learning/tree/main/docs

In [2]:
## Check gpu
!nvidia-smi

Fri Nov 17 02:47:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py


# Import series of helper functions for the notebook
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys


--2023-11-17 02:47:06--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2023-11-17 02:47:06 (19.3 MB/s) - ‘helper_functions.py’ saved [10246/10246]



## Get a text dataset

In [4]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")

--2023-11-17 02:47:10--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.2.207, 142.250.101.207, 2607:f8b0:4023:c0d::cf
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.2.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-11-17 02:47:11 (129 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



## Become one with the data

In [5]:
import pandas as pd

train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [6]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
# Shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42) # shuffle with random_state=42 for reproducibility
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [9]:
# How many total samples?
len(train_df), len(test_df)

(7613, 3263)

In [10]:
import random
random_index = random.randint(0, len(train_df)-5)
for row in train_df_shuffled[['text', 'target']][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target: {target}", "(real distater)" if target >0 else "(not real disaster)")
  print(f"Text: \n{text}\n")
  print('---\n')


Target: 0 (not real disaster)
Text: 
Never fear quarrels but seek hazardous adventures. https://t.co/dlvZaay7qr

---

Target: 0 (not real disaster)
Text: 
#LOL Plymouth (Û÷LetÛªs Obliterate LitterÛª) http://t.co/GDrssjbH8q

---

Target: 0 (not real disaster)
Text: 
[55436] 1950 LIONEL TRAINS SMOKE LOCOMOTIVES WITH MAGNE-TRACTION INSTRUCTIONS http://t.co/xEZBs3sq0y http://t.co/C2x0QoKGlY

---

Target: 0 (not real disaster)
Text: 
Nike Golf Storm Fit Golf Jacket Black Medium http://t.co/jvAI5Vkmsy: #SportingGoods http://t.co/Nr8JjmpmoS

---

Target: 0 (not real disaster)
Text: 
LONER DIARIES.

The patterns  on the sand
May have been blown away.
The photos in twos
All choked up in flames.... http://t.co/EKfaZ6wVBz

---



## Split data into training and validation sets

In [11]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
  train_df_shuffled['text'].to_numpy(), train_df_shuffled['target'].to_numpy(), test_size=0.1, random_state=42)

In [12]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

## Tokenization

In [13]:
# Check the first ten samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

In [14]:
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [15]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding

# Use the default textvectorization parameters
text_vectorizer = TextVectorization(
    max_tokens=None,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=None,
    pad_to_max_tokens=False,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding='utf-8',

)

In [16]:
# Find the average number of tokens (words) in the training tweets
training_token_avg = round(sum([len(i.split()) for i in train_sentences]) / len(train_sentences))

In [17]:
# Setup text vectorizan variables
max_vocab_length = 10000
max_length = training_token_avg

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode='int',
    output_sequence_length=max_length
)

In [18]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(
    train_sentences
)

In [19]:
sample_sentence = "There is a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [20]:
# Choose a random sentence from the training dataset and tokenize it

random_sentence = random.choice(train_sentences)
print(f"Original text: \n {random_sentence} \n\n \
Vectorized version: ")
text_vectorizer([random_sentence])

Original text: 
 MRW when a sinkhole opens up beneath my friends and I... #gif #funny #lol #comedy #iFunny #video #image #RT http://t.co/XiYdYfptru 

 Vectorized version: 


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1,   45,    3,  364, 1624,   27,    1,   13,  819,    7,    8,
        5570, 1136,  174, 3986]])>

In [21]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5: {top_5_words}")
print(f"Bottom 5: {bottom_5_words}")

Number of words in vocab: 10000
Top 5: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### creating an embedding using an Embedding Layer

In [24]:

embedding = Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length) # how long is each input

embedding

<keras.src.layers.core.embedding.Embedding at 0x7f2122a0f910>

In [25]:
# Get a random sentence from the training set
random_sentence = random.choice(train_sentences)
print(f"Original text: \n {random_sentence}\
          \n\n Embedded version:")

sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text: 
 Some of worst radiation exposure from Fukushima meltdown happened 47km northwest-Proof that small emergency planning zones donÛªt cut it          

 Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 3.4849253e-02, -4.0757071e-02,  1.4261197e-02, ...,
         -1.7492879e-02,  2.6227985e-02, -1.4378130e-02],
        [ 5.1785260e-05, -4.6701059e-03,  3.6760416e-02, ...,
          3.8634453e-02, -3.0010952e-02,  1.0064542e-02],
        [ 6.9993958e-03, -4.0755153e-02,  2.2005923e-03, ...,
          3.0218076e-02, -1.4584769e-02, -2.0024372e-02],
        ...,
        [ 3.4774255e-02, -7.5443760e-03,  3.7321974e-02, ...,
          5.2540377e-04,  9.7869746e-03,  1.5287880e-02],
        [ 2.6714716e-02,  9.1190115e-03, -2.5045622e-02, ...,
         -4.7566772e-02,  4.3092798e-02, -2.7271509e-02],
        [ 4.5393456e-02,  1.9005384e-02,  4.4775162e-02, ...,
         -4.6203732e-03, -3.4973286e-02, -2.3632217e-02]]], dtype=float32)>

In [26]:
# Checkout a single token's embedding
sample_embed[0][0], sample_embed[0][0].shape, random_sentence[0]

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([ 0.03484925, -0.04075707,  0.0142612 , -0.04868471,  0.03563447,
         0.00267484,  0.01346276,  0.01568936, -0.00301464,  0.03411651,
        -0.02276166, -0.03829346, -0.02908015, -0.02962776, -0.00088383,
        -0.02129859, -0.03389311,  0.01712794,  0.02765727, -0.03951589,
         0.04756482,  0.02325528, -0.04268385,  0.01662994,  0.02172916,
         0.02804493, -0.04004539, -0.02358156,  0.01987705, -0.03141383,
         0.03319235, -0.0024281 , -0.03913606,  0.04075534, -0.03689966,
        -0.01920694, -0.04231385, -0.01950368, -0.03576493, -0.00593852,
        -0.01802353,  0.03249795,  0.04702231, -0.04099902,  0.0376975 ,
         0.03826046,  0.01209854, -0.01334764,  0.04051806,  0.03436034,
        -0.04082612, -0.01436759, -0.03349887, -0.04157265,  0.0135431 ,
         0.01106085, -0.03271532,  0.0037046 , -0.00078304,  0.03863107,
        -0.02437531, -0.01175293, -0.00538126,  0.03408292,  0.04585366,
  

# Modelling a text dataset (running a series of experiments)

### Create model 0: Getting a baseline

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

In [28]:
# Evaluate baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 79.27%


In [29]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

In [30]:
# Accuracy
from sklearn.metrics import accuracy_score
sklearn_accuracy = accuracy_score(val_labels,baseline_preds )
sklearn_accuracy

0.7926509186351706

In [31]:
# Precision
from sklearn.metrics import precision_score

sklearn_precision = precision_score(val_labels, baseline_preds)
sklearn_precision

0.8861788617886179

In [32]:
# Confusion matrix
# Note: The following confusion matrix code is a remix of Scikit-Learn's
# plot_confusion_matrix function - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html
import itertools
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Our function needs a different name to sklearn's plot_confusion_matrix
def make_confusion_matrix(y_true, y_pred, classes=None, figsize=(10, 10), text_size=15, norm=False, savefig=False):
  """Makes a labelled confusion matrix comparing predictions and ground truth labels.

  If classes is passed, confusion matrix will be labelled, if not, integer class values
  will be used.

  Args:
    y_true: Array of truth labels (must be same shape as y_pred).
    y_pred: Array of predicted labels (must be same shape as y_true).
    classes: Array of class labels (e.g. string form). If `None`, integer labels are used.
    figsize: Size of output figure (default=(10, 10)).
    text_size: Size of output figure text (default=15).
    norm: normalize values or not (default=False).
    savefig: save confusion matrix to file (default=False).

  Returns:
    A labelled confusion matrix plot comparing y_true and y_pred.

  Example usage:
    make_confusion_matrix(y_true=test_labels, # ground truth test labels
                          y_pred=y_preds, # predicted labels
                          classes=class_names, # array of class label names
                          figsize=(15, 15),
                          text_size=10)
  """
  # Create the confustion matrix
  cm = confusion_matrix(y_true, y_pred)
  cm_norm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] # normalize it
  n_classes = cm.shape[0] # find the number of classes we're dealing with

  # Plot the figure and make it pretty
  fig, ax = plt.subplots(figsize=figsize)
  cax = ax.matshow(cm, cmap=plt.cm.Blues) # colors will represent how 'correct' a class is, darker == better
  fig.colorbar(cax)

  # Are there a list of classes?
  if classes:
    labels = classes
  else:
    labels = np.arange(cm.shape[0])

  # Label the axes
  ax.set(title="Confusion Matrix",
         xlabel="Predicted label",
         ylabel="True label",
         xticks=np.arange(n_classes), # create enough axis slots for each class
         yticks=np.arange(n_classes),
         xticklabels=labels, # axes will labeled with class names (if they exist) or ints
         yticklabels=labels)

  # Make x-axis labels appear on bottom
  ax.xaxis.set_label_position("bottom")
  ax.xaxis.tick_bottom()

  ### Added: Rotate xticks for readability & increase font size (required due to such a large confusion matrix)
  plt.xticks(rotation=70, fontsize=text_size)
  plt.yticks(fontsize=text_size)

  # Set the threshold for different colors
  threshold = (cm.max() + cm.min()) / 2.

  # Plot the text on each cell
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    if norm:
      plt.text(j, i, f"{cm[i, j]} ({cm_norm[i, j]*100:.1f}%)",
              horizontalalignment="center",
              color="white" if cm[i, j] > threshold else "black",
              size=text_size)
    else:
      plt.text(j, i, f"{cm[i, j]}",
              horizontalalignment="center",
              color="white" if cm[i, j] > threshold else "black",
              size=text_size)

  # Save the figure to the current working directory
  if savefig:
    fig.savefig("confusion_matrix.png")


In [33]:
from sklearn.metrics import confusion_matrix

sklearn_cm = confusion_matrix(val_labels, baseline_preds)
sklearn_cm

array([[386,  28],
       [130, 218]])

In [34]:
# f1-score
from sklearn.metrics import f1_score

sklearn_f1_score = f1_score(val_labels, baseline_preds)
sklearn_f1_score


0.734006734006734

In [35]:
from sklearn.metrics import classification_report
classification_report_dict = classification_report(val_labels, baseline_preds, output_dict=True)
classification_report_dict

{'0': {'precision': 0.748062015503876,
  'recall': 0.9323671497584541,
  'f1-score': 0.8301075268817204,
  'support': 414},
 '1': {'precision': 0.8861788617886179,
  'recall': 0.6264367816091954,
  'f1-score': 0.734006734006734,
  'support': 348},
 'accuracy': 0.7926509186351706,
 'macro avg': {'precision': 0.817120438646247,
  'recall': 0.7794019656838247,
  'f1-score': 0.7820571304442272,
  'support': 762},
 'weighted avg': {'precision': 0.8111390004213173,
  'recall': 0.7926509186351706,
  'f1-score': 0.7862189758049549,
  'support': 762}}

In [36]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
precision_recall = precision_recall_fscore_support(val_labels, baseline_preds)

In [37]:
def calculate_results(y_true, y_pred):
  model_accuracy = accuracy_score(val_labels,baseline_preds)

  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(val_labels, baseline_preds, average='weighted')

  model_results = {
      'accuracy': model_accuracy,
      'precision': model_precision,
      'recall': model_recall,
      'f1-score': model_f1,
  }

  return model_results

baseline_results = calculate_results(val_labels, baseline_preds)
baseline_results


{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1-score': 0.7862189758049549}

### Model 1: Simple dense model

In [38]:
# Build model with the Functional API
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [39]:
# COmpile model
model_1.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [40]:
len(train_sentences.shape),len(train_labels.shape),len(val_sentences.shape),len(val_labels.shape),

(1, 1, 1, 1)

In [41]:
model_1_history = model_1.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
# Check the results
model_1.evaluate(val_sentences, val_labels)




[0.47903451323509216, 0.7847769260406494]

In [43]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape



(762, 1)

In [44]:
model_1_pred_probs[:10]

array([[0.37911686],
       [0.69799954],
       [0.997617  ],
       [0.11728975],
       [0.11430395],
       [0.9401374 ],
       [0.9308535 ],
       [0.9934482 ],
       [0.96822965],
       [0.28441942]], dtype=float32)

In [45]:
val_labels[:10]

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 0])

In [46]:
# Convert modelpredictions probs to label format
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs),axis=1)
model_1_preds.shape

TensorShape([762])

In [47]:
model_1_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [48]:
# Calculate our model_1 results
model_1_results = calculate_results(val_labels, model_1_preds)
model_1_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1-score': 0.7862189758049549}

In [49]:
baseline_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1-score': 0.7862189758049549}

In [50]:
np.array(list(model_1_results.values())) == np.array(list(baseline_results.values()))

array([ True,  True,  True,  True])

## Visualizing learned embeddings

In [51]:
# Get the vocabulary from the text vectorization layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [52]:
# Model 1 summary
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

https://projector.tensorflow.org/

In [54]:
# Get the weight matrix of embedding layer
# yhere are numerical represnetations of each token in our training data

embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights.shape

(10000, 128)

In [55]:
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [56]:
# try:
#   from google.colab import files
#   files.download('vectors.tsv')
#   files.download('metadata.tsv')
# except Exception:
#   pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Model 2: LSTM
LSTM: Long short term memory

In [73]:
# Create an LSTM model
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
x = layers.LSTM(64, return_sequences=True)(x)
print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(32, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model_2 = tf.keras.Model(inputs, outputs, name='model_2_LSTM')



(None, 15, 128)
(None, 15, 64)
(None, 64)


In [74]:
# Get a summary
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_6 (LSTM)               (None, 15, 64)            49408     
                                                                 
 lstm_7 (LSTM)               (None, 64)                33024     
                                                                 
 dense_7 (Dense)             (None, 64)                4160      
                                                      

In [75]:
model_2.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [77]:
model_2_history = model_2.fit(
    train_sentences,
    train_labels,
    epochs=10,
    validation_data=(val_sentences, val_labels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [78]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]



array([[3.0672278e-02],
       [8.2979441e-01],
       [9.9999917e-01],
       [1.1179952e-01],
       [1.8060395e-05],
       [9.9999607e-01],
       [9.9980778e-01],
       [9.9999928e-01],
       [9.9999905e-01],
       [9.9998617e-01]], dtype=float32)

In [79]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [80]:
# Calculate model 2 results
model_2_results = calculate_results(val_labels,model_2_preds )
model_2_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1-score': 0.7862189758049549}

In [81]:
baseline_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1-score': 0.7862189758049549}

### GRU model

In [104]:
# Create an GRU model
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64, return_sequences=True)(x)
x = layers.LSTM(42, return_sequences=True)(x)
x = layers.GRU(64)(x)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model_3 = tf.keras.Model(inputs, outputs, name='model_3_GRU')

In [105]:
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru_9 (GRU)                 (None, 15, 64)            37248     
                                                                 
 lstm_10 (LSTM)              (None, 15, 42)            17976     
                                                                 
 gru_10 (GRU)                (None, 64)                20736     
                                                       

In [95]:
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru_4 (GRU)                 (None, 15, 64)            37248     
                                                                 
 lstm_8 (LSTM)               (None, 15, 42)            17976     
                                                                 
 gru_5 (GRU)                 (None, 64)                20736     
                                                       

In [96]:
model_3.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [97]:
model_3_history = model_3.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [91]:
model_3_preds_probs = model_3.predict(val_sentences)
model_3_preds_probs[:10]



array([[8.3834060e-02],
       [6.8580222e-01],
       [9.9988472e-01],
       [3.9968181e-01],
       [1.2108008e-05],
       [9.9969137e-01],
       [9.9500316e-01],
       [9.9986923e-01],
       [9.9978799e-01],
       [9.5536071e-01]], dtype=float32)

In [92]:
model_3_preds = tf.squeeze(tf.round(model_3_preds_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [93]:
model_3_results = calculate_results(val_labels, model_3_preds)
model_3_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1-score': 0.7862189758049549}

### Model 4: Bidirectional

In [135]:
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(32))(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model_4 = tf.keras.Model(inputs, outputs, name='model_4_bidirectional')

In [136]:
model_4.summary()

Model: "model_4_bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_28 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional_21 (Bidirect  (None, 15, 128)           98816     
 ional)                                                          
                                                                 
 bidirectional_22 (Bidirect  (None, 15, 128)           74496     
 ional)                                                          
                                             

In [139]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=4,

)

In [137]:
model_4.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [140]:
model_4_history = model_4.fit(
    train_sentences,
    train_labels,
    epochs=50,
    validation_data=(val_sentences, val_labels),
    callbacks=[early_stopping_callback]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


In [141]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[1.9924250e-02],
       [6.8032253e-01],
       [9.9990869e-01],
       [1.3796241e-02],
       [5.4374283e-05],
       [9.9988568e-01],
       [9.9876320e-01],
       [9.9992466e-01],
       [9.9992037e-01],
       [9.9971467e-01]], dtype=float32)

In [142]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [143]:
model_4_results = calculate_results(val_labels, model_4_preds)

In [144]:
model_4_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1-score': 0.7862189758049549}

In [145]:
model_3_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1-score': 0.7862189758049549}