<a href="https://colab.research.google.com/github/kozeljko/nlp-models/blob/master/lstm_glove300_sigmoid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Init environment

In [2]:
# Opt in to Python 3 style imports, division, print() and unicode string literals.
from __future__ import absolute_import, division, print_function, unicode_literals

# Ask for TensorFlow 2.x through the %tensorflow_version line magic. Any
# Exception raised by the magic is deliberately swallowed.
# NOTE(review): presumably so the cell still runs where the magic is unavailable — confirm.
try:
  %tensorflow_version 2.x
except Exception:
  pass

In [1]:
# Mount Google Drive at /content/drive. Later cells read the datasets, the GloVe
# vectors and the project's helper sources through relative drive/MyDrive/... paths.
# NOTE(review): those relative paths assume the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# One-time setup, kept commented out: how the GloVe 6B embeddings were obtained.
# The commands below download and unzip glove.6B.zip, then move the extracted
# *d.txt vector files into the Drive folder that GLOVE_DIR points to.
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!ls
#!unzip glove.6B.zip
#!mkdir drive/MyDrive/nlp/models-pre/glove6B
#!mv *d.txt drive/MyDrive/nlp/models-pre/glove6B

In [4]:
# Install the third-party packages needed on a fresh runtime.
# NOTE(review): no versions are pinned, so a later run may resolve different releases.
!pip install lemmagen3 emoji
!pip install --upgrade keras
!pip install tensorflow-addons

# Make the project's helper modules (stored on the mounted Drive) importable.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content — confirm.
import sys
sys.path.append('drive/MyDrive/nlp/nlp-offensive-language/src')


# Project-local text preprocessing entry point, used when the dataset is loaded.
from preprocessing import preprocess



Collecting lemmagen3
[?25l  Downloading https://files.pythonhosted.org/packages/4d/80/b0d1f328a512fb54aa120f491f14ebba18add825908b56c3c7da7a1fe542/lemmagen3-3.3.1-cp37-cp37m-manylinux2010_x86_64.whl (12.4MB)
[K     |████████████████████████████████| 12.4MB 243kB/s 
[?25hCollecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 39.6MB/s 
[?25hCollecting pybind11>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/8d/43/7339dbabbc2793718d59703aace4166f53c29ee1c202f6ff5bf8a26c4d91/pybind11-2.6.2-py2.py3-none-any.whl (191kB)
[K     |████████████████████████████████| 194kB 41.3MB/s 
[?25hInstalling collected packages: pybind11, lemmagen3, emoji
Successfully installed emoji-1.2.0 lemmagen3-3.3.1 pybind11-2.6.2
Requirement already up-to-date: keras in /usr/local/lib/python3.7/dist-packages (2.4.3)
Collect

In [5]:
# Root folder of the datasets on the mounted Drive; dataset file names are joined onto it.
DATASETS_DIR = "drive/MyDrive/nlp/nlp-offensive-language/datasets/"

# Folder holding the GloVe 6B vector files (glove.6B.300d.txt is read from here).
GLOVE_DIR = "drive/MyDrive/nlp/models-pre/glove6B/"

In [48]:
import os, csv
import numpy as np
import pandas as pd
from preprocessing import *

# Dataset to load, relative to DATASETS_DIR. Swap the commented line to use the other corpus.
filename = "english/embeddia/dataset.csv"
#filename = "english/gab_and_reddit/dataset.csv"

# texts[i] is the preprocessed text of row i; labels[i] is its one-element binary target.
texts = []
labels = []

# Read the CSV inside a with-block so the file handle is always closed, even if
# parsing or preprocessing raises part-way through.
with open(os.path.join(DATASETS_DIR, filename), encoding="utf8") as dataset_file:
  csv_read = csv.reader(dataset_file, delimiter=",")
  for line in csv_read:
    # Skip blank rows (csv.reader yields [] for them) and the header row.
    if not line or line[0] == "id":
      continue

    # Column 1 holds the raw text. preprocess() returns a sequence of string
    # pieces, which are joined back into a single space-separated string.
    text = line[1]
    text = preprocess(text, [PP_LOWERCASE, PP_REMOVE_USERNAME_HANDLES, PP_REMOVE_URLS, PP_REMOVE_SPECIAL_CHARACTERS, PP_REMOVE_BASE_PUNCTUATIONS, PP_REMOVE_NUMBERS])
    text = " ".join(text)

    texts.append(text)
    # Column 2 holds the label: "OFF" maps to 1, any other value maps to 0.
    labels.append([1] if line[2] == "OFF" else [0])

# Quick sanity check of what was loaded.
print("Loaded dataset")
print(str(len(texts)) + " texts")
print("First: " + texts[0])
print(labels[0])

Loaded dataset
14100 texts
First: whoisq wherestheserver dumpnike declasfisa democrats support antifa muslim brotherhood ms isis pedophilia child trafficking taxpayer funded abortion ’ s election fraud sedition and treason lockthemallup wwgwga qanon ⁦ url
[1]


In [49]:
import os
import numpy as np
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Prepare tokenizer given loaded texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
seq = tokenizer.texts_to_sequences(texts)

# One extra row/slot beyond the number of known words: the matrix below is
# indexed directly by the tokenizer's word indices, and the Keras Tokenizer
# never assigns index 0 to a word.
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 100

# Bring every token-id sequence to exactly MAX_SEQUENCE_LENGTH entries.
pad_seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)

# Load Glove model: each line is a word followed by its vector components.
# The file is opened with an explicit encoding (matching how the dataset is
# read) and inside a with-block so it is closed even if a line fails to parse.
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Create embedding matrix: row i holds the GloVe vector of the word with index i.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Create embedding layer initialised with the GloVe matrix and frozen
# (trainable=False), so the pre-trained vectors are not updated during training.
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Created embedding layer

Found 400000 word vectors.


Train model

In [52]:
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Embedding,Bidirectional
from keras.metrics import Precision, Recall
from keras.callbacks import EarlyStopping
from tensorflow_addons.metrics import F1Score


# Positional split without shuffling: the first 90% of the padded sequences are
# used for training and the remaining 10% are held out as the test set.
train_index = int(len(pad_seq) * 0.9)

train_seq = np.array(pad_seq[:train_index])
train_labels = np.array(labels[:train_index])

test_seq = np.array(pad_seq[train_index:])
test_labels = np.array(labels[train_index:])

# Stop training once the validation loss has failed to improve for 2 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

# Frozen GloVe embeddings -> dense projection -> two stacked bidirectional LSTMs
# -> small dense head with dropout -> single sigmoid unit for the binary label.
model = Sequential([
    embedding_layer,
    Dense(128, activation='relu'),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[F1Score(num_classes=1, threshold=0.5), 'accuracy'],
)

# One sixth of the training portion is set aside for validation.
model.fit(
    train_seq,
    train_labels,
    epochs=10,
    validation_split=(1/6),
    batch_size=128,
    callbacks=[es],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 00004: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f39aa2b7390>

Evaluate model

In [53]:
# Evaluate: first the metrics Keras computes itself (loss plus compiled metrics).
hm = model.evaluate(test_seq, test_labels, verbose=0, return_dict=True)
print(hm)
#print('Test accuracy:', hm['accuracy'])

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
predictions = (model.predict(test_seq) > 0.5).astype("int32")

# Column 0 of both arrays holds the single class value per sample.
predicted_classes = predictions[:, 0]
actual_classes = test_labels[:, 0]

# Confusion-matrix counts, using the same rules as a per-sample comparison:
# a sample is "positive" only when its label equals 1.
is_positive = actual_classes == 1
TP = int(np.count_nonzero(is_positive & (predicted_classes == 1)))
FN = int(np.count_nonzero(is_positive & (predicted_classes != 1)))
TN = int(np.count_nonzero(~is_positive & (predicted_classes == 0)))
FP = int(np.count_nonzero(~is_positive & (predicted_classes != 0)))
total = TP + TN + FP + FN

# Guard every ratio against an empty denominator (e.g. the model predicts no
# positives at all); such a metric is reported as 0.0 instead of raising
# ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
accuracy = (TP + TN) / total if total else 0.0

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1 Score: " + str(f1))
print("Accuracy: " + str(accuracy))


{'loss': 0.48750853538513184, 'f1_score': array([0.56992084], dtype=float32), 'accuracy': 0.7687942981719971}
Precision: 0.7659574468085106
Recall: 0.453781512605042
F1 Score: 0.5699208443271767
Accuracy: 0.7687943262411348


3
