<a href="https://colab.research.google.com/github/kozeljko/nlp-models/blob/master/lstm_random_sigmoid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Init environment

In [None]:
# Python 2/3 compatibility imports; a __future__ import has to be the first
# statement of the cell, so only comments may precede it.
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # IPython line magic requesting TensorFlow 2.x (presumably only available
  # on Colab -- confirm when running elsewhere).
  %tensorflow_version 2.x
except Exception:
  # Deliberate best-effort: any exception raised by the magic is ignored.
  pass

In [None]:
# Mount Google Drive at /content/drive. Later cells read the preprocessing
# sources and the datasets through the relative path 'drive/MyDrive/...'.
# NOTE(review): those relative paths presumably rely on the working directory
# being /content -- confirm before running outside the default Colab setup.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Preprocessing init

In [None]:
!pip install lemmagen3 emoji
!pip install --upgrade keras
!pip install tensorflow-addons

import sys
sys.path.append('drive/MyDrive/nlp/nlp-offensive-language/src')

from preprocessing import preprocess

In [None]:
# Folder holding the dataset CSV files; the loading cell joins it with the
# dataset-specific relative `filename`.
DATASETS_DIR = "drive/MyDrive/nlp/nlp-offensive-language/datasets/"

# Single length limit shared by the notebook: texts whose preprocessed length
# exceeds it are dropped, sequences are padded to it, and the embedding layer
# is declared with it as input length.
MAX_SEQUENCE_LENGTH = 100

Preprocess texts

In [None]:
import os, csv
import numpy as np
import pandas as pd
from preprocessing import *

# Dataset selector: leave exactly one `filename` line uncommented.
#filename = "english/fox_news/dataset.csv"
#filename = "english/gab_and_reddit/dataset.csv"
#filename = "english/deep_offense/dataset.csv"
filename = "english/embeddia/dataset.csv"
#filename = "english/trac_2/dataset.csv"
#filename = "english/wiki_detox/dataset_aggression.csv"
#filename = "english/wiki_detox/dataset_attack.csv"
#filename = "english/wiki_detox/dataset_toxicity.csv"

texts = []
labels = []

# The `with` block closes the file even when a row fails to process.
# newline="" is what the csv module documents for files handed to csv.reader,
# so that newlines embedded in quoted fields are interpreted correctly.
with open(os.path.join(DATASETS_DIR, filename), encoding="utf8", newline="") as dataset_file:
  csv_read = csv.reader(dataset_file, delimiter=",")
  for line in csv_read:
    # csv.reader yields an empty list for a blank line; skip those as well as
    # the header row, which is recognised by "id" in the first column.
    if not line or line[0] == "id":
      continue

    # Column 1 holds the raw text; drop the literal NEWLINE_TOKEN marker
    # before running the cleaning steps.
    text = line[1]
    text = text.replace("NEWLINE_TOKEN", "")
    text = preprocess(text, [
      PP_LOWERCASE,
      PP_REMOVE_USERNAME_HANDLES,
      PP_REMOVE_URLS,
      PP_REMOVE_SPECIAL_CHARACTERS,
      PP_REMOVE_BASE_PUNCTUATIONS,
      PP_REMOVE_NUMBERS,
    ])
    # Discard texts longer than the shared limit. The length is measured on
    # whatever `preprocess` returns -- presumably a list of tokens, since it
    # is joined with spaces right below (TODO confirm).
    if len(text) > MAX_SEQUENCE_LENGTH:
      continue

    text = " ".join(text)

    texts.append(text)
    # Binary target: 1 when column 2 contains "OFF", 0 otherwise.
    labels.append([1] if "OFF" in line[2] else [0])

print("Loaded dataset")
print(str(len(texts)) + " texts")
# Guard the preview: every row may have been filtered out.
if texts:
  print("First: " + texts[0])

Create embedding layer

In [None]:
import os
import numpy as np
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Size of each word vector learned by the embedding layer.
EMBEDDING_DIM = 100

# Build the vocabulary from the loaded texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# One extra slot on top of the vocabulary (presumably for index 0, which the
# Keras Tokenizer does not assign to any word -- see the Tokenizer docs).
vocab_size = len(word_index) + 1

# Encode every text as a list of word indices, then bring all of them to the
# common length MAX_SEQUENCE_LENGTH.
seq = tokenizer.texts_to_sequences(texts)
pad_seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)

# The embedding layer is created without any pretrained weights; its vectors
# are learned together with the rest of the model.
embedding_layer = Embedding(
  vocab_size,
  EMBEDDING_DIM,
  input_length=MAX_SEQUENCE_LENGTH,
)

Train model

In [None]:
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Embedding,Bidirectional,Flatten
from keras.metrics import Precision, Recall
from keras.callbacks import EarlyStopping
from tensorflow_addons.metrics import F1Score


# Order-preserving 90/10 split (no shuffling): the first 90% of the samples
# are used for training, the remainder is kept for the evaluation cell.
train_index = int(len(pad_seq) * 0.9)

train_seq, test_seq = np.array(pad_seq[:train_index]), np.array(pad_seq[train_index:])
train_labels, test_labels = np.array(labels[:train_index]), np.array(labels[train_index:])

# Stop training once the validation loss has not improved for 2 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

# Embedding -> Dense -> bidirectional LSTM -> Dense -> Dropout -> one sigmoid
# unit producing the score for the binary label.
model = Sequential([
  embedding_layer,
  Dense(128, activation='relu'),
  Bidirectional(LSTM(128)),
  Dense(32, activation='relu'),
  Dropout(0.5),
  Dense(1, activation='sigmoid'),
])
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=[F1Score(num_classes=1, threshold=0.5), 'accuracy'],
)

# A sixth of the training data is set aside for validation, which is what the
# early-stopping callback monitors.
model.fit(
  train_seq,
  train_labels,
  epochs=10,
  validation_split=(1/6),
  batch_size=128,
  callbacks=[es],
)

Evaluate model

In [None]:
# Evaluate
# Metrics as computed by Keras on the held-out split (loss plus the metrics
# the model was compiled with), returned as a dict.
hm = model.evaluate(test_seq, test_labels, verbose=0, return_dict=True)
print(hm)


def _safe_ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero.

  Keeps the metric computation below from raising ZeroDivisionError when the
  model predicts no positives, the test split holds no positives, or the
  test split is empty.
  """
  return numerator / denominator if denominator else 0.0


# Confusion-matrix counters for the positive class (label 1).
TP=0
TN=0
FP=0
FN=0
total=0

# Turn the sigmoid scores into hard 0/1 predictions with a 0.5 threshold.
predictions = (model.predict(test_seq) > 0.5).astype("int32")
for predicted_row, actual_row in zip(predictions, test_labels):
  # Both arrays hold one single-element row per sample.
  predicted_class = predicted_row[0]
  actual_class = actual_row[0]

  if actual_class == 1:
    if predicted_class == 1:
      TP += 1
    else:
      FN += 1
  else:
    if predicted_class == 0:
      TN += 1
    else:
      FP += 1

  total += 1

precision = _safe_ratio(TP, TP + FP)
recall = _safe_ratio(TP, TP + FN)
f1 = _safe_ratio(2 * precision * recall, precision + recall)

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1 Score: " + str(f1))
print("Accuracy: " + str(_safe_ratio(TP + TN, total)))
