<a href="https://colab.research.google.com/github/kozeljko/nlp-models/blob/master/gab_and_reddit_glove300_sigmoid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Init environment

In [5]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  %tensorflow_version 2.x
except Exception:
  pass

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
# How I got Glove6B embeddings
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!ls
#!unzip glove.6B.zip
#!mkdir drive/MyDrive/nlp/models-pre/glove6B
#!mv *d.txt drive/MyDrive/nlp/models-pre/glove6B

In [38]:
!pip install lemmagen3 emoji
!pip install --upgrade keras
!pip install tensorflow-addons

import sys
sys.path.append('drive/MyDrive/nlp/nlp-offensive-language/src')


from preprocessing import preprocess



Requirement already up-to-date: keras in /usr/local/lib/python3.7/dist-packages (2.4.3)
Collecting tensorflow-addons
[?25l  Downloading https://files.pythonhosted.org/packages/74/e3/56d2fe76f0bb7c88ed9b2a6a557e25e83e252aec08f13de34369cd850a0b/tensorflow_addons-0.12.1-cp37-cp37m-manylinux2010_x86_64.whl (703kB)
[K     |████████████████████████████████| 706kB 8.2MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.12.1


In [9]:
DATASETS_DIR = "drive/MyDrive/nlp/nlp-offensive-language/datasets/"

GLOVE_DIR = "drive/MyDrive/nlp/models-pre/glove6B/"

In [81]:
import os, csv
import numpy as np
import pandas as pd
from preprocessing import *

#filename = "english/embeddia/dataset.csv"
filename = "english/gab_and_reddit/dataset.csv"
csv_read = csv.reader(open(os.path.join(DATASETS_DIR, filename), encoding="utf8"), delimiter=",")

texts = []
labels = []
for line in csv_read:
  if line[0] == "id":
    continue

  text = line[1]
  text = preprocess(text, [PP_LOWERCASE, PP_REMOVE_USERNAME_HANDLES, PP_REMOVE_URLS, PP_REMOVE_SPECIAL_CHARACTERS, PP_REMOVE_BASE_PUNCTUATIONS, PP_REMOVE_NUMBERS])
  text = " ".join(text)

  texts.append(text)
  if line[2] == "OFF":
    labels.append([1])
  else:
    labels.append([0])

print("Loaded dataset")
print(str(len(texts)) + " texts")
print("First: " + texts[0])
print(labels[0])

Loaded dataset
56085 texts
First: i joined gab to remind myself how retarded jew haters are you would not be typing on your abacus without them you retard
[1]


In [82]:
import os
import numpy as np
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Prepare tokenizer given loaded texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
seq = tokenizer.texts_to_sequences(texts)

vocab_size = len(tokenizer.word_index)+1

# Load Glove model
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 100
pad_seq = pad_sequences(seq,maxlen=MAX_SEQUENCE_LENGTH)
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# Create embedding matrix

word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Create embedding layer

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Created embedding layer

Found 400000 word vectors.


Train model

In [83]:
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Embedding,Bidirectional
from keras.metrics import Precision, Recall
from tensorflow_addons.metrics import F1Score


train_index = int(len(pad_seq) * 0.8)

train_seq = np.array(pad_seq[:train_index])
train_labels = np.array(labels[:train_index])

test_seq = np.array(pad_seq[train_index:])
test_labels = np.array(labels[train_index:])

model = Sequential()
model.add(embedding_layer)
model.add(Dense(128,activation = 'relu'))
model.add(LSTM(128))
model.add(Dense(64,activation = 'relu'))
model.add(Dense(16,activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(1,activation = 'sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics = [F1Score(num_classes=1, threshold=0.5), 'accuracy'])

model.fit(train_seq, train_labels, epochs=10, validation_split=0.1, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff3b00f2550>

Evaluate model

In [84]:
# Evaluate
hm = model.evaluate(test_seq, test_labels, verbose=0, return_dict=True)
print(hm)
#print('Test accuracy:', hm['accuracy'])

TP=0
TN=0
FP=0
FN=0
total=0

predictions = (model.predict(test_seq) > 0.5).astype("int32")
for i in range(len(test_seq)):
  predicted_class = predictions[i][0]
  actual_class = test_labels[i][0]

  if actual_class == 1:
    if predicted_class == 1:
      TP += 1
    else:
      FN += 1
  else:
    if predicted_class == 0:
      TN += 1
    else:
      FP += 1
  
  total += 1

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)

print(precision)
print(recall)
print(f1)
print(str((TP + TN) / total))

print(TP)
print(FP)
print(FN)
print(TN)


{'loss': 0.4148491322994232, 'f1_score': array([0.7529163], dtype=float32), 'accuracy': 0.8753677606582642}
0.7615302109402932
0.7444949318420133
0.7529162248144221
0.875367745386467
2130
667
731
7689


3
