<a href="https://colab.research.google.com/github/kozeljko/nlp-models/blob/master/create_multi_lstm_random_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Notebook for building LSTM model using Glove word embeddings.

Init environment

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

%tensorflow_version 2.x
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
DATASETS_DIR = "drive/MyDrive/nlp/nlp-offensive-language/datasets/"
GLOVE_DIR = "drive/MyDrive/nlp/embeddings/glove6B/"
MODELS_DIR = "drive/MyDrive/nlp/models/"
MODEL_NAME = "multi_lstm_glove_en"

#LANGUAGE = "english"
#DATASET = "english/fox_news/dataset.csv"
#DATASET = "english/gab_and_reddit/dataset.csv"
#DATASET = "english/deep_offense/dataset.csv"
#DATASET = "english/trac_2/dataset.csv"
#DATASET = "english/wiki_detox/dataset_aggression.csv"
#DATASET = "english/wiki_detox/dataset_attack.csv"
#DATASET = "english/wiki_detox/dataset_toxicity.csv"
#DATASET = "english/frenk_lgbt/dataset.csv"
#DATASET = "english/frenk_migrants/dataset.csv"
#DATASET = "english/combined_preprocessed/combined_dataset_train.csv"

LANGUAGE = "slovene"
#DATASET = "slovenian/frenk_migrants/dataset.csv"
DATASET = "slovenian/frenk_lgbt/dataset.csv"

# Allow sequences max 100 long
MAX_SEQUENCE_LENGTH = 100

Initialize preprocessing


In [11]:
!pip install lemmagen3 emoji
!pip install --upgrade keras
!pip install tensorflow-addons
!pip install scikit-learn

import sys
sys.path.append('/content/drive/MyDrive/nlp/nlp-offensive-language/src')

from preprocessing import preprocess

Requirement already up-to-date: keras in /usr/local/lib/python3.7/dist-packages (2.4.3)


Load and preprocess training dataset

In [12]:
import os, csv
import numpy as np
import pandas as pd
from preprocessing import *

csv_read = csv.reader(open(os.path.join(DATASETS_DIR, DATASET), encoding="utf8"), delimiter=",")

texts = []
labels = []
for line in csv_read:
  if line[0] == "id":
    continue

  text = line[1]
  text = text.replace("NEWLINE_TOKEN", "")
  text = preprocess(text, [PP_LOWERCASE, PP_REMOVE_USERNAME_HANDLES, PP_REMOVE_URLS, PP_REMOVE_SPECIAL_CHARACTERS, PP_REMOVE_BASE_PUNCTUATIONS], language=LANGUAGE)
  if (len(text) > MAX_SEQUENCE_LENGTH):
    continue

  text = " ".join(text)

  texts.append(text)
  labels.append(line[2])

unique_labels = set(labels)
softmax_size = len(unique_labels)
labels_map = {}
indexed_map = {}
for id, label in enumerate(unique_labels):
  # Create array represenatation for each label
  softmax_array = [0 for i in range(softmax_size)]
  softmax_array[id] = 1

  labels_map[label] = softmax_array
  indexed_map[id] = softmax_array

# Transform labels into arrays
labels = [labels_map[label] for label in labels]

print("Loaded dataset")
print(str(len(texts)) + " texts")
print("First: " + texts[0])

Loaded dataset
3501 texts
First: kako omogoča saj pa oni že lahko sklenejo zvezo


Create embedding layer

In [13]:
import os
import numpy as np
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Prepare tokenizer given loaded texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
seq = tokenizer.texts_to_sequences(texts)

vocab_size = len(tokenizer.word_index)+1

EMBEDDING_DIM = 100
pad_seq = pad_sequences(seq,maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index

# Create embedding layer
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)

Train model

In [14]:
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Embedding,Bidirectional
from keras.metrics import Precision, Recall
from keras.callbacks import EarlyStopping

train_index = int(len(pad_seq) * 0.9)

train_seq = np.array(pad_seq[:train_index])
train_labels = np.array(labels[:train_index])

test_seq = np.array(pad_seq[train_index:])
test_labels = np.array(labels[train_index:])

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

model = Sequential()
model.add(embedding_layer)
model.add(Dense(128,activation = 'relu'))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(32,activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(softmax_size ,activation = 'softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics = ['accuracy'])

model.fit(train_seq, train_labels, epochs=10, validation_split=(1/6), batch_size=64, callbacks=[es])

# Save model
#model.save(os.path.join(MODELS_DIR, MODEL_NAME))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 00006: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f7de5d955d0>

Evaluate model

In [15]:
from sklearn.metrics import classification_report

# Predict and transform predictions into binary arrays via argmax. 
predictions = model.predict(test_seq)
prediction_indexes = [indexed_map[i] for i in np.argmax(predictions, axis=-1)]

print(f"Classification report: \n{classification_report(test_labels, prediction_indexes, digits=3)}")

Classification report: 
              precision    recall  f1-score   support

           0      0.476     0.360     0.410       136
           1      0.547     0.571     0.559       133
           2      0.000     0.000     0.000         3
           3      0.165     0.265     0.203        68
           4      0.000     0.000     0.000         6
           5      0.000     0.000     0.000         5

   micro avg      0.407     0.407     0.407       351
   macro avg      0.198     0.199     0.195       351
weighted avg      0.423     0.407     0.410       351
 samples avg      0.407     0.407     0.407       351



  _warn_prf(average, modifier, msg_start, len(result))
