<a href="https://colab.research.google.com/github/kozeljko/nlp-models/blob/master/gab_and_reddit_fasttext_pretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Init environment

In [18]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  %tensorflow_version 2.x
except Exception:
  pass

In [25]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [10]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# How I got Glove6B embeddings
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!ls
#!unzip glove.6B.zip
#!mkdir drive/MyDrive/nlp/models-pre/glove6B
#!mv *d.txt drive/MyDrive/nlp/models-pre/glove6B

Install FastText

In [None]:
!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip
!cd fastText-0.9.2 && make && pip install .

In [5]:
!pip install lemmagen3 emoji
!pip install --upgrade keras
!pip install tensorflow-addons

import sys
#sys.path.insert(1, 'drive/MyDrive/nlp/nlp-offensive-language/src')
print(sys.path)
from preprocessing import *



Requirement already up-to-date: keras in /usr/local/lib/python3.7/dist-packages (2.4.3)
['', 'drive/MyDrive/nlp/nlp-offensive-language/src', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython']


In [79]:
DATASETS_DIR = "drive/MyDrive/nlp/nlp-offensive-language/datasets/"

GLOVE_DIR = "drive/MyDrive/nlp/models-pre/glove6B/"
FASTTEXT_DIR = "drive/MyDrive/nlp/models-pre/fasttext/"

In [73]:
import os, csv
import numpy as np
import pandas as pd
from preprocessing import *

#filename = "english/embeddia/dataset.csv"
filename = "english/gab_and_reddit/dataset.csv"
csv_read = csv.reader(open(os.path.join(DATASETS_DIR, filename), encoding="utf8"), delimiter=",")

content = []
for line in csv_read:
  if line[0] == "id":
    continue

  text = line[1]
  text = preprocess(text, [PP_LOWERCASE, PP_REMOVE_USERNAME_HANDLES, PP_REMOVE_URLS, PP_REMOVE_SPECIAL_CHARACTERS, PP_REMOVE_NUMBERS])
  text = " ".join(text)

  label = ""
  if line[2] == "OFF":
    label = "__label__OFF"
  else:
    label = "__label__NOT"

  content.append(label + " " + text + "\n")

print("Loaded dataset")
print(str(len(content)) + " texts")
print("First: " + content[0])

Loaded dataset
56085 texts
First: __label__OFF i joined gab to remind myself how retarded jew haters are you would not be typing on your abacus without them you retard



Save content to files

In [74]:
filename_train = "content.train"
filename_test = "content.test"

total_count = len(content)
train_count = int(0.8 * total_count)

train_content = content[0:train_count]
test_content = content[train_count:]

f = open(filename_train, "w")
f.writelines(train_content)
f.close()

f = open(filename_test, "w")
f.writelines(test_content)
f.close()


Build Fasttext model

In [83]:
import fasttext

pretrained_vectors = FASTTEXT_DIR + "wiki-news-300d-1M.vec"

model = fasttext.train_supervised(input="content.train", dim=300, epoch=15, lr=0.9, wordNgrams=2, pretrainedVectors = pretrained_vectors)

Evaluate fasttext


In [84]:
predicts = []
for i in test_content:
  x = i[:len(i)-2] # remove \n
  predicts.append(model.predict(x))

total = len(test_content)
TP = 0
TN = 0
FP = 0
FN = 0

count = 0
for i in range(total):
  test_sample = test_content[i]
  prediction = predicts[i]
  
  if test_sample.startswith("__label__OFF"):
    if prediction[0][0] == "__label__OFF":
      TP += 1
    else:
      FN += 1
  else:
    if prediction[0][0] == "__label__OFF":
      FP += 1
    else:
      TN += 1
    

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1 Score: " + str(f1))
print("Accuracy: " + str(str((TP + TN) / total)))

Precision: 0.6494057724957555
Recall: 0.5347780496329955
F1 Score: 0.58654399079931
Accuracy: 0.8077025942765446


In [None]:
import os
import numpy as np
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Prepare tokenizer given loaded texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
seq = tokenizer.texts_to_sequences(texts)

vocab_size = len(tokenizer.word_index)+1

# Load Glove model
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 100
pad_seq = pad_sequences(seq,maxlen=MAX_SEQUENCE_LENGTH)
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# Create embedding matrix

word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Create embedding layer

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Created embedding layer

Found 400000 word vectors.
