<a href="https://colab.research.google.com/github/kozeljko/nlp-models/blob/master/create_fasttext_random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Train a fastText classifier with randomly initialised word embeddings (no pretrained wiki-news vectors are loaded)


Init environment

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
# Mount Google Drive under /content/drive so the dataset and model folders
# referenced later in the notebook are reachable. force_remount=True requests
# a fresh mount even if Drive is already mounted in this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Install FastText

In [None]:
# Build and install fastText from source, pinned to release v0.9.2:
#   1. download the release archive from GitHub,
#   2. unpack it into ./fastText-0.9.2,
#   3. compile with make, then pip-install the package from that source tree.
!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip
!cd fastText-0.9.2 && make && pip install .

Preprocessing init

In [2]:
# Install extra packages. None of them is imported directly in the visible
# cells; presumably they are required by the project's `preprocessing`
# module -- TODO confirm (keras / tensorflow-addons in particular).
!pip install lemmagen3 emoji
!pip install --upgrade keras
!pip install tensorflow-addons

import sys
# Put the project's `src` folder (on the mounted Drive) on the import path so
# that `preprocessing` can be imported below.
sys.path.insert(1, 'drive/MyDrive/nlp/nlp-offensive-language/src')

# NOTE(review): wildcard import. The later cells rely on it for `preprocess`
# and the PP_* option constants.
from preprocessing import *

Requirement already up-to-date: keras in /usr/local/lib/python3.7/dist-packages (2.4.3)


In [10]:
# Relative paths into the mounted Google Drive.
# DATASETS_DIR is the root that DATASET (below) is joined onto.
DATASETS_DIR = "drive/MyDrive/nlp/nlp-offensive-language/datasets/"
# NOTE(review): FASTTEXT_DIR is not referenced by any of the visible cells.
FASTTEXT_DIR = "drive/MyDrive/nlp/embeddings/fasttext/"

# The trained model is saved as MODELS_DIR + MODEL_NAME.
# The current MODEL_NAME matches the active DATASET (frenk_migrants); keep the
# two in sync when switching datasets.
MODELS_DIR = "drive/MyDrive/nlp/models/"
MODEL_NAME = "frenk_migrants_fasttext_random_en"

# Dataset CSV to use, relative to DATASETS_DIR. Exactly one DATASET line is
# active; the commented-out lines are the alternative datasets.
#DATASET = "english/fox_news/dataset.csv"
#DATASET = "english/gab_and_reddit/dataset.csv"
#DATASET = "english/deep_offense/dataset.csv"
#DATASET = "english/trac_2/dataset.csv"
#DATASET = "english/wiki_detox/dataset_aggression.csv"
#DATASET = "english/wiki_detox/dataset_attack.csv"
#DATASET = "english/wiki_detox/dataset_toxicity.csv"
#DATASET = "english/frenk_lgbt/dataset.csv"
DATASET = "english/frenk_migrants/dataset.csv"
#DATASET = "english/combined_preprocessed/combined_dataset_train.csv"

Preprocess texts

In [None]:
import os
import csv
import numpy as np
import pandas as pd
from preprocessing import *

# Build one fastText-formatted line per dataset row:
#   "__label__OFF <preprocessed text>\n"  or  "__label__NOT <preprocessed text>\n"
# Column 1 of the CSV is used as the text and column 2 as the label; a row is
# labelled OFF when its label column contains the substring "OFF".
content = []

# `with` guarantees the file handle is closed; newline="" is what the csv
# module expects so that newlines inside quoted fields are parsed correctly.
with open(os.path.join(DATASETS_DIR, DATASET), encoding="utf8", newline="") as dataset_file:
  csv_read = csv.reader(dataset_file, delimiter=",")

  for line in csv_read:
    # Skip blank rows (csv yields [] for them) and the header row, which is
    # recognised by its first column being literally "id".
    if not line or line[0] == "id":
      continue

    # `preprocess` returns a sequence of tokens, joined back with single spaces.
    text = preprocess(line[1], [PP_LOWERCASE, PP_REMOVE_USERNAME_HANDLES, PP_REMOVE_URLS, PP_REMOVE_SPECIAL_CHARACTERS, PP_REMOVE_NUMBERS, PP_REMOVE_EMOJI])
    text = " ".join(text)

    label = "__label__OFF" if "OFF" in line[2] else "__label__NOT"

    content.append(label + " " + text + "\n")

print("Loaded dataset")
print(str(len(content)) + " texts")
# Guard against an empty dataset instead of failing with an IndexError.
if content:
  print("First: " + content[0])

Save content to files

In [12]:
filename_train = "content.train"
filename_test = "content.test"

# Split in dataset order (no shuffling): the first 80% of the lines become the
# training set, the remainder the test set.
total_count = len(content)
train_count = int(0.8 * total_count)

train_content = content[:train_count]
test_content = content[train_count:]

# Every entry already ends with "\n", so writelines produces one example per
# line. The files are written as UTF-8 explicitly (the dataset was read as
# UTF-8) and `with` closes each handle even if a write fails.
with open(filename_train, "w", encoding="utf8") as f:
  f.writelines(train_content)

with open(filename_test, "w", encoding="utf8") as f:
  f.writelines(test_content)


Build Fasttext model

In [13]:
import fasttext

# Train a supervised fastText classifier on the training file written by the
# previous cell (reusing `filename_train` rather than repeating the literal).
# No pretrained vectors are passed, so the 300-dimensional embeddings are
# learned from this data only; word bigrams are used as additional features.
model = fasttext.train_supervised(input=filename_train, dim=300, epoch=15, lr=0.9, wordNgrams=2)

# Make sure the target folder exists before saving; save_model cannot create it.
os.makedirs(MODELS_DIR, exist_ok=True)
model.save_model(os.path.join(MODELS_DIR, MODEL_NAME))

Evaluate fasttext


In [None]:
# Predict a label for every held-out example.
# Each stored line has the form "<gold label> <text>\n". Only the trailing
# newline is stripped (the previous slice dropped two characters and so cut
# off the last character of every text), and only the text part, not the
# gold label, is handed to the model.
predicts = []
for test_sample in test_content:
  _, _, sample_text = test_sample.rstrip("\n").partition(" ")
  predicts.append(model.predict(sample_text))

# Confusion-matrix counts, with "__label__OFF" as the positive class.
total = len(test_content)
TP = 0
TN = 0
FP = 0
FN = 0

for test_sample, prediction in zip(test_content, predicts):
  # prediction[0] holds the predicted labels; treat "no label returned"
  # as a non-OFF prediction instead of failing on an index error.
  predicted_labels = prediction[0]
  predicted_off = len(predicted_labels) > 0 and predicted_labels[0] == "__label__OFF"
  actual_off = test_sample.startswith("__label__OFF")

  if actual_off and predicted_off:
    TP += 1
  elif actual_off:
    FN += 1
  elif predicted_off:
    FP += 1
  else:
    TN += 1

# Each metric falls back to 0.0 when its denominator is zero (e.g. the model
# never predicts OFF, or the test set is empty) instead of raising
# ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
accuracy = (TP + TN) / total if total else 0.0

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1 Score: " + str(f1))
print("Accuracy: " + str(accuracy))