<a href="https://colab.research.google.com/github/kozeljko/nlp-models/blob/master/fasttext_random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Init environment

In [None]:
# Python 2/3 compatibility imports. A __future__ import must stay the first statement of the cell.
from __future__ import absolute_import, division, print_function, unicode_literals

# Select TensorFlow 2.x through the notebook magic. Any exception raised while running the magic
# is deliberately ignored so the cell still completes
# (presumably for runtimes where the magic is unavailable - TODO confirm).
try:
  %tensorflow_version 2.x
except Exception:
  pass

In [None]:
# Mount Google Drive at /content/drive. Later cells load the project sources and the datasets
# from paths below drive/MyDrive, so this cell has to run before them.
# NOTE(review): force_remount=True presumably re-mounts even when Drive is already mounted - confirm.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Install FastText

In [None]:
!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip
!cd fastText-0.9.2 && make && pip install .

Preprocessing init

In [None]:
# Install the extra packages into the runtime.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
!pip install lemmagen3 emoji
!pip install --upgrade keras
!pip install tensorflow-addons

# Make the project's own source folder on the mounted Drive importable.
# NOTE(review): the path is relative, so it only resolves while the working directory is the
# parent of the drive/ mount folder (Drive is mounted at /content/drive) - confirm.
import sys
sys.path.insert(1, 'drive/MyDrive/nlp/nlp-offensive-language/src')

# Star import: brings preprocess() and the PP_* option constants used by the loading cell into
# the notebook namespace, together with every other public name of the module.
from preprocessing import *

In [None]:
# Root folder of the datasets on the mounted Drive; the dataset file name chosen in the loading
# cell is joined onto it.
# NOTE(review): the path is relative while Drive is mounted at /content/drive, so it resolves
# only when the working directory is /content - confirm before changing directories.
DATASETS_DIR = "drive/MyDrive/nlp/nlp-offensive-language/datasets/"

Preprocess texts

In [None]:
import os, csv
import numpy as np
import pandas as pd
from preprocessing import *

# Choose the dataset to load by leaving exactly one `filename` assignment uncommented.
#filename = "english/fox_news/dataset.csv"
#filename = "english/gab_and_reddit/dataset.csv"
#filename = "english/deep_offense/dataset.csv"
#filename = "english/trac_2/dataset.csv"
filename = "english/wiki_detox/dataset_aggression.csv"
#filename = "english/wiki_detox/dataset_attack.csv"
#filename = "english/wiki_detox/dataset_toxicity.csv"

# Every CSV row becomes one fastText line of the form "__label__<CLASS> <text>\n".
content = []

# The context manager closes the dataset file as soon as all rows have been read.
with open(os.path.join(DATASETS_DIR, filename), encoding="utf8") as csv_file:
  csv_read = csv.reader(csv_file, delimiter=",")
  for line in csv_read:
    # Skip blank rows (the reader yields an empty list for them) and the header row,
    # which is recognised by "id" in its first column.
    if not line or line[0] == "id":
      continue

    text = line[1]
    text = preprocess(text, [PP_LOWERCASE, PP_REMOVE_USERNAME_HANDLES, PP_REMOVE_URLS, PP_REMOVE_SPECIAL_CHARACTERS, PP_REMOVE_NUMBERS])
    # Join the tokens and collapse any whitespace run (including stray newlines) into a single
    # space, so each example is guaranteed to occupy exactly one line of the output file.
    text = " ".join(" ".join(text).split())

    # The third column holds the class; every value other than "OFF" is mapped to NOT.
    label = "__label__OFF" if line[2] == "OFF" else "__label__NOT"

    content.append(label + " " + text + "\n")

print("Loaded dataset")
print(str(len(content)) + " texts")
# An empty dataset has no first example to show.
if content:
  print("First: " + content[0])

Save content to files

In [None]:
# Output files, written to the current working directory.
filename_train = "content.train"
filename_test = "content.test"

# 80/20 split that keeps the original row order: the first 80% of the examples are used for
# training and the remaining 20% for evaluation (no shuffling is applied).
total_count = len(content)
train_count = int(0.8 * total_count)

train_content = content[:train_count]
test_content = content[train_count:]

# Each entry of `content` already ends with "\n", so writelines() emits one example per line.
# The files are written as UTF-8 explicitly (the dataset was read as utf8), and the context
# managers close and flush them even if a write fails.
with open(filename_train, "w", encoding="utf8") as f:
  f.writelines(train_content)

with open(filename_test, "w", encoding="utf8") as f:
  f.writelines(test_content)


Build FastText model

In [None]:
import fasttext

# Hyperparameters of the supervised classifier, gathered in one place so they are easy to tune.
train_params = {"epoch": 15, "lr": 0.9, "wordNgrams": 2}

# Train on the "content.train" file produced by the cell that saves the split.
model = fasttext.train_supervised(input="content.train", **train_params)

Evaluate FastText model


In [None]:
# Evaluate the trained model on the held-out lines, treating "__label__OFF" as the positive class.

def _ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero."""
  return numerator / denominator if denominator else 0.0


# Predict line by line. Only the trailing line terminator is stripped: every line ends with a
# single "\n", so cutting a fixed two characters would also remove the last character of the text.
predicts = [model.predict(sample.rstrip("\n")) for sample in test_content]

total = len(test_content)
TP = 0
TN = 0
FP = 0
FN = 0

for test_sample, prediction in zip(test_content, predicts):
  # The gold label is the prefix of the stored line; prediction[0][0] is the first label
  # returned by model.predict() for that line.
  is_offensive = test_sample.startswith("__label__OFF")
  predicted_offensive = prediction[0][0] == "__label__OFF"

  if is_offensive and predicted_offensive:
    TP += 1
  elif is_offensive:
    FN += 1
  elif predicted_offensive:
    FP += 1
  else:
    TN += 1

# Degenerate cases (no positive predictions, no positive samples, empty test set) are reported
# as 0.0 instead of raising ZeroDivisionError.
precision = _ratio(TP, TP + FP)
recall = _ratio(TP, TP + FN)
f1 = _ratio(2 * precision * recall, precision + recall)
accuracy = _ratio(TP + TN, total)

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1 Score: " + str(f1))
print("Accuracy: " + str(accuracy))