<a href="https://colab.research.google.com/github/kozeljko/nlp-models/blob/master/create_multi_fasttext_pretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Train a fastText classifier initialised with pretrained fastText wiki word embeddings (currently `wiki.sl.vec`)


Init environment

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
from google.colab import drive

# Mount Google Drive into the runtime; force_remount=True requests a fresh mount
# even when a previous one is still present.
drive.mount(mountpoint='/content/drive', force_remount=True)

Install FastText

In [None]:
!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip
!cd fastText-0.9.2 && make && pip install .

Install dependencies and import the project's preprocessing module

In [None]:
!pip install lemmagen3 emoji
!pip install --upgrade keras
!pip install tensorflow-addons
!pip install scikit-learn

import sys
sys.path.insert(1, 'drive/MyDrive/nlp/nlp-offensive-language/src')

from preprocessing import *

In [15]:
# Input locations. The paths are relative, so they depend on the notebook's
# working directory — presumably /content, where Drive is mounted; verify when
# running outside Colab.
DATASETS_DIR = "drive/MyDrive/nlp/nlp-offensive-language/datasets/"
FASTTEXT_DIR = "drive/MyDrive/nlp/embeddings/fasttext/"

# Output folder and file name for the trained model.
# MODEL_NAME spells out the dataset and language by hand; update it together
# with LANGUAGE / DATASET below.
MODELS_DIR = "drive/MyDrive/nlp/models/"
MODEL_NAME = "frenk_lgbt_multi_fasttext_wiki_sl"

# Alternative English setup: uncomment LANGUAGE and exactly one DATASET line.
#LANGUAGE = "english"
#DATASET = "english/fox_news/dataset.csv"
#DATASET = "english/gab_and_reddit/dataset.csv"
#DATASET = "english/deep_offense/dataset.csv"
#DATASET = "english/trac_2/dataset.csv"
#DATASET = "english/wiki_detox/dataset_aggression.csv"
#DATASET = "english/wiki_detox/dataset_attack.csv"
#DATASET = "english/wiki_detox/dataset_toxicity.csv"
#DATASET = "english/frenk_lgbt/dataset.csv"
#DATASET = "english/frenk_migrants/dataset.csv"
#DATASET = "english/combined_preprocessed/combined_dataset_train.csv"

# Active setup. LANGUAGE is passed to preprocess(); DATASET is joined onto
# DATASETS_DIR. Note the differing spellings: "slovene" here vs. the
# "slovenian/" dataset folder.
LANGUAGE = "slovene"
#DATASET = "slovenian/frenk_migrants/dataset.csv"
DATASET = "slovenian/frenk_lgbt/dataset.csv"

Preprocess texts

In [16]:
import os, csv
import numpy as np
import pandas as pd
from preprocessing import *

# Preprocessing steps applied to every text before it is handed to fastText.
PREPROCESSING_STEPS = [
  PP_LOWERCASE,
  PP_REMOVE_USERNAME_HANDLES,
  PP_REMOVE_URLS,
  PP_REMOVE_SPECIAL_CHARACTERS,
  PP_REMOVE_NUMBERS,
  PP_REMOVE_EMOJI,
]

# Convert the CSV rows into fastText's supervised format:
#   "__label__<label> <preprocessed text>\n"
# Column 0 is the id, column 1 the text and column 2 the label.
content = []
# The context manager closes the dataset file once reading is finished
# (the bare open() used before left the handle open).
with open(os.path.join(DATASETS_DIR, DATASET), encoding="utf8") as dataset_file:
  csv_read = csv.reader(dataset_file, delimiter=",")
  for line in csv_read:
    # csv.reader yields [] for blank lines; skip those and the header row.
    if not line or line[0] == "id":
      continue

    # preprocess() returns the text as a sequence of tokens; re-join with spaces.
    tokens = preprocess(line[1], PREPROCESSING_STEPS, language=LANGUAGE)
    text = " ".join(tokens)

    label = "__label__" + line[2]

    content.append(label + " " + text + "\n")

print("Loaded dataset")
print(str(len(content)) + " texts")
# Guarded so that an empty dataset reports "0 texts" instead of raising IndexError.
if content:
  print("First: " + content[0])

Loaded dataset
3608 texts
First: __label__NOT kako omogoča saj pa oni že lahko sklenejo zvezo



Save content to files

In [17]:
filename_train = "content.train"
filename_test = "content.test"

# 80/20 split in dataset order: the first 80% of the samples train the model,
# the remainder is held out for evaluation. No shuffling is performed.
total_count = len(content)
train_count = int(0.8 * total_count)

train_content = content[0:train_count]
test_content = content[train_count:]

# Context managers close the files even if a write fails, and the explicit
# encoding matches the utf8 used when the dataset was read, so non-ASCII text
# is written the same way on every platform.
with open(filename_train, "w", encoding="utf8") as f:
  f.writelines(train_content)

with open(filename_test, "w", encoding="utf8") as f:
  f.writelines(test_content)


Build Fasttext model

In [18]:
import fasttext

# Pretrained word vectors used to initialise the classifier's embeddings.
pretrained_vectors = os.path.join(FASTTEXT_DIR, "wiki.sl.vec")

# Train on the file written by the "save content" cell. Reusing filename_train
# (instead of repeating the literal) keeps both cells pointing at the same file.
# NOTE(review): dim presumably has to equal the dimensionality of the pretrained
# vectors — confirm when switching to a different .vec file.
model = fasttext.train_supervised(
  input=filename_train,
  dim=300,
  epoch=15,
  lr=0.9,
  wordNgrams=2,
  pretrainedVectors=pretrained_vectors,
)
model.save_model(os.path.join(MODELS_DIR, MODEL_NAME))

Evaluate fasttext


In [None]:
from sklearn.metrics import classification_report

# Score the held-out samples. Each sample has the form
# "__label__<label> <text>\n".
targets = []
predictions = []
for sample in test_content:
  # Strip only the trailing newline. The earlier slice `[:len(i)-2]` removed two
  # characters and therefore also cut off the last character of every text.
  label, _, text = sample.rstrip("\n").partition(" ")
  targets.append(label)
  # predict() returns (labels, probabilities); keep the top-ranked label.
  # Only the text is passed in, so the gold label is never part of the input.
  predictions.append(model.predict(text)[0][0])

print(f"Classification report: \n{classification_report(targets, predictions, digits=3)}")