# fasttext
---


In [None]:
import os
import random
import re

import fasttext
import fasttext.util
import nltk
import numpy as np
import pandas as pd
import wandb
from config import DATASETS, FASTTEXT_PATH, PROJECT_NAME
from nltk.corpus import stopwords  # type: ignore
from nltk.stem import WordNetLemmatizer  # type: ignore
from sklearn.base import BaseEstimator  # type: ignore
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split


In [None]:
# Run configuration for this notebook.
MODEL_NAME = "fasttext"  # reported as the "model" entry of the wandb run config
TRACKING = True  # when True, the run is initialised in wandb and test metrics are logged
FALSE_CLASS_BALANCE = 1.0  # non-claim rows sampled per claim row (passed to load_dataset)

# Dataset descriptor; the cells below read its "base_path", "data" and "name" entries.
dataset = DATASETS["dataset_2014"]

In [None]:
def load_dataset(
    dataset_path: str,
    split_size: float = 0.2,
    false_class_balance: float = 1.0,
) -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]":
    """Load the claim dataset, rebalance the classes and split it.

    All rows whose ``Claim`` column is True are kept. Non-claim rows are
    down-sampled (deterministically, ``random_state=42``) to
    ``len(claims) * false_class_balance`` rows. If fewer non-claim rows are
    available than requested, all of them are used instead of raising.

    Args:
        dataset_path: Path of the CSV file; it must contain a ``Claim`` column.
        split_size: Fraction of the balanced data held out as the test split.
        false_class_balance: Number of non-claim rows to keep per claim row.

    Returns:
        X_train (DataFrame): Train rows (all columns, including ``Claim``)
        X_test (DataFrame): Test rows (all columns, including ``Claim``)
        y_train (Series): Train labels (the ``Claim`` column)
        y_test (Series): Test labels (the ``Claim`` column)
    """
    data = pd.read_csv(dataset_path)

    # Explicit comparisons (rather than truthiness / negation) keep rows with
    # a missing label out of both groups.
    claims = data[data["Claim"] == True]  # noqa: E712
    non_claims = data[data["Claim"] == False]  # noqa: E712

    # DataFrame.sample raises when asked for more rows than exist, so cap the
    # request at the number of non-claim rows actually available.
    n_samples = min(int(len(claims) * false_class_balance), len(non_claims))
    no_claims = non_claims.sample(n=n_samples, random_state=42)
    data_sample = pd.concat([claims, no_claims])

    X_train, X_test, y_train, y_test = train_test_split(
        data_sample, data_sample["Claim"], test_size=split_size, random_state=0
    )
    return X_train, X_test, y_train, y_test

In [None]:
class FastTextPreprocessing(BaseEstimator):
    """Prepare the dataset for fasttext.

    Sentences are cleaned, lemmatized and filtered, then written to a text
    file with one ``__label__<class> <text>`` example per line.
    ``fit`` must be called before ``fasttext_preprocessing`` / ``transform``
    because it creates the lemmatizer stored in ``self.stemmer``.
    """

    # Cleaning patterns, compiled once and applied in this order.
    _NON_WORD = re.compile(r'\W')
    _INNER_SINGLE_CHAR = re.compile(r'\s+[a-zA-Z]\s+')
    # Anchored at the start of the text; an escaped caret (r'\^') would look
    # for a literal '^', which the non-word substitution has already removed.
    _LEADING_SINGLE_CHAR = re.compile(r'^[a-zA-Z]\s+')
    _WHITESPACE_RUN = re.compile(r'\s+')
    _BYTES_PREFIX = re.compile(r'^b\s+')

    def get_feature_names(self):
        """Return the class name as the only feature name."""
        return [self.__class__.__name__]

    def _stop_words(self):
        """Return the English stop words, loading them once per instance."""
        cached = getattr(self, "_en_stop", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            self._en_stop = cached
        return cached

    def fasttext_preprocessing(self, document):
        """Preprocessing pipeline from: https://stackabuse.com/python-for-nlp-working-with-facebook-fasttext-library/

        Args:
            document: Raw text; it is converted with ``str()`` first.

        Returns:
            str: Lower-cased, lemmatized tokens joined by single spaces, with
            stop words and tokens of three characters or fewer removed.
        """
        document = self._NON_WORD.sub(' ', str(document))  # Remove all the special characters
        document = self._INNER_SINGLE_CHAR.sub(' ', document)  # Remove all single characters
        document = self._LEADING_SINGLE_CHAR.sub(' ', document)  # Remove single characters from the start
        document = self._WHITESPACE_RUN.sub(' ', document)  # Collapse whitespace runs
        document = self._BYTES_PREFIX.sub('', document)  # Remove prefixed 'b'
        document = document.lower()

        en_stop = self._stop_words()

        # Lemmatize first, then filter on the lemmatized form.
        lemmas = (self.stemmer.lemmatize(word) for word in document.split())
        tokens = [word for word in lemmas if word not in en_stop and len(word) > 3]

        return ' '.join(tokens)

    def fit(self, X, y):
        """Create the lemmatizer and stop-word cache; ``X`` and ``y`` are unused.

        Returns:
            FastTextPreprocessing: ``self``.
        """
        self.stemmer = WordNetLemmatizer()
        self._stop_words()
        return self

    def transform(self, X, y, name):
        """Write the preprocessed examples to ``dataset_<name>.txt``.

        Args:
            X: Iterable of raw sentences.
            y: Iterable of labels aligned with ``X``; a label equal to True
                becomes ``__label__claim``, anything else ``__label__no_claim``.
            name: Split name used in the output file name.

        Returns:
            str: Path of the written file inside ``FASTTEXT_PATH``.
        """
        path = os.path.join(FASTTEXT_PATH, "dataset_" + name + ".txt")
        with open(path, 'w', encoding='utf-8') as outFile:
            for sentence, label in zip(X, y):
                cleaned_sentence = self.fasttext_preprocessing(sentence)
                label_tag = "__label__claim" if label == True else "__label__no_claim"  # noqa: E712
                outFile.write(f"{label_tag} {cleaned_sentence}\n")

        return path


### 0. Load data

In [None]:
# Balanced dataset split into a training pool and the held-out test set.
train_text_split, X_test, train_labels_split, y_test = load_dataset(
    false_class_balance=FALSE_CLASS_BALANCE,
    dataset_path=os.path.join(dataset["base_path"], dataset["data"]),
)

In [None]:
# Carve a validation split (20 %) out of the training pool: train/validation.
X_train, X_val, y_train, y_val = train_test_split(
    train_text_split,
    train_labels_split,
    test_size=0.2,
    random_state=42,
)

### 1. Encode Features

In [None]:
# fit() ignores its arguments and returns the instance itself, so the
# construction and the fit can be chained.
text_feature = FastTextPreprocessing().fit(None, None)

In [None]:
# Write one fastText-formatted file per split and keep the resulting paths.
train_data_path = text_feature.transform(
    X_train["Sentence"].to_list(),
    y_train.to_list(),
    "train",
)

# The validation file must be built from the held-out validation split;
# writing the training rows here would make autotuning validate on the same
# data the model is trained on.
validation_data_path = text_feature.transform(
    X_val["Sentence"].to_list(),
    y_val.to_list(),
    "validate",
)

test_data_path = text_feature.transform(
    X_test["Sentence"].to_list(),
    y_test.to_list(),
    "test",
)

### 2. Train Embeddings

In [None]:
# Learn word embeddings from the training file.
# NOTE(review): `model` is rebound by the supervised training call further
# down, which is only given the training and validation files, so this
# embedding model is not what gets evaluated later - confirm it is still needed.
model = fasttext.train_unsupervised(input=train_data_path)

### 3. Train classifier

In [None]:
# Number of examples per split (the files hold one example per line); these
# sizes are reported to the experiment tracker.
# The files were written as UTF-8, so read them back with the same encoding
# instead of the platform default, and count lines without loading them all.
with open(train_data_path, "r", encoding="utf-8") as inFile:
    len_train = sum(1 for _ in inFile)
with open(validation_data_path, "r", encoding="utf-8") as inFile:
    len_val = sum(1 for _ in inFile)
with open(test_data_path, "r", encoding="utf-8") as inFile:
    len_test = sum(1 for _ in inFile)

In [None]:
# Train the supervised classifier on the training file; the validation file
# is handed to fastText's autotune option.
model = fasttext.train_supervised(
    autotuneValidationFile=validation_data_path,
    input=train_data_path,
)

In [None]:
# Register the run and its settings with Weights & Biases when tracking is on.
if TRACKING:
    wandb.init(
        project=PROJECT_NAME,
        config={
            # What was trained and how.
            "model": MODEL_NAME,
            "setup": "autotuneValidation",
            "dataset": dataset["name"],
            # Split sizes, counted from the written fastText files.
            "train_data_size": len_train,
            "validation_data_size": len_val,
            "test_data_size": len_test,
            # Hyper-parameters read back from the trained model.
            "batch_size": None,
            "learning_rate": model.lr,
            "epochs": model.epoch,
            "false_class_balance": FALSE_CLASS_BALANCE,
        },
    )

### 4. Evaluate the model

In [None]:
# Evaluate on the preprocessed test file; as the last expression of the cell
# its result is displayed by the notebook.
model.test(test_data_path)  # n, precision, recall

In [None]:
sentences = X_test["Sentence"].to_list()
labels = y_test.to_list()
y_pred = []

# The classifier was trained on text produced by
# FastTextPreprocessing.fasttext_preprocessing, so apply the same cleaning
# before predicting; feeding raw sentences would evaluate the model on input
# it never saw in that form. The cleaning also replaces newlines, which
# fastText's predict() does not accept.
for sentence in sentences:
    label, _ = model.predict(text_feature.fasttext_preprocessing(sentence))
    # Anything other than the explicit no-claim label counts as a claim.
    y_pred.append(label[0] != "__label__no_claim")

In [None]:
# Per-class precision / recall / F1 for the test predictions.
print(classification_report(y_true=labels, y_pred=y_pred))

In [None]:
# Test-set summary metrics, each computed from the same labels and predictions.
f1, recall, precision, accuracy = (
    metric(y_test, y_pred)
    for metric in (f1_score, recall_score, precision_score, accuracy_score)
)

In [None]:
# Report the test metrics to Weights & Biases (one log call per metric) and
# close the run.
if TRACKING:
    for metric_name, metric_value in (
        ('test_f1', f1),
        ('test_recall', recall),
        ('test_precision', precision),
        ('test_accuracy', accuracy),
    ):
        wandb.log({metric_name: metric_value})
    wandb.finish()