# fasttext
---
Features:
- Sentence

Parameter:

In [None]:
from google.colab import drive


In [2]:
!pip install wandb -qqq

In [28]:
import fasttext
import fasttext.util

import os
import pandas as pd
import numpy as np
import random
import timeit
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

import wandb
from src.dataset import load_dataset, fasttext_preprocessing

from config import FASTTEXT_PATH, NLTK_DATA_PATH

### 0. Load data

In [20]:
nltk.data.path.append(NLTK_DATA_PATH)

In [21]:
data = load_dataset()

In [22]:
train_split, test_texts = train_test_split(data, test_size=.2, random_state=42) # train/test
train_texts, val_texts = train_test_split(train_split, test_size=.2, random_state=42) # train/validate

### 1. Encode Features

In [24]:
def fasttext_label_encoder(data, name):
    # encode
    data["Claim"] = data.apply(lambda x: "__label__claim " if x["Claim"] == True else "__label__no_claim ", axis=1)  # prepare labels for fasttext
    data["Sentence"] = data.apply(lambda x: fasttext_preprocessing(x["Sentence"]), axis=1)  # preprocess text
    processed_data = (data['Claim'] + " " + data['Sentence']).to_list()

    # write
    path = os.path.join(FASTTEXT_PATH, "dataset" + name)
    with open(path, 'w', encoding='utf-8') as outFile:
        for line in processed_data:
          outFile.write(line)
          outFile.write("\n")
    return path

In [25]:
train_data_path = fasttext_label_encoder(train_texts, "_train.txt")
validation_data_path = fasttext_label_encoder(val_texts, "_validate.txt")
test_data_path = fasttext_label_encoder(test_texts, "_test.txt")

### 2. Train Embeddings

In [14]:
model = fasttext.train_unsupervised(train_data_path)

Read 0M words
Number of words:  1097
Number of labels: 2
Progress: 100.0% words/sec/thread:  126889 lr:  0.000000 avg.loss:  2.935374 ETA:   0h 0m 0s


### 3. Train classifyer

In [15]:
with open(train_data_path, "r") as inFile:
   len_train = len(inFile.readlines())
with open(validation_data_path, "r") as inFile:
   len_val = len(inFile.readlines())
with open(test_data_path, "r") as inFile:
   len_test = len(inFile.readlines())

In [16]:
model = fasttext.train_supervised(input=train_data_path, autotuneValidationFile=validation_data_path, autotuneDuration=600)

Progress: 100.0% Trials:  101 Best score:  0.769620 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  5873
Number of labels: 2
Progress: 100.0% words/sec/thread:   30434 lr:  0.000000 avg.loss:  0.358404 ETA:   0h 0m 0s


### 4. Evaluate the model

In [33]:
sentences = test_texts["Sentence"].to_list()
labels = test_texts["Claim"].to_list()
predictions = []

for sentence in sentences:
  label, confidence = model.predict(sentence)
  predictions.append(label[0]+" ")

In [36]:
print(classification_report(labels, predictions))

                    precision    recall  f1-score   support

   __label__claim        0.69      0.72      0.70       244
__label__no_claim        0.71      0.68      0.70       250

          accuracy                           0.70       494
         macro avg       0.70      0.70      0.70       494
      weighted avg       0.70      0.70      0.70       494



In [39]:
fasttext.util.download_model('en', if_exists='ignore')  # English

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz


URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [None]:
ft = fasttext.load_model('cc.en.300.bin')

In [None]:
ft.save_model('data/fasttext/cc.en.100.bin')

mm = fasttext.load_model("")

In [None]:
ft = fasttext.load_model(MODEL_PATH)

# 2. Load model
---


In [10]:
ft = fasttext.load_model(MODEL_PATH)



# 3. Train Model
---

In [12]:
wandb.init(project="claim_detect_en",
           config={
               "model": MODEL_NAME,
               "dataset": DATASET,
               "train_data_size": len_train,
               "validation_data_size": len_val,
               "test_data_size": len_test,
           })

[34m[1mwandb[0m: Currently logged in as: [33mjueri[0m (use `wandb login --relogin` to force relogin)


In [13]:
start = timeit.default_timer()

model = fasttext.train_supervised(input=train_data_path, autotuneValidationFile=validation_data_path, autotuneDuration=600)

stop = timeit.default_timer()

print('Time Elapsed: ', stop - start)
wandb.log({'time-elapsed': stop - start})

Time Elapsed:  601.60161913


# 5. Evaluate the model

In [14]:
with open(test_data_path, "r") as file:
    lines = file.readlines()
test_data = pd.read_csv(test_data_path, delimiter="  ", names=["Label", "Sentence"]).dropna()
test_data["Class"] = test_data.apply(lambda x: 1 if x["Label"] == "__label__claim" else 0, axis=1)


  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
sentences = test_data["Sentence"].to_list()
labels = test_data["Class"].to_list()
predictions = []

for sentence in sentences:
  label, confidence = model.predict(sentence)
  predictions.append(1 if label[0] == "__label__claim" else 0)

In [17]:
# wandb.init(project="jupyter-projo")

wandb.log({'test_accuracy': accuracy})
wandb.log({'test_accuracy': recall[0]})
wandb.log({'test_accuracy': fscore[0]})
wandb.log({'test_accuracy': precision[0]})


wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test_accuracy,▁█▄▁
time-elapsed,▁

0,1
test_accuracy,0.97436
time-elapsed,601.60162


# Predict

In [24]:
model.predict("Which baking dish is best to bake a banana bread ?")

(('__label__no_claim',), array([0.50003022]))

# Export model

In [None]:
model_name = wandb.name

In [None]:
model.save_model(os.path.join(BASE_DIR, MODEL_NAME))

('drive/MyDrive/BA/fearful-poltergeist-1/tokenizer_config.json',
 'drive/MyDrive/BA/fearful-poltergeist-1/special_tokens_map.json',
 'drive/MyDrive/BA/fearful-poltergeist-1/vocab.txt',
 'drive/MyDrive/BA/fearful-poltergeist-1/added_tokens.json')

#Load model

In [None]:
import fasttext
loaded_model = fasttext.load_model(os.path.join(BASE_DIR, MODEL_NAME))

Some layers from the model checkpoint at drive/MyDrive/BA/test_output were not used when initializing TFDistilBertForSequenceClassification: ['dropout_59']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at drive/MyDrive/BA/test_output and are newly initialized: ['dropout_116']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
