In [1]:
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import gzip
import pandas as pd
import numpy as np
import fasttext
from data_utils import load_opinion_data

## Prepare data

In [7]:
x_train, y_train, x_val, y_val, x_test, y_test = load_opinion_data(val_size=0.1)

x_train.shape (100591,)
y_train.shape (100591,)
x_val.shape (11176,)
y_val.shape (11176,)
x_test.shape (12418,)
y_test.shape (12418,)


In [6]:
train_path = f'../articles_wapo_fasttext_train.txt'
valid_path = f'../articles_wapo_fasttext_val.txt'
test_path = f'../articles_wapo_fasttext_test.txt'

y_train_str = ['opinion' if label == 1 else 'news' for label in y_train]
y_val_str = ['opinion' if label == 1 else 'news' for label in y_val]
y_test_str = ['opinion' if label == 1 else 'news' for label in y_test]

with open(train_path, 'w') as f:
    for idx in range(len(x_train)):
        f.write(f'__label__{y_train_str[idx]} {x_train[idx]}\n')

with open(valid_path, 'w') as f:
    for idx in range(len(x_val)):
        f.write(f'__label__{y_val_str[idx]} {x_val[idx]}\n')

with open(test_path, 'w') as f:
    for idx in range(len(x_test)):
        f.write(f'__label__{y_test_str[idx]} {x_test[idx]}\n')

In [3]:
def evaluate_model(model, path_to_validation):
    results = model.test_label(path_to_validation)
    opinion = results['__label__opinion']
    news = results['__label__news']
    opinion = (opinion['precision'], opinion['recall'])
    news = (news['precision'], news['recall'])

    print(f"Opinion — Precision: {opinion[0]} – Recall: {opinion[1]}")
    print(f"News — Precision: {news[0]} – Recall: {news[1]}")

### Should later evaluate the model manually, because FastText messes up Recall calculation.

## Train & Evaluate on validation and test sets

In [4]:
model = fasttext.train_supervised(train_path, epoch=5, wordNgrams=1, lr=0.8)

evaluate_model(model, valid_path)

Opinion — Precision: 0.8358112475759535 – Recall: nan
News — Precision: 0.9582511164191505 – Recall: nan


In [5]:
evaluate_model(model, test_path)

Opinion — Precision: 0.8420427553444181 – Recall: nan
News — Precision: 0.9578908142351407 – Recall: nan
