In [2]:
import pandas as pd
import spacy
from spacy.training import Example
import random

def load_data(train_data, limit=0, split=0.8):
    """Shuffle labelled samples and split them into training and dev partitions.

    Args:
        train_data: Iterable of ``(text, label)`` pairs. A truthy label marks
            the sample as FAKE, a falsy one as REAL. The caller's object is
            not modified; a shuffled copy is used instead.
        limit: When non-zero, only the last ``limit`` samples of the shuffled
            copy are kept. ``0`` (the default) keeps every sample.
        split: Fraction of the kept samples that goes to the training partition.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. The texts are
        tuples and the cats are lists of ``{"REAL": bool, "FAKE": bool}``
        dictionaries. Empty input yields empty partitions.
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    samples = list(train_data)
    random.shuffle(samples)
    # Make the "no limit" case explicit rather than relying on `[-0:]`
    # happening to return the whole list.
    if limit:
        samples = samples[-limit:]
    if not samples:
        # zip(*[]) yields nothing to unpack, so return empty partitions directly.
        return ((), []), ((), [])
    texts, labels = zip(*samples)
    cats = [{"REAL": not bool(y), "FAKE": bool(y)} for y in labels]
    # Separate name for the cut index so the `split` fraction is not shadowed.
    cut = int(len(samples) * split)

    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score for the text categorizer.

    Every item of ``texts`` is passed to ``tokenizer`` and the results are
    streamed through ``textcat.pipe``. For each returned doc, the scores in
    ``doc.cats`` are compared with the gold dictionary at the same position
    in ``cats``. Labels missing from the gold dictionary and the "FAKE" label
    are skipped. A score or gold value of at least 0.5 counts as positive.

    Returns:
        A dict with the keys "textcat_p", "textcat_r" and "textcat_f".
    """
    docs = list(map(tokenizer, texts))
    # The false-positive / false-negative counters start at a tiny non-zero
    # value so the divisions below never divide by exactly zero.
    true_pos, false_pos, false_neg, true_neg = 0.0, 1e-8, 1e-8, 0.0
    for idx, doc in enumerate(textcat.pipe(docs)):
        expected = cats[idx]
        for label, score in doc.cats.items():
            if label not in expected or label == "FAKE":
                continue
            wanted = expected[label]
            if score >= 0.5:
                if wanted >= 0.5:
                    true_pos += 1.0
                elif wanted < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if wanted < 0.5:
                    true_neg += 1
                elif wanted >= 0.5:
                    false_neg += 1
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    total = precision + recall
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}



In [5]:
# NOTE: this cell reads `nlp`, `textcat`, `train_data`, `dev_texts` and
# `dev_cats`, none of which it defines; the cells that create them must be
# executed first.
n_iter = 20
# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
# The whole training loop lives inside the context manager so the other
# components stay disabled during every update, not only during initialization.
with nlp.select_pipes(disable=other_pipes):  # only train textcat
    # spaCy v3 replacement for `begin_training`; the callback lets the model
    # be initialized from the real training examples.
    optimizer = nlp.initialize(lambda: train_data)

    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    for i in range(n_iter):
        losses = {}
        random.shuffle(train_data)
        batch_sizes = spacy.util.compounding(4., 32., 1.001)
        for batch in spacy.util.minibatch(train_data, size=batch_sizes):
            # Pass the optimizer explicitly so the one created above is the
            # one that applies the weight updates.
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)

        # Print losses after each iteration
        print("Iteration:", i + 1, "Losses:", losses)

        # Calling the evaluate() function and printing the scores
        scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))


KeyError: 'text'

In [6]:
# Blank pipeline for language code 'el' with one text-categorizer component
# that knows the two labels REAL and FAKE.
nlp = spacy.blank('el')
textcat = nlp.add_pipe("textcat", config={"threshold": 0.0})
textcat.add_label("REAL")
textcat.add_label("FAKE")

# Read the dataset and replace every newline, carriage return and tab with a
# single space, chained so the frame is produced in one expression.
df = pd.read_csv('./processed_dataset.csv').replace(
    to_replace='[\n\r\t]', value=' ', regex=True
)

In [7]:
# Fail early with a readable message when the CSV does not carry the columns
# this cell needs, instead of a bare KeyError from inside a row-wise lambda.
required_columns = ('text', 'is_fake')
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        "dataset is missing column(s) {}; available columns: {}".format(
            missing_columns, list(df.columns)
        )
    )

# Pair each text with its label column-wise; this avoids the per-row Python
# call overhead of `df.apply(..., axis=1)`.
df['tuples'] = list(zip(df['text'], df['is_fake']))
train = df['tuples'].tolist()

# 90% of the shuffled samples go to training, the remainder to the dev set.
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(train, split=0.9)

# Wrap every training text and its category dict in a spaCy Example.
train_data = [
    Example.from_dict(nlp.make_doc(text), {"cats": cats})
    for text, cats in zip(train_texts, train_cats)
]

KeyError: 'text'

In [6]:
# Write the pipeline held in `nlp` to the relative path 'model' via its
# `to_disk` method.
nlp.to_disk('model')