In [2]:
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
import random

In [3]:
# Function to load and split the data into training and development sets
def load_data(train_data, limit=0, split=0.8):
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"REAL": not bool(y), "FAKE": bool(y)} for y in labels]
    split = int(len(train_data) * split)
    
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [4]:
# Function to evaluate the model's performance
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "FAKE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [6]:
# Load English language model
nlp = spacy.load('en_core_web_md')

# Load English news dataset CSV file into DataFrame
df = pd.read_csv('./Dataset/bert_dataset.csv')

In [7]:
# Clean the text data
df.replace(to_replace='[\n\r\t]', value=' ', regex=True, inplace=True)

In [9]:
# Create text categorization pipeline with exclusive classes
config = {
    "model": {
        "@architectures": "spacy.TextCatEnsemble.v2",
        "linear_model": {
            "@architectures": "spacy.TextCatBOW.v1",
            "exclusive_classes": True,
            "nO": None,
        },
        "tok2vec": {
            "@architectures": "spacy.Tok2Vec.v1",
            "embed": {"@architectures": "spacy.MultiHashEmbed.v1", "width": 64, "rows": [2000, 2000, 1000, 1000, 1000], "attrs": ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE"], "include_static_vectors": False},
            "encode": {"@architectures": "spacy.MaxoutWindowEncoder.v2", "width": 64, "depth": 2, "window_size": 1, "maxout_pieces": 3},
        }
    }
}

textcat = nlp.create_pipe("textcat", config=config)
nlp.add_pipe(textcat, last=True)
nlp.pipe_names


ConfigValidationError: 

Config validation error
textcat.model.linear_model -> ngram_size	field required
textcat.model.linear_model -> no_output_layer	field required
{'@architectures': 'spacy.TextCatBOW.v1', 'exclusive_classes': True, 'nO': None}

In [None]:
# Add label categories to the text categorization pipeline
textcat.add_label("REAL")
textcat.add_label("FAKE")

In [None]:
# Combine 'title' and 'content' columns to form the input text
df['combined_text'] = df['title'] + ' ' + df['content']


In [None]:
# Create tuples of combined text and labels for training
df['tuples'] = df.apply(lambda row: (row['combined_text'], row['is_fake']), axis=1)
train = df['tuples'].tolist()

In [None]:
# Split data into training and development sets
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(train, split=0.9)
train_data = list(zip(train_texts,[{'cats': cats} for cats in train_cats]))

In [None]:
# Number of iterations for training
n_iter = 20


In [None]:
# Disabling other pipeline components to train only the text categorization component
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
   
    # Training the model
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        losses = {}
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)

        # Evaluate model performance on development set
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

In [None]:
# Save trained model to disk
with nlp.use_params(optimizer.averages):
    nlp.to_disk('model')