In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("data/mix.csv")

#check dataset balance
df['label'].value_counts()

False    23481
True     21417
Name: label, dtype: int64

In [3]:
df["text"]=df["title"]+df["text"]
df = df[['text','label']].dropna()
df.head(10)

Unnamed: 0,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,False
1,Drunk Bragging Trump Staffer Started Russian ...,False
2,Sheriff David Clarke Becomes An Internet Joke...,False
3,Trump Is So Obsessed He Even Has Obama’s Name...,False
4,Pope Francis Just Called Out Donald Trump Dur...,False
5,Racist Alabama Cops Brutalize Black Boy While...,False
6,"Fresh Off The Golf Course, Trump Lashes Out A...",False
7,Trump Said Some INSANELY Racist Stuff Inside ...,False
8,Former CIA Director Slams Trump Over UN Bully...,False
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,False


In [7]:
# Import spaCy ,load model
import spacy
nlp=spacy.load("en_core_web_sm")
nlp.pipe_names

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [9]:
# Adding the built-in textcat component to the pipeline.
textcat=nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"})
nlp.add_pipe(textcat, last=True)
nlp.pipe_names

ConfigValidationError: 

Config validation error

textcat -> architecture        extra fields not permitted
textcat -> exclusive_classes   extra fields not permitted

{'nlp': <spacy.lang.en.English object at 0x7fe4c83a3438>, 'name': 'textcat', 'architecture': 'simple_cnn', 'exclusive_classes': True, 'model': {'@architectures': 'spacy.TextCatEnsemble.v2', 'linear_model': {'@architectures': 'spacy.TextCatBOW.v2', 'exclusive_classes': True, 'ngram_size': 1, 'no_output_layer': False}, 'tok2vec': {'@architectures': 'spacy.Tok2Vec.v2', 'embed': {'@architectures': 'spacy.MultiHashEmbed.v2', 'width': 64, 'rows': [2000, 2000, 1000, 1000, 1000, 1000], 'attrs': ['ORTH', 'LOWER', 'PREFIX', 'SUFFIX', 'SHAPE', 'ID'], 'include_static_vectors': False}, 'encode': {'@architectures': 'spacy.MaxoutWindowEncoder.v2', 'width': 64, 'window_size': 1, 'maxout_pieces': 3, 'depth': 2}}}, 'threshold': 0.5, '@factories': 'textcat'}

In [8]:
# Adding the labels to textcat
textcat.add_label("TRUE")
textcat.add_label("FAKE")

1

In [None]:
# Converting the dataframe into a list of tuples
df['tuples'] = df.apply(lambda row: (row['text'],row['label']), axis=1)
train =df['tuples'].tolist()
train[:10]

In [None]:
import random

def load_data(limit=0, split=0.8):
    train_data=train
    # Shuffle the data
    random.shuffle(train_data)
    texts, labels = zip(*train_data)
    # get the categories for each review
    cats = [{"TRUE": bool(y), "FAKE": not bool(y)} for y in labels]

    # Splitting the training and evaluation data
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

n_texts=23486

# Calling the load_data() function 
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)

# Processing the final format of training data
train_data = list(zip(train_texts,[{'cats': cats} for cats in train_cats]))
train_data[:10]

In [11]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "FAKE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


#("Number of training iterations", "n", int))
n_iter=10

In [None]:
from spacy.util import minibatch, compounding

# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    for i in range(n_iter):
        losses = {}
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

      # Calling the evaluate() function and printing the scores
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

In [None]:
# Testing the model
test_text="I hate this dress"
doc=nlp(test_text)
doc.cats 