# Proof of Concept Strawman

Here we demonstrate how simple methods, such as several gradient-boosted tree classifiers trained on different feature sets, can help achieve our goals. The approach is inspired by the work of Talos and the UCLNLP group.

In [16]:
import pandas as pd
import tensorflow as tf
from nltk.tokenize import word_tokenize
from helpers import preprocess_data
from ngram import getBigram, getTrigram, getFourgram
from featureExtractors import CountFeatureGenerator, TfidfFeatureGenerator
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_recall_curve, plot_precision_recall_curve
import pickle

## Preprocessing the data

In [2]:
# When True, the preprocessed frame is restored from df.pickle and every
# preprocessing cell guarded by `if not USE_CACHE:` is skipped.
USE_CACHE = False

In [3]:
# Cached path: restore the preprocessed frame from df.pickle instead of rebuilding it.
# NOTE: unpickling can execute arbitrary code, so only load a df.pickle you produced yourself.
if USE_CACHE:
    with open("df.pickle", "rb") as handle:
        training_data = pickle.load(handle)
    print(f"Training Data Size: {len(training_data)} entries")

In [20]:
# Load stances and article bodies, then attach each row's article text so that
# a single frame holds Stance, head (headline) and body.
if not USE_CACHE:
    bodies = pd.read_csv("train_bodies.csv").set_index('Body ID')
    training_data = pd.read_csv("train_stances.csv")
    training_data['head'] = training_data['Headline']
    # Label-based lookup of every row's Body ID; resetting to a positional index
    # lets the result align with training_data's default 0..n-1 index on assignment.
    body_ids = training_data['Body ID']
    training_data['body'] = bodies['articleBody'].loc[body_ids].reset_index(drop=True)
    # The source columns are no longer needed once head/body exist.
    training_data = training_data.drop(columns=['Body ID', 'Headline'])

In [21]:
# Filter out all "unrelated" and "discuss" items, then encode Stance as a binary
# label: 1 for 'agree', 0 for every other stance that remains.
if not USE_CACHE:
    stance = training_data['Stance']
    # Guard against re-running the cell: once Stance is already 0/1, comparing it
    # with the string 'agree' again would silently turn every label into 0.
    if not pd.api.types.is_integer_dtype(stance):
        keep = ~stance.isin(['unrelated', 'discuss'])
        # .copy() detaches the filtered rows from the original frame, so the column
        # assignment below writes to an independent frame rather than to a slice
        # (which is what triggers pandas' SettingWithCopyWarning).
        training_data = training_data.loc[keep].copy()
        training_data['Stance'] = (training_data['Stance'] == 'agree').astype(int)
        training_data = training_data.reset_index(drop=True)

In [6]:
# Tokenize the lower-cased body and headline text into body_tokens / head_tokens
if not USE_CACHE:
    # Map over the single text column instead of applying row-wise over the whole
    # frame: the values are the same, no per-row Series is built, and an empty
    # frame still yields an (empty) Series that can be assigned as a column.
    for source in ('body', 'head'):
        training_data[f'{source}_tokens'] = training_data[source].map(
            lambda text: word_tokenize(text.lower())
        )

In [9]:
# Create n-grams: for body and head, derive unigrams with preprocess_data and
# build bigrams / trigrams / fourgrams from them, joined with '_'.
if not USE_CACHE:
    # (column suffix, builder) pairs, in the order the columns are added
    ngram_builders = (
        ('bigrams', getBigram),
        ('trigrams', getTrigram),
        ('fourgrams', getFourgram),
    )
    # body first, then head, so the resulting column order is
    # body_unigrams ... body_fourgrams, head_unigrams ... head_fourgrams.
    for source in ('body', 'head'):
        # Column-wise map instead of row-wise DataFrame.apply: each function only
        # needs one column's value, so there is no reason to build a row object.
        unigrams = training_data[source].map(preprocess_data)
        training_data[f'{source}_unigrams'] = unigrams
        for suffix, build in ngram_builders:
            # build=build pins the current builder inside the lambda
            training_data[f'{source}_{suffix}'] = unigrams.map(
                lambda tokens, build=build: build(tokens, '_')
            )

In [12]:
# Create cleaned text from unigrams: space-join each unigram list, then combine
# headline and body text into all_text ("<head_clean> <body_clean>").
if not USE_CACHE:
    # head first, then body, to keep the column order head_clean, body_clean, all_text
    for source in ('head', 'body'):
        # Column-wise map instead of row-wise DataFrame.apply over the whole frame
        training_data[f'{source}_clean'] = training_data[f'{source}_unigrams'].map(' '.join)
    # Both *_clean columns hold plain strings, so element-wise concatenation
    # produces the same text as formatting the two values per row.
    training_data['all_text'] = training_data['head_clean'] + ' ' + training_data['body_clean']

In [13]:
# Persist the preprocessed frame to df.pickle so later sessions can set USE_CACHE = True.
# Asks for confirmation first; only an answer of exactly 'y' writes the file.
if not USE_CACHE:
    confirmed = input("Are you sure you want to save? ") == 'y'
    if confirmed:
        with open("df.pickle", 'wb') as handle:
            pickle.dump(training_data, handle)
    print('saved' if confirmed else 'aborted')

Are you sure you want to save? y
saved


In [17]:
# Preview the first rows of the preprocessed frame to sanity-check the derived columns.
training_data.head()

Unnamed: 0,Stance,head,body,body_tokens,head_tokens,body_unigrams,body_bigrams,body_trigrams,body_fourgrams,head_unigrams,head_bigrams,head_trigrams,head_fourgrams,head_clean,body_clean,all_text
0,1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,"[hundreds, of, palestinians, were, evacuated, ...","[hundreds, of, palestinians, flee, floods, in,...","[hundr, palestinian, evacu, home, sunday, morn...","[hundr_palestinian, palestinian_evacu, evacu_h...","[hundr_palestinian_evacu, palestinian_evacu_ho...","[hundr_palestinian_evacu_home, palestinian_eva...","[hundr, palestinian, flee, flood, gaza, israel...","[hundr_palestinian, palestinian_flee, flee_flo...","[hundr_palestinian_flee, palestinian_flee_floo...","[hundr_palestinian_flee_flood, palestinian_fle...",hundr palestinian flee flood gaza israel open dam,hundr palestinian evacu home sunday morn isra ...,hundr palestinian flee flood gaza israel open ...
1,0,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...","[fear, not, arachnophobes, ,, the, story, of, ...","[spider, burrowed, through, tourist, 's, stoma...","[fear, arachnophob, stori, bunburi, spiderman,...","[fear_arachnophob, arachnophob_stori, stori_bu...","[fear_arachnophob_stori, arachnophob_stori_bun...","[fear_arachnophob_stori_bunburi, arachnophob_s...","[spider, burrow, tourist, stomach, chest]","[spider_burrow, burrow_tourist, tourist_stomac...","[spider_burrow_tourist, burrow_tourist_stomach...","[spider_burrow_tourist_stomach, burrow_tourist...",spider burrow tourist stomach chest,fear arachnophob stori bunburi spiderman might...,spider burrow tourist stomach chest fear arach...
2,1,'Nasa Confirms Earth Will Experience 6 Days of...,Thousands of people have been duped by a fake ...,"[thousands, of, people, have, been, duped, by,...","['nasa, confirms, earth, will, experience, 6, ...","[thousand, peopl, dupe, fake, news, stori, cla...","[thousand_peopl, peopl_dupe, dupe_fake, fake_n...","[thousand_peopl_dupe, peopl_dupe_fake, dupe_fa...","[thousand_peopl_dupe_fake, peopl_dupe_fake_new...","[nasa, confirm, earth, experi, day, total, dar...","[nasa_confirm, confirm_earth, earth_experi, ex...","[nasa_confirm_earth, confirm_earth_experi, ear...","[nasa_confirm_earth_experi, confirm_earth_expe...",nasa confirm earth experi day total dark decem...,thousand peopl dupe fake news stori claim nasa...,nasa confirm earth experi day total dark decem...
3,1,Banksy 'Arrested & Real Identity Revealed' Is ...,If you’ve seen a story floating around on your...,"[if, you, ’, ve, seen, a, story, floating, aro...","[banksy, 'arrested, &, real, identity, reveale...","[seen, stori, float, around, facebook, feed, b...","[seen_stori, stori_float, float_around, around...","[seen_stori_float, stori_float_around, float_a...","[seen_stori_float_around, stori_float_around_f...","[banksi, arrest, real, ident, reveal, hoax, la...","[banksi_arrest, arrest_real, real_ident, ident...","[banksi_arrest_real, arrest_real_ident, real_i...","[banksi_arrest_real_ident, arrest_real_ident_r...",banksi arrest real ident reveal hoax last year,seen stori float around facebook feed banksi g...,banksi arrest real ident reveal hoax last year...
4,1,Woman detained in Lebanon is not al-Baghdadi's...,An Iraqi official denied that a woman detained...,"[an, iraqi, official, denied, that, a, woman, ...","[woman, detained, in, lebanon, is, not, al-bag...","[iraqi, offici, deni, woman, detain, lebanon, ...","[iraqi_offici, offici_deni, deni_woman, woman_...","[iraqi_offici_deni, offici_deni_woman, deni_wo...","[iraqi_offici_deni_woman, offici_deni_woman_de...","[woman, detain, lebanon, al, baghdadi, wife, i...","[woman_detain, detain_lebanon, lebanon_al, al_...","[woman_detain_lebanon, detain_lebanon_al, leba...","[woman_detain_lebanon_al, detain_lebanon_al_ba...",woman detain lebanon al baghdadi wife iraq say,iraqi offici deni woman detain lebanon wife ab...,woman detain lebanon al baghdadi wife iraq say...


In [19]:
import io

# Export the preprocessed frame as CSV (the row index is written as the first column).
# newline="" leaves line-ending handling to pandas' CSV writer, and UTF-8 is set
# explicitly because the article text contains non-ASCII characters.
with io.open("training_data.csv", "w", encoding="utf-8", newline="") as handle:
    training_data.to_csv(handle)

## The Simple ML Magic

In [14]:
# Generate the feature matrices and their train/test splits.
# The split is positional: the first 75% of rows train, the remaining 25% test
# (rows are not shuffled in this cell).
partition = int(0.75 * len(training_data))

features = [CountFeatureGenerator(), TfidfFeatureGenerator()]

# Every generator processes its own copy of the frame.
for generator in features:
    generator.process(training_data.copy())

# One (generator name, train rows, test rows) triple for each dataset a generator's read() yields
tree_training_datasets = [
    (generator._name, dataset[:partition], dataset[partition:])
    for generator in features
    for dataset in generator.read()
]

# Labels are cut at the same row position as the feature matrices.
labels = training_data['Stance'].values
y_train, y_test = labels[:partition], labels[partition:]

## Accuracy & Results

In [55]:
# Run a GradientBoostingClassifier and then an SVC on each feature dataset and
# print the F1 score and plain accuracy on the held-out rows.
# NOTE(review): read accuracy against the positive-label rate y_test.mean(); a model
# scoring exactly that rate may simply be predicting 1 for every row — confirm.

# (text used in the report line, zero-argument factory) so that every dataset
# gets a freshly constructed, unfitted estimator.
classifier_factories = [
    (
        "a GradientBoostingClassifier",
        # random_state is fixed so repeated runs give the same trees and scores
        lambda: GradientBoostingClassifier(n_estimators=300, max_depth=5, random_state=0),
    ),
    ("an SVC", lambda: SVC()),
]

# Outer loop over classifiers keeps the report order: all gradient-boosting
# results first, then all SVC results.
for description, make_classifier in classifier_factories:
    for (feature_name, x_train, x_test) in tree_training_datasets:
        clf = make_classifier().fit(x_train, y_train)
        guess = clf.predict(x_test)
        score = f1_score(y_test, guess)
        print(f"f1 score on {feature_name} using {description}: {score} (accuracy: {(guess == y_test).mean()})")

f1 score on countFeatureGenerator using a GradientBoostingClassifier: 0.9089968976215098 (accuracy: 0.8442477876106195)
f1 score on tfidfFeatureGenerator using a GradientBoostingClassifier: 0.8947087119187601 (accuracy: 0.8256637168141593)
f1 score on tfidfFeatureGenerator using a GradientBoostingClassifier: 0.8952991452991453 (accuracy: 0.8265486725663717)
f1 score on tfidfFeatureGenerator using a GradientBoostingClassifier: 0.8789873417721519 (accuracy: 0.7884955752212389)
f1 score on countFeatureGenerator using an SVC: 0.9045079980610762 (accuracy: 0.8256637168141593)
f1 score on tfidfFeatureGenerator using an SVC: 0.9030883919062832 (accuracy: 0.8389380530973451)
f1 score on tfidfFeatureGenerator using an SVC: 0.9030883919062832 (accuracy: 0.8389380530973451)
f1 score on tfidfFeatureGenerator using an SVC: 0.9045079980610762 (accuracy: 0.8256637168141593)


In [56]:
# Share of test rows labelled 1 ('agree'). This is also the accuracy of a
# classifier that always predicts 1, so it serves as the baseline to beat.
y_test.mean()

0.8256637168141593

In [28]:
# Merge all tables into one giant table, and run a few ensemble methods on it
# TODO: unfinished. master_df stays an empty DataFrame: the loop body below is a
# bare expression with no effect, so nothing is merged and no ensemble is trained.
master_df = pd.DataFrame()
for (_, x_train, x_test) in tree_training_datasets:
    master_df

## Conclusions