# Proof of Concept Strawman

This notebook demonstrates how simple methods, such as training several gradient-boosted tree classifiers, can help us achieve our goals. The approach takes inspiration from Talos and the UCLNLP group.

In [1]:
import pandas as pd
import tensorflow as tf
from nltk.tokenize import word_tokenize
from helpers import preprocess_data
from ngram import getBigram, getTrigram, getFourgram
from featureExtractors import CountFeatureGenerator, TfidfFeatureGenerator
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_recall_curve, plot_precision_recall_curve
import pickle

## Preprocessing the data

In [2]:
# When True, the preprocessed dataframe is loaded from df.pickle and every
# `if not USE_CACHE:` preprocessing cell below is skipped. Set to False to rebuild
# the dataframe from the CSV files (and optionally re-save the cache).
USE_CACHE = True

In [3]:
# Restore the preprocessed dataframe from the on-disk cache (no-op when USE_CACHE is False).
# NOTE: unpickling can execute arbitrary code -- only load a df.pickle you produced yourself.
if USE_CACHE:
    with open("df.pickle", "rb") as cache_file:
        training_data = pickle.load(cache_file)
    print(f"Training Data Size: {len(training_data)} entries")

Training Data Size: 4518 entries


In [4]:
# Build one dataframe holding, for every stance row, the headline text ('head') and the
# text of the article it refers to ('body'); the raw 'Headline' / 'Body ID' columns are
# dropped once they have been copied / resolved.
if not USE_CACHE:
    bodies = pd.read_csv("train_bodies.csv").set_index('Body ID')
    training_data = pd.read_csv("train_stances.csv")
    training_data['head'] = training_data['Headline']
    # Label-based lookup of each row's article body; the index is reset so the values
    # line up positionally with training_data's rows.
    training_data['body'] = bodies.loc[training_data['Body ID'], 'articleBody'].reset_index(drop=True)
    training_data = training_data.drop(columns=['Body ID', 'Headline'])
    

In [5]:
# Drop the "unrelated" and "discuss" rows, then encode Stance as a binary label:
# 1 for "agree", 0 for every other stance that remains.
if not USE_CACHE:
    is_excluded = training_data['Stance'].isin(['unrelated', 'discuss'])
    training_data = (
        training_data.loc[~is_excluded]
        # assign() returns a new frame, so we never write a column into a filtered
        # slice (that pattern can raise pandas' SettingWithCopyWarning).
        .assign(Stance=lambda frame: (frame['Stance'] == 'agree').astype(int))
        .reset_index(drop=True)
    )

In [6]:
# Lower-case each body / headline and split it into word tokens.
if not USE_CACHE:
    training_data['body_tokens'] = training_data['body'].map(lambda text: word_tokenize(text.lower()))
    training_data['head_tokens'] = training_data['head'].map(lambda text: word_tokenize(text.lower()))

In [7]:
# Derive unigram / bigram / trigram / fourgram columns for the body first, then the
# headline. Unigrams come from preprocess_data on the raw text; the higher-order
# n-grams are built from those unigrams with '_' as the joining string.
if not USE_CACHE:
    ngram_builders = (
        ('bigrams', getBigram),
        ('trigrams', getTrigram),
        ('fourgrams', getFourgram),
    )
    for part in ('body', 'head'):
        unigram_col = f'{part}_unigrams'
        training_data[unigram_col] = training_data[part].map(lambda text: preprocess_data(text))
        for suffix, make_ngrams in ngram_builders:
            training_data[f'{part}_{suffix}'] = training_data[unigram_col].map(
                lambda unigrams: make_ngrams(unigrams, '_')
            )

In [8]:
# Rebuild plain-text strings from the unigram lists: one for the headline, one for the
# body, and one with both separated by a single space.
if not USE_CACHE:
    training_data['head_clean'] = training_data['head_unigrams'].map(' '.join)
    training_data['body_clean'] = training_data['body_unigrams'].map(' '.join)
    training_data['all_text'] = training_data['head_clean'] + ' ' + training_data['body_clean']

In [9]:
# Persist the freshly built dataframe to df.pickle, but only after the user confirms
# with exactly 'y'; any other answer leaves the existing cache file untouched.
if not USE_CACHE:
    if input("Are you sure you want to save? ") != 'y':
        print('aborted')
    else:
        with open("df.pickle", 'wb') as cache_file:
            pickle.dump(training_data, cache_file)
        print('saved')

In [10]:
# Preview the first rows of the prepared dataframe (loaded from cache or rebuilt above).
training_data.head()

Unnamed: 0,Stance,head,body,body_tokens,head_tokens,body_unigrams,body_bigrams,body_trigrams,body_fourgrams,head_unigrams,head_bigrams,head_trigrams,head_fourgrams,head_clean,body_clean,all_text
0,1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,"[hundreds, of, palestinians, were, evacuated, ...","[hundreds, of, palestinians, flee, floods, in,...","[hundr, palestinian, evacu, home, sunday, morn...","[hundr_palestinian, palestinian_evacu, evacu_h...","[hundr_palestinian_evacu, palestinian_evacu_ho...","[hundr_palestinian_evacu_home, palestinian_eva...","[hundr, palestinian, flee, flood, gaza, israel...","[hundr_palestinian, palestinian_flee, flee_flo...","[hundr_palestinian_flee, palestinian_flee_floo...","[hundr_palestinian_flee_flood, palestinian_fle...",hundr palestinian flee flood gaza israel open dam,hundr palestinian evacu home sunday morn isra ...,hundr palestinian flee flood gaza israel open ...
1,0,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...","[fear, not, arachnophobes, ,, the, story, of, ...","[spider, burrowed, through, tourist, 's, stoma...","[fear, arachnophob, stori, bunburi, spiderman,...","[fear_arachnophob, arachnophob_stori, stori_bu...","[fear_arachnophob_stori, arachnophob_stori_bun...","[fear_arachnophob_stori_bunburi, arachnophob_s...","[spider, burrow, tourist, stomach, chest]","[spider_burrow, burrow_tourist, tourist_stomac...","[spider_burrow_tourist, burrow_tourist_stomach...","[spider_burrow_tourist_stomach, burrow_tourist...",spider burrow tourist stomach chest,fear arachnophob stori bunburi spiderman might...,spider burrow tourist stomach chest fear arach...
2,1,'Nasa Confirms Earth Will Experience 6 Days of...,Thousands of people have been duped by a fake ...,"[thousands, of, people, have, been, duped, by,...","['nasa, confirms, earth, will, experience, 6, ...","[thousand, peopl, dupe, fake, news, stori, cla...","[thousand_peopl, peopl_dupe, dupe_fake, fake_n...","[thousand_peopl_dupe, peopl_dupe_fake, dupe_fa...","[thousand_peopl_dupe_fake, peopl_dupe_fake_new...","[nasa, confirm, earth, experi, day, total, dar...","[nasa_confirm, confirm_earth, earth_experi, ex...","[nasa_confirm_earth, confirm_earth_experi, ear...","[nasa_confirm_earth_experi, confirm_earth_expe...",nasa confirm earth experi day total dark decem...,thousand peopl dupe fake news stori claim nasa...,nasa confirm earth experi day total dark decem...
3,1,Banksy 'Arrested & Real Identity Revealed' Is ...,If you’ve seen a story floating around on your...,"[if, you, ’, ve, seen, a, story, floating, aro...","[banksy, 'arrested, &, real, identity, reveale...","[seen, stori, float, around, facebook, feed, b...","[seen_stori, stori_float, float_around, around...","[seen_stori_float, stori_float_around, float_a...","[seen_stori_float_around, stori_float_around_f...","[banksi, arrest, real, ident, reveal, hoax, la...","[banksi_arrest, arrest_real, real_ident, ident...","[banksi_arrest_real, arrest_real_ident, real_i...","[banksi_arrest_real_ident, arrest_real_ident_r...",banksi arrest real ident reveal hoax last year,seen stori float around facebook feed banksi g...,banksi arrest real ident reveal hoax last year...
4,1,Woman detained in Lebanon is not al-Baghdadi's...,An Iraqi official denied that a woman detained...,"[an, iraqi, official, denied, that, a, woman, ...","[woman, detained, in, lebanon, is, not, al-bag...","[iraqi, offici, deni, woman, detain, lebanon, ...","[iraqi_offici, offici_deni, deni_woman, woman_...","[iraqi_offici_deni, offici_deni_woman, deni_wo...","[iraqi_offici_deni_woman, offici_deni_woman_de...","[woman, detain, lebanon, al, baghdadi, wife, i...","[woman_detain, detain_lebanon, lebanon_al, al_...","[woman_detain_lebanon, detain_lebanon_al, leba...","[woman_detain_lebanon_al, detain_lebanon_al_ba...",woman detain lebanon al baghdadi wife iraq say,iraqi offici deni woman detain lebanon wife ab...,woman detain lebanon al baghdadi wife iraq say...


## Scratchwork

In [26]:
# Inspect the training split (tuple slot 1) of the first feature table.
# NOTE(review): tree_training_datasets is only assigned in the later
# "The Simple ML Magic" cell, so unless it is also defined somewhere not shown here,
# a top-to-bottom run on a fresh kernel would raise NameError at this point --
# move this cell below that one or delete it.
tree_training_datasets[0][1]

Unnamed: 0,count_of_head_unigrams,count_of_unique_head_unigrams,ratio_of_unique_head_unigrams,count_of_head_bigrams,count_of_unique_head_bigrams,ratio_of_unique_head_bigrams,count_of_head_trigrams,count_of_unique_head_trigrams,ratio_of_unique_head_trigrams,count_of_body_unigrams,...,count_of_unique_body_trigrams,ratio_of_unique_body_trigrams,count_of_head_unigrams_in_body,ratio_of_head_unigrams_in_body,count_of_head_bigrams_in_body,ratio_of_head_bigrams_in_body,count_of_head_trigrams_in_body,ratio_of_head_trigrams_in_body,len_sent_head,len_sent_body
0,8,8,1.0,7,7,1.0,6,6,1.0,258,...,249,0.972656,7.0,0.875000,4.0,0.571429,1.0,0.166667,1,17
1,5,5,1.0,4,4,1.0,3,3,1.0,297,...,295,1.000000,3.0,0.600000,1.0,0.250000,0.0,0.000000,1,35
2,13,13,1.0,12,12,1.0,11,11,1.0,296,...,259,0.880952,13.0,1.000000,11.0,0.916667,9.0,0.818182,1,20
3,8,8,1.0,7,7,1.0,6,6,1.0,200,...,197,0.994949,5.0,0.625000,1.0,0.142857,0.0,0.000000,1,11
4,8,8,1.0,7,7,1.0,6,6,1.0,367,...,327,0.895890,7.0,0.875000,5.0,0.714286,3.0,0.500000,1,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3383,6,6,1.0,5,5,1.0,4,4,1.0,283,...,281,1.000000,6.0,1.000000,2.0,0.400000,0.0,0.000000,1,23
3384,8,8,1.0,7,7,1.0,6,6,1.0,127,...,125,1.000000,3.0,0.375000,0.0,0.000000,0.0,0.000000,1,10
3385,10,10,1.0,9,9,1.0,8,8,1.0,101,...,98,0.989899,4.0,0.400000,0.0,0.000000,0.0,0.000000,1,7
3386,10,10,1.0,9,9,1.0,8,8,1.0,158,...,156,1.000000,8.0,0.800000,2.0,0.222222,0.0,0.000000,1,22


## The Simple ML Magic

In [14]:
# Build the feature tables and split them (and the labels) into train / test parts.
# The split is positional: the first 75% of the rows train, the remaining 25% test.
partition = int(0.75 * len(training_data))

features = [CountFeatureGenerator(), TfidfFeatureGenerator()]

# Each generator gets its own copy of the dataframe to process.
for feature in features:
    feature.process(training_data.copy())

# One (generator name, train rows, test rows) tuple per table a generator yields.
tree_training_datasets = [
    (feature._name, training_dataset[:partition], training_dataset[partition:])
    for feature in features
    for training_dataset in feature.read()
]

stance_labels = training_data['Stance'].values
y_train, y_test = stance_labels[:partition], stance_labels[partition:]

## Accuracy & Results

In [27]:
# Train one GradientBoostingClassifier per feature table and report its F1 score on the
# held-out split. (Precision-recall curves are not plotted here yet.)
for (feature_name, x_train, x_test) in tree_training_datasets:
    # A fixed random_state makes repeated runs produce the same model and score.
    clf = GradientBoostingClassifier(n_estimators=300, max_depth=5, random_state=0)
    clf.fit(x_train, y_train)
    guess = clf.predict(x_test)
    score = f1_score(y_test, guess)
    print(f"f1 score on {feature_name}: {score}")

f1 score on countFeatureGenerator: 0.9111570247933884
f1 score on tfidfFeatureGenerator: 0.8948211425520555
f1 score on tfidfFeatureGenerator: 0.8947087119187601
f1 score on tfidfFeatureGenerator: 0.8789873417721519


In [28]:
# Merge all tables into one giant table, and run a few ensemble methods on it
# NOTE(review): unfinished -- the loop body below is the bare expression `master_df`,
# which neither modifies the frame nor displays anything, so master_df stays the empty
# DataFrame created here. TODO: implement the merge (and the ensemble step) described above.
master_df = pd.DataFrame()
for (_, x_train, x_test) in tree_training_datasets:
    master_df

## Conclusions