In [1]:
# !pip install pandas
# !pip install gensim
# !pip install nltk
# !pip install sklearn
# !pip install numpy
# !pip install openpyxl

In [6]:
import pickle
import pandas as pd
import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.cluster import KMeans

import re
import string

import warnings
warnings.simplefilter('ignore')

RANDOM_SEED = 42

# Data Cleaning

With our text data, we are going to apply some of the text pre-processing techniques. Since this cleaning process can go on forever. There's always an exception to every cleaning steps. So, we're going to do this process in a few rounds.

**Below are the steps we will be applying to our dataset:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Tokenize text
* Remove stop words
* Stemming & Lemmatization


In [28]:
def clean_up(df):

    def clean_text(text):
        text = text.lower()
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\w*\d\w*', '', text)
        text = re.sub('[‘’“”—…]', '', text)
        text = re.sub('\n', '', text)
        return text

    def lemmatize_text(text):
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(w) for w in text]

    data_clean = pd.DataFrame(df.apply(lambda x: clean_text(x)))

    stop_words = stopwords.words('english')
    data_clean = data_clean.iloc[:,0].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    data_clean = data_clean.apply(lambda x: x.encode('ascii', 'ignore').decode("ascii","ignore"))
    data_clean = data_clean.apply(lambda y: [x for x in word_tokenize(y)])
    data_clean = data_clean.apply(lemmatize_text)


    data_clean = pd.DataFrame(data_clean)

    return data_clean

### Cleaning data from DeepBlue for Bias Model

In [None]:
data_bias=pd.read_excel('assets/pb_spinde.xlsx')
data_bias=data_bias[['article','type']]
data_bias = data_bias.dropna().reset_index(drop=True)
data_bias['type'] = data_bias.type.replace({'center':0,'left':-1,'right':1})
data_bias_cleanned = clean_up(data_bias['article'])
data_bias['tokens'] = data_bias_cleanned['article']
data_bias

## Part 1a: Political Bias Modeling

First we want to build a model of political bias using features that will be available in our primary dataset. We'll import the Spinde political bias dataset and select the article text and bias rating columns. Then, we'll vectorize the article text and train the model.

In [29]:
pb_reduced = data_bias.copy()
pb_reduced

Unnamed: 0,article,type,tokens
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,0,"[youtube, say, deepfakes, birther, video, toug..."
1,"FRISCO, Texas — The increasingly bitter disput...",-1,"[frisco, texas, increasingly, bitter, dispute,..."
2,Speaking to the country for the first time fro...,-1,"[speaking, country, first, time, oval, office,..."
3,A professor who teaches climate change classes...,1,"[professor, teach, climate, change, class, sub..."
4,The left has a thing for taking babies hostage...,1,"[left, thing, taking, baby, hostage, perfect, ..."
...,...,...,...
1595,The House Democrats’ coronavirus recovery bill...,1,"[house, democrat, coronavirus, recovery, bill,..."
1596,There are many reasons that Republicans and co...,-1,"[many, reason, republican, conservative, activ..."
1597,A man’s penis becomes a female penis once a ma...,1,"[man, penis, becomes, female, penis, man, decl..."
1598,"As a self-described Democratic socialist, Sen....",1,"[selfdescribed, democratic, socialist, sen, be..."


In [None]:
#Now we'll train the Word2Vec model on our text tokens.
wv_mod = Word2Vec(pb_reduced['tokens'], seed = RANDOM_SEED)
wv_mod.save("models/pb_classifier_model.pkl") # models/pb_classifier_model.pkl

In [12]:
#We'll extract the vectors from the model...
vectors = wv_mod.wv
#...and since each word is a vector of 100 numbers, we'll take the mean of all word vectors in a given article 
#to represent the article as a whole
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in pb_reduced.tokens])

In [13]:
#Finally, we'll train a Random Forest classifier on the vectorized text to predict article bias.
X_train, X_test, y_train, y_test = train_test_split(vec_frame, pb_reduced.type, test_size=0.2, random_state=RANDOM_SEED)

In [14]:
clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(X_train, y_train)
filename = "models/pb_classifier_model.pkl" #models/pb_classifier_model.pkl
pickle.dump(clf, open(filename, 'wb'))
clf.score(X_test, y_test)

0.79375

### Cleaning Data from Kaggle

In [None]:
data_news=pd.read_csv('assets/fn_kagg_train.csv') #assets/fn_kagg_train.csv
data_news = data_news.dropna().reset_index(drop=True)
data_news_cleanned = clean_up(data_news['text'])
data_news['text_tokens'] = data_news_cleanned['text']
data_news

## Part 1b: Applying the Model

Now, we want to predict the political bias of the target fake news dataset. We'll save these predictions as probabilities, which we'll use as additional features for clustering and trustworthiness prediction.

In [18]:
fn_kag_tok = data_news.copy()
fn_kag_tok['tmp'] = fn_kag_tok['text_tokens'].apply(lambda x: len(x))
fn_kag_tok = fn_kag_tok[fn_kag_tok['tmp']>30]
fn_kag_tok = fn_kag_tok.drop(columns='tmp')
fn_kag_tok

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage,text_tokens
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0,"[print, pay, back, money, plus, interest, enti..."
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0,"[attorney, general, loretta, lynch, plead, fif..."
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0,"[red, state, fox, news, sunday, reported, morn..."
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0,"[email, healthcare, reform, make, america, gre..."
5,Barracuda Brigade,2016-11-02T16:31:28.550+02:00,hillary goes absolutely berserk on protester a...,print hillary goes absolutely berserk she expl...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/11/Fu...,bias,Real,hillary goes absolutely berserk protester rall...,print hillary goes absolutely berserk explodes...,1.0,"[print, hillary, go, absolutely, berserk, expl..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2040,Matt Barber,2016-10-27T03:04:50.327+03:00,why never trumpers must reconsider,prof canoes reek of genocide white privilege c...,english,wnd.com,No Image URL,bias,Real,trump vs clinton risk vs disaster,check hillarythemed haunted house anticlinton ...,0.0,"[prof, canoe, reek, genocide, white, privilege..."
2041,Jane Chastain,2016-10-27T03:04:50.704+03:00,election crossroads socialism or capitalism,teens walk free after gangrape conviction judg...,english,wnd.com,No Image URL,bias,Real,gingrich slutshames megyn kelly,good samaritan wearing indian headdress disarm...,1.0,"[teen, walk, free, gangrape, conviction, judge..."
2042,Michael Brown,2016-10-27T03:04:54.788+03:00,reasons ill vote for trump,school named for munichmassacre mastermind ter...,english,wnd.com,http://mobile.wnd.com/files/2011/12/leftfield3...,bias,Real,youtube bans clintons black son,skype sex scam fortune built shame moroccan bo...,1.0,"[school, named, munichmassacre, mastermind, te..."
2043,Ann Coulter,2016-10-27T03:05:01.989+03:00,our new country women and minorities hit hardest,wars and rumors of wars russia unveils satan ...,english,wnd.com,http://www.wnd.com/files/2016/10/danney-willll...,bias,Real,wikileaks bombshells hillary need know,posted eddie skyhigh potency may scare away cr...,1.0,"[war, rumor, war, russia, unveils, satan, miss..."


In [19]:
#Now we'll apply the Word2Vec model we generated above to our tokens to vectorize the text.
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in fn_kag_tok.text_tokens])

In [20]:
#Now we apply the Random Forest classifier to our vectorized text and save out the predicted probabilities.
preds = pd.DataFrame(clf.predict_proba(vec_frame), columns=['dem_bias','neutral','rep_bias'])

In [21]:
#Finally, we'll rejoin the predictions to the original dataset.
fn_kag_reduced = fn_kag_tok.copy().reset_index(drop=True)
fn_kag_reduced = fn_kag_reduced.join(preds)
fn_kag_reduced

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage,text_tokens,dem_bias,neutral,rep_bias
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0,"[print, pay, back, money, plus, interest, enti...",0.370000,0.14,0.490000
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0,"[attorney, general, loretta, lynch, plead, fif...",0.200000,0.19,0.610000
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0,"[red, state, fox, news, sunday, reported, morn...",0.497333,0.06,0.442667
3,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0,"[email, healthcare, reform, make, america, gre...",0.400000,0.10,0.500000
4,Barracuda Brigade,2016-11-02T16:31:28.550+02:00,hillary goes absolutely berserk on protester a...,print hillary goes absolutely berserk she expl...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/11/Fu...,bias,Real,hillary goes absolutely berserk protester rall...,print hillary goes absolutely berserk explodes...,1.0,"[print, hillary, go, absolutely, berserk, expl...",0.530000,0.07,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1727,Matt Barber,2016-10-27T03:04:50.327+03:00,why never trumpers must reconsider,prof canoes reek of genocide white privilege c...,english,wnd.com,No Image URL,bias,Real,trump vs clinton risk vs disaster,check hillarythemed haunted house anticlinton ...,0.0,"[prof, canoe, reek, genocide, white, privilege...",0.570000,0.12,0.310000
1728,Jane Chastain,2016-10-27T03:04:50.704+03:00,election crossroads socialism or capitalism,teens walk free after gangrape conviction judg...,english,wnd.com,No Image URL,bias,Real,gingrich slutshames megyn kelly,good samaritan wearing indian headdress disarm...,1.0,"[teen, walk, free, gangrape, conviction, judge...",0.350000,0.25,0.400000
1729,Michael Brown,2016-10-27T03:04:54.788+03:00,reasons ill vote for trump,school named for munichmassacre mastermind ter...,english,wnd.com,http://mobile.wnd.com/files/2011/12/leftfield3...,bias,Real,youtube bans clintons black son,skype sex scam fortune built shame moroccan bo...,1.0,"[school, named, munichmassacre, mastermind, te...",0.260000,0.19,0.550000
1730,Ann Coulter,2016-10-27T03:05:01.989+03:00,our new country women and minorities hit hardest,wars and rumors of wars russia unveils satan ...,english,wnd.com,http://www.wnd.com/files/2016/10/danney-willll...,bias,Real,wikileaks bombshells hillary need know,posted eddie skyhigh potency may scare away cr...,1.0,"[war, rumor, war, russia, unveils, satan, miss...",0.270000,0.45,0.280000


## Part 2: Clustering
Once we have all the features we want, we'll do unsupervised clustering. Ideally we'd want to do some evaluations to find an ideal number of clusters, but for now we'll just go with 4.

We'll need to re-vectorize the text, as the political bias vectors won't work here. Also, we'd probably want to vectorize both headline and article body, but for now I'll just vectorize the article body.

In [22]:
#From this point on, we should be concerned with data leakage. Everything prior to now could in theory be applied to live data. We'll go ahead and split the data out into train and test sets.

X_train, X_test, y_train, y_test = train_test_split(fn_kag_reduced.drop(columns=['label']), fn_kag_reduced.label, test_size=0.2, random_state=RANDOM_SEED)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
#Since we already have the tokenized text from above, we can just go ahead and train the new Word2Vec model on those tokens.
wv_mod = Word2Vec(X_train['text_tokens'], seed = RANDOM_SEED)
wv_mod.save("models/fn_w2v_model.pkl") #models/fn_w2v_model.pkl

In [24]:
#Again we'll extract and average the word vectors.
vectors = wv_mod.wv
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in X_train.text_tokens])

In [None]:
#We'll join the new word vectors with the bias estimates we generate above.
X_train_all = vec_frame.join(X_train).drop(columns=['id','title','author','text','text_tokens'])

In [None]:
#Finally we'll build our clustering model...
cls = KMeans(4, random_state=RANDOM_SEED).fit(X_train_all)
filename = "models/cluster_mod.pkl"
pickle.dump(cls, open(filename, 'wb'))

In [None]:
#...and add the predicted clusters back into the vector dataframe.
X_train_all['cluster'] = cls.predict(X_train_all)
X_train_all

## Part 3: Supervised Learning

Now that we have all of our features and clusters, and article body text is already vectorized, we can train a classifier to predict whether a given article is misinformation or not.

In [38]:
#We need to apply the vectorization and clustering from above to the test data.
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in X_test.text_tokens])
X_test_all = vec_frame.join(X_test).drop(columns=['id','title','author','text','text_tokens'])
X_test_all['cluster'] = cls.predict(X_test_all)
X_test_all

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,dem_bias,neutral,rep_bias,cluster
0,-0.004069,0.006721,0.025754,0.016396,-0.024229,0.014822,-0.005376,0.004720,-0.014789,-0.061167,...,0.000795,0.015933,0.034293,0.010748,-0.005926,0.024244,0.54,0.08,0.38,2
1,0.007615,0.021091,0.009161,0.035196,-0.026442,0.019393,0.025404,0.076108,-0.016611,-0.025038,...,0.039519,0.040006,0.030866,0.032353,-0.008771,-0.000636,0.59,0.02,0.39,2
2,-0.015921,-0.017781,0.027126,0.039706,-0.072932,-0.000181,0.049133,-0.028249,0.002583,-0.090377,...,0.006657,0.026980,-0.000872,0.036489,0.011354,-0.001185,0.29,0.30,0.41,1
3,0.026510,0.011171,-0.015315,0.002247,0.002525,0.007189,-0.045030,0.004801,-0.010983,-0.036082,...,0.050313,0.056651,-0.007929,0.030120,-0.000486,0.025396,0.32,0.18,0.50,1
4,0.030441,0.015128,0.020871,0.011121,-0.068780,0.009686,0.026041,0.025270,0.028207,-0.053540,...,0.010415,0.031614,0.007730,0.036058,0.023472,0.025822,0.48,0.07,0.45,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3494,0.000622,-0.015777,0.001780,0.022281,-0.019934,0.022051,0.005040,0.027889,-0.018690,-0.037581,...,0.043006,0.008060,0.006654,0.009844,-0.018182,0.003670,0.54,0.06,0.40,2
3495,-0.003556,-0.001852,0.027409,0.018975,-0.044493,-0.012646,-0.015225,-0.051780,-0.019676,-0.039484,...,0.017797,0.033542,0.000850,-0.028908,-0.022689,-0.023097,0.12,0.28,0.60,1
3496,-0.005016,-0.026904,-0.028986,-0.009276,-0.035803,-0.053517,0.012711,-0.000715,-0.050154,-0.062212,...,0.025777,-0.008230,-0.005290,0.012638,-0.013910,-0.047990,0.13,0.24,0.63,1
3497,-0.003582,-0.017635,0.022182,0.072596,-0.027446,0.024533,-0.008718,0.059137,-0.027213,-0.041577,...,0.051082,0.046678,0.002067,0.037541,-0.034289,-0.009565,0.56,0.06,0.38,2


In [39]:
clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(X_train_all, y_train)
filename = "models/fn_classifier_model.pkl"
pickle.dump(clf, open(filename, 'wb'))
clf.score(X_test_all, y_test)

0.8939697056301801

## Part 4: Export CSV

So now we have 2 datasets (3 if you count the unused 'test' set from the Kaggle fake news data). We want to have 1 large dataset to power the dashboard. Let's load each dataset into a dataframe, vectorize the text, predict a cluster, and predict whether it's misinformation or not. We'll start fresh to keep things simple, and we'll just use article body text.

This part can really stand on its own... should probably pull it out into a separate notebook/script? Only issue is we'd have to repeat the code for the NLP steps, unless we save them into their own script as well.

In [40]:
#Load each datafile we want to process
data_bias=pd.read_excel('assets/pb_spinde.xlsx')
data_news_1=pd.read_csv('assets/fn_kagg_train.csv')
data_news_2=pd.read_csv('assets/fn_kagg_test.csv')

In [41]:
#Load all of our models
pb_vec_mod = Word2Vec.load('models/pb_w2v_model.pkl')
fn_vec_mod = Word2Vec.load('models/fn_w2v_model.pkl')
cls = pickle.load(open('models/cluster_mod.pkl', 'rb'))
pb_clf = pickle.load(open('models/pb_classifier_model.pkl', 'rb'))
fn_clf = pickle.load(open('models/fn_classifier_model.pkl', 'rb'))

In [42]:
#Grab just the article text, drop empty cells, and stack the dataframes.
db_text = data_bias.article
dn1_text = data_news_1.text
dn2_text = data_news_2.text
full_data = pd.DataFrame(pd.concat([db_text, dn1_text, dn2_text], axis = 0).dropna().reset_index(drop=True))
full_data.columns = ['full_text']
full_data

Unnamed: 0,full_text
0,YouTube says no ‘deepfakes’ or ‘birther’ video...
1,"FRISCO, Texas — The increasingly bitter disput..."
2,Speaking to the country for the first time fro...
3,A professor who teaches climate change classes...
4,The left has a thing for taking babies hostage...
...,...
27549,Of all the dysfunctions that plague the world’...
27550,WASHINGTON — Gov. John Kasich of Ohio on Tu...
27551,Good morning. (Want to get California Today by...
27552,« Previous - Next » 300 US Marines To Be Deplo...


In [43]:
#Now we'll apply all the NLP steps from above to the full series. This can take a while. I'm skipping the lemmatization and stemming for now...
# full_data_clean = full_data.copy()
# full_data_clean['processing'] = full_data_clean.full_text.apply(lambda x: x.encode('ascii', 'ignore').decode("ascii","ignore"))
# full_data_clean['processing'] = full_data_clean.processing.apply(round1)
# full_data_clean['processing'] = full_data_clean.processing.apply(round2)
# full_data_clean['processing'] = full_data_clean.processing.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
# full_data_clean['tokens'] = full_data_clean.processing.apply(lambda y: [x for x in word_tokenize(y)])

# full_data_clean['tmp'] = full_data_clean['tokens'].apply(lambda x: len(x))
# full_data_clean = full_data_clean[full_data_clean['tmp']>30]
# full_data_clean = full_data_clean.drop(columns=['tmp','processing'])

# full_data_clean.to_pickle('clean_full_data.pkl')
full_data_clean = pd.read_pickle('clean_full_data.pkl').reset_index(drop=True)
full_data_clean

Unnamed: 0,full_text,tokens
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,"[youtube, says, deepfakes, birther, videos, to..."
1,"FRISCO, Texas — The increasingly bitter disput...","[frisco, texas, increasingly, bitter, dispute,..."
2,Speaking to the country for the first time fro...,"[speaking, country, first, time, oval, office,..."
3,A professor who teaches climate change classes...,"[professor, teaches, climate, change, classes,..."
4,The left has a thing for taking babies hostage...,"[left, thing, taking, babies, hostage, perfect..."
...,...,...
25834,Of all the dysfunctions that plague the world’...,"[dysfunctions, plague, worlds, megacities, non..."
25835,WASHINGTON — Gov. John Kasich of Ohio on Tu...,"[washington, gov, john, kasich, ohio, tuesday,..."
25836,Good morning. (Want to get California Today by...,"[good, morning, want, get, california, today, ..."
25837,« Previous - Next » 300 US Marines To Be Deplo...,"[previous, next, us, marines, deployed, russia..."


In [44]:
#Now we'll use the pb vectorizer and generate political bias predictions for the whole dataset.
vectors = pb_vec_mod.wv
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in full_data_clean.tokens])
preds = pd.DataFrame(pb_clf.predict_proba(vec_frame), columns=['dem_bias','neutral','rep_bias'])
full_data_clean = full_data_clean.join(preds)
full_data_clean

Unnamed: 0,full_text,tokens,dem_bias,neutral,rep_bias
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,"[youtube, says, deepfakes, birther, videos, to...",0.350000,0.33,0.320000
1,"FRISCO, Texas — The increasingly bitter disput...","[frisco, texas, increasingly, bitter, dispute,...",0.490000,0.12,0.390000
2,Speaking to the country for the first time fro...,"[speaking, country, first, time, oval, office,...",0.730000,0.08,0.190000
3,A professor who teaches climate change classes...,"[professor, teaches, climate, change, classes,...",0.520000,0.02,0.460000
4,The left has a thing for taking babies hostage...,"[left, thing, taking, babies, hostage, perfect...",0.140000,0.01,0.850000
...,...,...,...,...,...
25834,Of all the dysfunctions that plague the world’...,"[dysfunctions, plague, worlds, megacities, non...",0.310000,0.06,0.630000
25835,WASHINGTON — Gov. John Kasich of Ohio on Tu...,"[washington, gov, john, kasich, ohio, tuesday,...",0.320000,0.38,0.300000
25836,Good morning. (Want to get California Today by...,"[good, morning, want, get, california, today, ...",0.190000,0.18,0.630000
25837,« Previous - Next » 300 US Marines To Be Deplo...,"[previous, next, us, marines, deployed, russia...",0.326923,0.05,0.623077


In [45]:
#Now we'll predict clusters using the fake news w2v model.
vectors = fn_vec_mod.wv
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in full_data_clean.tokens])
cluster_frame = vec_frame.join(full_data_clean).drop(columns=['full_text','tokens'])
full_data_clean['cluster'] = cls.predict(cluster_frame)
full_data_clean

Unnamed: 0,full_text,tokens,dem_bias,neutral,rep_bias,cluster
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,"[youtube, says, deepfakes, birther, videos, to...",0.350000,0.33,0.320000,1
1,"FRISCO, Texas — The increasingly bitter disput...","[frisco, texas, increasingly, bitter, dispute,...",0.490000,0.12,0.390000,2
2,Speaking to the country for the first time fro...,"[speaking, country, first, time, oval, office,...",0.730000,0.08,0.190000,2
3,A professor who teaches climate change classes...,"[professor, teaches, climate, change, classes,...",0.520000,0.02,0.460000,2
4,The left has a thing for taking babies hostage...,"[left, thing, taking, babies, hostage, perfect...",0.140000,0.01,0.850000,0
...,...,...,...,...,...,...
25834,Of all the dysfunctions that plague the world’...,"[dysfunctions, plague, worlds, megacities, non...",0.310000,0.06,0.630000,0
25835,WASHINGTON — Gov. John Kasich of Ohio on Tu...,"[washington, gov, john, kasich, ohio, tuesday,...",0.320000,0.38,0.300000,1
25836,Good morning. (Want to get California Today by...,"[good, morning, want, get, california, today, ...",0.190000,0.18,0.630000,0
25837,« Previous - Next » 300 US Marines To Be Deplo...,"[previous, next, us, marines, deployed, russia...",0.326923,0.05,0.623077,0


In [49]:
#Finally, we'll generate predicted fake news probabilities using our fn classifier.
class_frame = vec_frame.join(full_data_clean).drop(columns=['full_text','tokens'])
preds = pd.DataFrame(fn_clf.predict_proba(class_frame), columns=['not_misinfo', 'misinfo'])
full_data_clean = full_data_clean.join(preds)
full_data_clean


Unnamed: 0,full_text,tokens,dem_bias,neutral,rep_bias,cluster,not_misinfo,misinfo
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,"[youtube, says, deepfakes, birther, videos, to...",0.350000,0.33,0.320000,1,0.55,0.45
1,"FRISCO, Texas — The increasingly bitter disput...","[frisco, texas, increasingly, bitter, dispute,...",0.490000,0.12,0.390000,2,0.98,0.02
2,Speaking to the country for the first time fro...,"[speaking, country, first, time, oval, office,...",0.730000,0.08,0.190000,2,0.83,0.17
3,A professor who teaches climate change classes...,"[professor, teaches, climate, change, classes,...",0.520000,0.02,0.460000,2,0.48,0.52
4,The left has a thing for taking babies hostage...,"[left, thing, taking, babies, hostage, perfect...",0.140000,0.01,0.850000,0,0.27,0.73
...,...,...,...,...,...,...,...,...
25834,Of all the dysfunctions that plague the world’...,"[dysfunctions, plague, worlds, megacities, non...",0.310000,0.06,0.630000,0,0.75,0.25
25835,WASHINGTON — Gov. John Kasich of Ohio on Tu...,"[washington, gov, john, kasich, ohio, tuesday,...",0.320000,0.38,0.300000,1,0.93,0.07
25836,Good morning. (Want to get California Today by...,"[good, morning, want, get, california, today, ...",0.190000,0.18,0.630000,0,0.93,0.07
25837,« Previous - Next » 300 US Marines To Be Deplo...,"[previous, next, us, marines, deployed, russia...",0.326923,0.05,0.623077,0,0.16,0.84


In [51]:
full_data_clean.to_csv("assets/all_predictions.csv")

## To-Do:
1. Put all NLP stuff in a function, so that we can just pass a column of text and get the clean tokens out.
2. (Maybe?) Clean the notebook up into two scripts: one that builds and pickles the models, and one that takes a csv (or set of csvs/excels/whatever), runs all the models, and outputs a csv with all the data needed for the dashboard.
3. ??Include headline text in models
4. Do some deeper evaluations on the clustering and supervised learning portions (supervised portion is not critical, we have good accuracy right now).