In [1]:
# !pip install pandas
# !pip install gensim
# !pip install nltk
# !pip install sklearn
# !pip install numpy
# !pip install openpyxl
# !pip install scipy

In [2]:
import pickle
import pandas as pd
import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from scipy.sparse import hstack

import numpy as np
from sklearn.cluster import KMeans

import re
import string

import warnings
warnings.simplefilter('ignore')

RANDOM_SEED = 42

# Data Cleaning

With our text data, we are going to apply some of the text pre-processing techniques. Since this cleaning process can go on forever. There's always an exception to every cleaning steps. So, we're going to do this process in a few rounds.

**Below are the steps we will be applying to our dataset:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Tokenize text
* Remove stop words
* Stemming & Lemmatization


In [3]:
def clean_up(df):

    def clean_text(text):
        text = text.lower()
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\w*\d\w*', '', text)
        text = re.sub('[‘’“”—…]', '', text)
        text = re.sub('\n', '', text)
        return text

    def lemmatize_text(text):
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(w) for w in text]

    data_clean = pd.DataFrame(df.apply(lambda x: clean_text(x)))

    stop_words = stopwords.words('english')
    data_clean = data_clean.iloc[:,0].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    data_clean = data_clean.apply(lambda x: x.encode('ascii', 'ignore').decode("ascii","ignore"))
    data_clean = data_clean.apply(lambda y: [x for x in word_tokenize(y)])
    data_clean = data_clean.apply(lemmatize_text)


    data_clean = pd.DataFrame(data_clean)

    return data_clean

### Cleaning data from DeepBlue for Bias Model

In [4]:
data_bias=pd.read_excel('assets/pb_spinde.xlsx')
data_bias=data_bias[['article','type']]
data_bias = data_bias.dropna().reset_index(drop=True)
data_bias['type'] = data_bias.type.replace({'center':0,'left':-1,'right':1})
data_bias_cleanned = clean_up(data_bias['article'])
data_bias['tokens'] = data_bias_cleanned['article']
data_bias

Unnamed: 0,article,type,tokens
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,0,"[youtube, say, deepfakes, birther, video, toug..."
1,"FRISCO, Texas — The increasingly bitter disput...",-1,"[frisco, texas, increasingly, bitter, dispute,..."
2,Speaking to the country for the first time fro...,-1,"[speaking, country, first, time, oval, office,..."
3,A professor who teaches climate change classes...,1,"[professor, teach, climate, change, class, sub..."
4,The left has a thing for taking babies hostage...,1,"[left, thing, taking, baby, hostage, perfect, ..."
...,...,...,...
1595,The House Democrats’ coronavirus recovery bill...,1,"[house, democrat, coronavirus, recovery, bill,..."
1596,There are many reasons that Republicans and co...,-1,"[many, reason, republican, conservative, activ..."
1597,A man’s penis becomes a female penis once a ma...,1,"[man, penis, becomes, female, penis, man, decl..."
1598,"As a self-described Democratic socialist, Sen....",1,"[selfdescribed, democratic, socialist, sen, be..."


## Part 1a: Political Bias Modeling

First we want to build a model of political bias using features that will be available in our primary dataset. We'll import the Spinde political bias dataset and select the article text and bias rating columns. Then, we'll vectorize the article text and train the model.

In [5]:
pb_reduced = data_bias.copy()
pb_reduced

Unnamed: 0,article,type,tokens
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,0,"[youtube, say, deepfakes, birther, video, toug..."
1,"FRISCO, Texas — The increasingly bitter disput...",-1,"[frisco, texas, increasingly, bitter, dispute,..."
2,Speaking to the country for the first time fro...,-1,"[speaking, country, first, time, oval, office,..."
3,A professor who teaches climate change classes...,1,"[professor, teach, climate, change, class, sub..."
4,The left has a thing for taking babies hostage...,1,"[left, thing, taking, baby, hostage, perfect, ..."
...,...,...,...
1595,The House Democrats’ coronavirus recovery bill...,1,"[house, democrat, coronavirus, recovery, bill,..."
1596,There are many reasons that Republicans and co...,-1,"[many, reason, republican, conservative, activ..."
1597,A man’s penis becomes a female penis once a ma...,1,"[man, penis, becomes, female, penis, man, decl..."
1598,"As a self-described Democratic socialist, Sen....",1,"[selfdescribed, democratic, socialist, sen, be..."


In [6]:
#Now we'll train the Word2Vec model on our text tokens.
wv_mod = Word2Vec(pb_reduced['tokens'], seed = RANDOM_SEED)
wv_mod.save("models/pb_w2v_model.pkl") # models/pb_classifier_model.pkl
#We'll extract the vectors from the model...
vectors = wv_mod.wv
#...and since each word is a vector of 100 numbers, we'll take the mean of all word vectors in a given article 
#to represent the article as a whole
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in pb_reduced.tokens])

In [7]:
##This is an alternate vectorization, if we wanted to do TF-IDF. We'll not use it here because it didn't really impact performance, but we'll use it for the fake news portion.
# vec = TfidfVectorizer(strip_accents='ascii',stop_words='english',ngram_range=(1,2))
# vec_frame = vec.fit_transform(pb_reduced.article)
# vec_frame

In [8]:
#Finally, we'll train a Random Forest classifier on the vectorized text to predict article bias.
X_train, X_test, y_train, y_test = train_test_split(vec_frame, pb_reduced.type, test_size=0.2, random_state=RANDOM_SEED)

In [9]:
clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(X_train, y_train)
filename = "models/pb_classifier_model.pkl" #models/pb_classifier_model.pkl
pickle.dump(clf, open(filename, 'wb'))
clf.score(X_test, y_test)

0.775

### Cleaning Data from Kaggle

In [10]:
#This was the old fake news dataset; I'll leave the line in for archiving purposes but we've switched to the new one below.
#data_news=pd.read_csv('assets/fn_kagg_train.csv') #assets/fn_kagg_train.csv

##Note: sklearn's TF-IDF vectorizor automatically does stemming and tokenization and I can't get it to work on pre-tokenized text, so we won't actually be using these here. 
#However, I'll leave this in for now
data_news=pd.read_csv('assets/fn_sb.csv') #assets/fn_kagg_train.csv
data_news = data_news.dropna().reset_index(drop=True)
data_text_cleaned = clean_up(data_news['text'])
data_title_cleaned = clean_up(data_news['title'])
data_news['text_tokens'] = data_text_cleaned['text']
data_news['title_tokens'] = data_title_cleaned['title']
data_news = data_news[['title','title_tokens','text','text_tokens','label']]
data_news

Unnamed: 0,title,title_tokens,text,text_tokens,label
0,muslims busted they stole millions in govt ben...,"[muslim, busted, stole, million, govt, benefit]",print they should pay all the back all the mon...,"[print, pay, back, money, plus, interest, enti...",Real
1,re why did attorney general loretta lynch plea...,"[attorney, general, loretta, lynch, plead, fifth]",why did attorney general loretta lynch plead t...,"[attorney, general, loretta, lynch, plead, fif...",Real
2,breaking weiner cooperating with fbi on hillar...,"[breaking, weiner, cooperating, fbi, hillary, ...",red state \nfox news sunday reported this mor...,"[red, state, fox, news, sunday, reported, morn...",Real
3,pin drop speech by father of daughter kidnappe...,"[pin, drop, speech, father, daughter, kidnappe...",email kayla mueller was a prisoner and torture...,"[email, kayla, mueller, prisoner, tortured, is...",Real
4,fantastic trumps point plan to reform healthc...,"[fantastic, trump, point, plan, reform, health...",email healthcare reform to make america great ...,"[email, healthcare, reform, make, america, gre...",Real
...,...,...,...,...,...
2040,why never trumpers must reconsider,"[never, trumpers, must, reconsider]",prof canoes reek of genocide white privilege c...,"[prof, canoe, reek, genocide, white, privilege...",Real
2041,election crossroads socialism or capitalism,"[election, crossroad, socialism, capitalism]",teens walk free after gangrape conviction judg...,"[teen, walk, free, gangrape, conviction, judge...",Real
2042,reasons ill vote for trump,"[reason, ill, vote, trump]",school named for munichmassacre mastermind ter...,"[school, named, munichmassacre, mastermind, te...",Real
2043,our new country women and minorities hit hardest,"[new, country, woman, minority, hit, hardest]",wars and rumors of wars russia unveils satan ...,"[war, rumor, war, russia, unveils, satan, miss...",Real


## Part 1b: Applying the Model

Now, we want to predict the political bias of the target fake news dataset. We'll save these predictions as probabilities, which we'll use as additional features for clustering and trustworthiness prediction.

In [11]:
fn_kag_tok = data_news.copy()
##Removing the article length requirement, as Ray suggested
# fn_kag_tok['tmp'] = fn_kag_tok['text_tokens'].apply(lambda x: len(x))
# fn_kag_tok = fn_kag_tok[fn_kag_tok['tmp']>30]
# fn_kag_tok = fn_kag_tok.drop(columns='tmp').reset_index(drop=True)
fn_kag_tok

Unnamed: 0,title,title_tokens,text,text_tokens,label
0,muslims busted they stole millions in govt ben...,"[muslim, busted, stole, million, govt, benefit]",print they should pay all the back all the mon...,"[print, pay, back, money, plus, interest, enti...",Real
1,re why did attorney general loretta lynch plea...,"[attorney, general, loretta, lynch, plead, fifth]",why did attorney general loretta lynch plead t...,"[attorney, general, loretta, lynch, plead, fif...",Real
2,breaking weiner cooperating with fbi on hillar...,"[breaking, weiner, cooperating, fbi, hillary, ...",red state \nfox news sunday reported this mor...,"[red, state, fox, news, sunday, reported, morn...",Real
3,pin drop speech by father of daughter kidnappe...,"[pin, drop, speech, father, daughter, kidnappe...",email kayla mueller was a prisoner and torture...,"[email, kayla, mueller, prisoner, tortured, is...",Real
4,fantastic trumps point plan to reform healthc...,"[fantastic, trump, point, plan, reform, health...",email healthcare reform to make america great ...,"[email, healthcare, reform, make, america, gre...",Real
...,...,...,...,...,...
2040,why never trumpers must reconsider,"[never, trumpers, must, reconsider]",prof canoes reek of genocide white privilege c...,"[prof, canoe, reek, genocide, white, privilege...",Real
2041,election crossroads socialism or capitalism,"[election, crossroad, socialism, capitalism]",teens walk free after gangrape conviction judg...,"[teen, walk, free, gangrape, conviction, judge...",Real
2042,reasons ill vote for trump,"[reason, ill, vote, trump]",school named for munichmassacre mastermind ter...,"[school, named, munichmassacre, mastermind, te...",Real
2043,our new country women and minorities hit hardest,"[new, country, woman, minority, hit, hardest]",wars and rumors of wars russia unveils satan ...,"[war, rumor, war, russia, unveils, satan, miss...",Real


In [12]:
#Now we'll apply the Word2Vec model we generated above to our tokens to vectorize the text.
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in fn_kag_tok.text_tokens])

In [13]:
#Now we apply the Random Forest classifier to our vectorized text and save out the predicted probabilities.
preds = pd.DataFrame(clf.predict_proba(vec_frame), columns=['dem_bias','neutral','rep_bias'])

In [14]:
#Finally, we'll rejoin the predictions to the original dataset.
fn_kag_reduced = fn_kag_tok.copy().reset_index(drop=True)
fn_kag_reduced = fn_kag_reduced.join(preds)
fn_kag_reduced

Unnamed: 0,title,title_tokens,text,text_tokens,label,dem_bias,neutral,rep_bias
0,muslims busted they stole millions in govt ben...,"[muslim, busted, stole, million, govt, benefit]",print they should pay all the back all the mon...,"[print, pay, back, money, plus, interest, enti...",Real,0.440000,0.17,0.390000
1,re why did attorney general loretta lynch plea...,"[attorney, general, loretta, lynch, plead, fifth]",why did attorney general loretta lynch plead t...,"[attorney, general, loretta, lynch, plead, fif...",Real,0.240000,0.31,0.450000
2,breaking weiner cooperating with fbi on hillar...,"[breaking, weiner, cooperating, fbi, hillary, ...",red state \nfox news sunday reported this mor...,"[red, state, fox, news, sunday, reported, morn...",Real,0.490000,0.07,0.440000
3,pin drop speech by father of daughter kidnappe...,"[pin, drop, speech, father, daughter, kidnappe...",email kayla mueller was a prisoner and torture...,"[email, kayla, mueller, prisoner, tortured, is...",Real,0.369167,0.04,0.590833
4,fantastic trumps point plan to reform healthc...,"[fantastic, trump, point, plan, reform, health...",email healthcare reform to make america great ...,"[email, healthcare, reform, make, america, gre...",Real,0.410000,0.06,0.530000
...,...,...,...,...,...,...,...,...
2040,why never trumpers must reconsider,"[never, trumpers, must, reconsider]",prof canoes reek of genocide white privilege c...,"[prof, canoe, reek, genocide, white, privilege...",Real,0.450000,0.12,0.430000
2041,election crossroads socialism or capitalism,"[election, crossroad, socialism, capitalism]",teens walk free after gangrape conviction judg...,"[teen, walk, free, gangrape, conviction, judge...",Real,0.360000,0.19,0.450000
2042,reasons ill vote for trump,"[reason, ill, vote, trump]",school named for munichmassacre mastermind ter...,"[school, named, munichmassacre, mastermind, te...",Real,0.310000,0.22,0.470000
2043,our new country women and minorities hit hardest,"[new, country, woman, minority, hit, hardest]",wars and rumors of wars russia unveils satan ...,"[war, rumor, war, russia, unveils, satan, miss...",Real,0.280000,0.30,0.420000


## Part 2: Clustering
Once we have all the features we want, we'll do unsupervised clustering. Ideally we'd want to do some evaluations to find an ideal number of clusters, but for now we'll just go with 4.

We'll need to re-vectorize the text, as the political bias vectors won't work here. Also, we'd probably want to vectorize both headline and article body, but for now I'll just vectorize the article body.

In [15]:
#From this point on, we should be concerned with data leakage. Everything prior to now could in theory be applied to live data. We'll go ahead and split the data out into train and test sets.

X_train, X_test, y_train, y_test = train_test_split(fn_kag_reduced.drop(columns=['label']), fn_kag_reduced.label, test_size=0.2, random_state=RANDOM_SEED)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_train

Unnamed: 0,title,title_tokens,text,text_tokens,dem_bias,neutral,rep_bias
0,comment on texas police arrest trump voter for...,"[comment, texas, police, arrest, trump, voter,...",print this entry was posted in uncategorized ...,"[print, entry, posted, uncategorized, bookmark...",0.410000,0.04,0.550000
1,clinton cronies steered millions to foundation...,"[clinton, crony, steered, million, foundation,...",email \n\nrepublican presidential candidate do...,"[email, republican, presidential, candidate, d...",0.430000,0.18,0.390000
2,kilo richtig feines zeug polizei stellt exzell...,"[kilo, richtig, feines, zeug, polizei, stellt,...",ich leide unter platzangst luftballon plaudert...,"[ich, leide, unter, platzangst, luftballon, pl...",0.423778,0.14,0.436222
3,the end game closes in on the clintons as the ...,"[end, game, close, clinton, deep, state, turn,...",the new clinton email scandal keeps getting wo...,"[new, clinton, email, scandal, keep, getting, ...",0.290000,0.13,0.580000
4,comment on breaking podesta told mills dump al...,"[comment, breaking, podesta, told, mill, dump,...",i think megan kelly has screwed everyone neces...,"[think, megan, kelly, screwed, everyone, neces...",0.404221,0.07,0.525779
...,...,...,...,...,...,...,...
1631,potus flotus pocus,"[potus, flotus, pocus]",potentially the worst vp in american history a...,"[potentially, worst, vp, american, history, lo...",0.300000,0.16,0.540000
1632,meet the journalist facing years in jail for ...,"[meet, journalist, facing, year, jail, filming...",experimental gm crops from dow chemical and du...,"[experimental, gm, crop, dow, chemical, dupont...",0.640000,0.04,0.320000
1633,chart of the day mind the russell epic breakdo...,"[chart, day, mind, russell, epic, breakdown, u...",new home sales tightrope walk over a windy ca...,"[new, home, sale, tightrope, walk, windy, cany...",0.100000,0.33,0.570000
1634,email pandemonium and the perpetuation of rape...,"[email, pandemonium, perpetuation, rape, culture]",politics fbi director james comey afp file pho...,"[politics, fbi, director, james, comey, afp, f...",0.180000,0.44,0.380000


In [16]:
##OLD CODE with W2V vectorization!
#Since we already have the tokenized text from above, we can just go ahead and train the new Word2Vec model on those tokens.
# wv_mod = Word2Vec(X_train['text_tokens'], seed = RANDOM_SEED)
# wv_mod.save("models/fn_w2v_model.pkl") #models/fn_w2v_model.pkl
# #Again we'll extract and average the word vectors.
# vectors = wv_mod.wv
# vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in X_train.text_tokens])
# X_train_all = vec_frame.join(X_train.drop(columns=['text','text_tokens']))

In [17]:
#New code with TF-IDF vectorization.
title_vec_mod = TfidfVectorizer(strip_accents='ascii',stop_words='english',ngram_range=(1,2))
text_vec_mod = TfidfVectorizer(strip_accents='ascii',stop_words='english',ngram_range=(1,2))

title_vecs = title_vec_mod.fit_transform(X_train.title)
text_vecs = text_vec_mod.fit_transform(X_train.text)

filename = "models/fn_title_tfidf_model.pkl" #models/pb_classifier_model.pkl
pickle.dump(title_vec_mod, open(filename, 'wb'))
filename = "models/fn_text_tfidf_model.pkl" #models/pb_classifier_model.pkl
pickle.dump(text_vec_mod, open(filename, 'wb'))

#We'll join the new word vectors with the bias estimates we generate above.
X_train_sparse = hstack((title_vecs,text_vecs,X_train.drop(columns=['title','title_tokens','text','text_tokens'])))
X_train_sparse

<1636x376530 sparse matrix of type '<class 'numpy.float64'>'
	with 717063 stored elements in COOrdinate format>

In [18]:
#Finally we'll build our clustering model...
cls = KMeans(4, random_state=RANDOM_SEED).fit(X_train_sparse)
filename = "models/cluster_mod.pkl"
pickle.dump(cls, open(filename, 'wb'))

In [19]:
#...and add the predicted clusters back into the vector dataframe.
X_train['cluster'] = cls.predict(X_train_sparse)
X_train_sparse = hstack((X_train_sparse,pd.DataFrame(X_train['cluster'])))
X_train

Unnamed: 0,title,title_tokens,text,text_tokens,dem_bias,neutral,rep_bias,cluster
0,comment on texas police arrest trump voter for...,"[comment, texas, police, arrest, trump, voter,...",print this entry was posted in uncategorized ...,"[print, entry, posted, uncategorized, bookmark...",0.410000,0.04,0.550000,0
1,clinton cronies steered millions to foundation...,"[clinton, crony, steered, million, foundation,...",email \n\nrepublican presidential candidate do...,"[email, republican, presidential, candidate, d...",0.430000,0.18,0.390000,3
2,kilo richtig feines zeug polizei stellt exzell...,"[kilo, richtig, feines, zeug, polizei, stellt,...",ich leide unter platzangst luftballon plaudert...,"[ich, leide, unter, platzangst, luftballon, pl...",0.423778,0.14,0.436222,3
3,the end game closes in on the clintons as the ...,"[end, game, close, clinton, deep, state, turn,...",the new clinton email scandal keeps getting wo...,"[new, clinton, email, scandal, keep, getting, ...",0.290000,0.13,0.580000,0
4,comment on breaking podesta told mills dump al...,"[comment, breaking, podesta, told, mill, dump,...",i think megan kelly has screwed everyone neces...,"[think, megan, kelly, screwed, everyone, neces...",0.404221,0.07,0.525779,0
...,...,...,...,...,...,...,...,...
1631,potus flotus pocus,"[potus, flotus, pocus]",potentially the worst vp in american history a...,"[potentially, worst, vp, american, history, lo...",0.300000,0.16,0.540000,2
1632,meet the journalist facing years in jail for ...,"[meet, journalist, facing, year, jail, filming...",experimental gm crops from dow chemical and du...,"[experimental, gm, crop, dow, chemical, dupont...",0.640000,0.04,0.320000,3
1633,chart of the day mind the russell epic breakdo...,"[chart, day, mind, russell, epic, breakdown, u...",new home sales tightrope walk over a windy ca...,"[new, home, sale, tightrope, walk, windy, cany...",0.100000,0.33,0.570000,2
1634,email pandemonium and the perpetuation of rape...,"[email, pandemonium, perpetuation, rape, culture]",politics fbi director james comey afp file pho...,"[politics, fbi, director, james, comey, afp, f...",0.180000,0.44,0.380000,2


In [20]:
X_train_sparse

<1636x376531 sparse matrix of type '<class 'numpy.float64'>'
	with 718217 stored elements in COOrdinate format>

## Part 3: Supervised Learning

Now that we have all of our features and clusters, and article body text is already vectorized, we can train a classifier to predict whether a given article is misinformation or not.

In [21]:
##OLD CODE with W2V vectorization!
#We need to apply the vectorization and clustering from above to the test data.
# vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in X_test.text_tokens])
# X_test_all = vec_frame.join(X_test).drop(columns=['text','text_tokens'])
# X_test_all['cluster'] = cls.predict(X_test_all)
# X_test_all

In [22]:
#New Code with TF-IDF Vectorization
test_title_vecs = title_vec_mod.transform(X_test.title)
test_text_vecs = text_vec_mod.transform(X_test.text)

X_test_sparse = hstack((test_title_vecs,test_text_vecs,X_test.drop(columns=['title','title_tokens','text','text_tokens'])))
X_test['cluster'] = cls.predict(X_test_sparse)
X_test_sparse = hstack((X_test_sparse,pd.DataFrame(X_test['cluster'])))

In [23]:
clf = AdaBoostClassifier(random_state=RANDOM_SEED)
clf.fit(X_train_sparse, y_train)
filename = "models/fn_classifier_model.pkl"
pickle.dump(clf, open(filename, 'wb'))
clf.score(X_test_sparse, y_test)

0.8092909535452323

## Part 4: Export CSV

In [41]:
fn_export = fn_kag_reduced.copy()
fn_export

Unnamed: 0,title,title_tokens,text,text_tokens,label,dem_bias,neutral,rep_bias
0,muslims busted they stole millions in govt ben...,"[muslim, busted, stole, million, govt, benefit]",print they should pay all the back all the mon...,"[print, pay, back, money, plus, interest, enti...",Real,0.440000,0.17,0.390000
1,re why did attorney general loretta lynch plea...,"[attorney, general, loretta, lynch, plead, fifth]",why did attorney general loretta lynch plead t...,"[attorney, general, loretta, lynch, plead, fif...",Real,0.240000,0.31,0.450000
2,breaking weiner cooperating with fbi on hillar...,"[breaking, weiner, cooperating, fbi, hillary, ...",red state \nfox news sunday reported this mor...,"[red, state, fox, news, sunday, reported, morn...",Real,0.490000,0.07,0.440000
3,pin drop speech by father of daughter kidnappe...,"[pin, drop, speech, father, daughter, kidnappe...",email kayla mueller was a prisoner and torture...,"[email, kayla, mueller, prisoner, tortured, is...",Real,0.369167,0.04,0.590833
4,fantastic trumps point plan to reform healthc...,"[fantastic, trump, point, plan, reform, health...",email healthcare reform to make america great ...,"[email, healthcare, reform, make, america, gre...",Real,0.410000,0.06,0.530000
...,...,...,...,...,...,...,...,...
2040,why never trumpers must reconsider,"[never, trumpers, must, reconsider]",prof canoes reek of genocide white privilege c...,"[prof, canoe, reek, genocide, white, privilege...",Real,0.450000,0.12,0.430000
2041,election crossroads socialism or capitalism,"[election, crossroad, socialism, capitalism]",teens walk free after gangrape conviction judg...,"[teen, walk, free, gangrape, conviction, judge...",Real,0.360000,0.19,0.450000
2042,reasons ill vote for trump,"[reason, ill, vote, trump]",school named for munichmassacre mastermind ter...,"[school, named, munichmassacre, mastermind, te...",Real,0.310000,0.22,0.470000
2043,our new country women and minorities hit hardest,"[new, country, woman, minority, hit, hardest]",wars and rumors of wars russia unveils satan ...,"[war, rumor, war, russia, unveils, satan, miss...",Real,0.280000,0.30,0.420000


In [42]:
export_title_vecs = title_vec_mod.transform(fn_export.title)
export_text_vecs = text_vec_mod.transform(fn_export.text)

export_sparse = hstack((export_title_vecs,export_text_vecs,fn_export.drop(columns=['label','title','title_tokens','text','text_tokens'])))
fn_export['cluster'] = cls.predict(export_sparse)
export_sparse = hstack((export_sparse,pd.DataFrame(fn_export['cluster'])))
fn_export

Unnamed: 0,title,title_tokens,text,text_tokens,label,dem_bias,neutral,rep_bias,cluster
0,muslims busted they stole millions in govt ben...,"[muslim, busted, stole, million, govt, benefit]",print they should pay all the back all the mon...,"[print, pay, back, money, plus, interest, enti...",Real,0.440000,0.17,0.390000,3
1,re why did attorney general loretta lynch plea...,"[attorney, general, loretta, lynch, plead, fifth]",why did attorney general loretta lynch plead t...,"[attorney, general, loretta, lynch, plead, fif...",Real,0.240000,0.31,0.450000,2
2,breaking weiner cooperating with fbi on hillar...,"[breaking, weiner, cooperating, fbi, hillary, ...",red state \nfox news sunday reported this mor...,"[red, state, fox, news, sunday, reported, morn...",Real,0.490000,0.07,0.440000,3
3,pin drop speech by father of daughter kidnappe...,"[pin, drop, speech, father, daughter, kidnappe...",email kayla mueller was a prisoner and torture...,"[email, kayla, mueller, prisoner, tortured, is...",Real,0.369167,0.04,0.590833,0
4,fantastic trumps point plan to reform healthc...,"[fantastic, trump, point, plan, reform, health...",email healthcare reform to make america great ...,"[email, healthcare, reform, make, america, gre...",Real,0.410000,0.06,0.530000,0
...,...,...,...,...,...,...,...,...,...
2040,why never trumpers must reconsider,"[never, trumpers, must, reconsider]",prof canoes reek of genocide white privilege c...,"[prof, canoe, reek, genocide, white, privilege...",Real,0.450000,0.12,0.430000,3
2041,election crossroads socialism or capitalism,"[election, crossroad, socialism, capitalism]",teens walk free after gangrape conviction judg...,"[teen, walk, free, gangrape, conviction, judge...",Real,0.360000,0.19,0.450000,2
2042,reasons ill vote for trump,"[reason, ill, vote, trump]",school named for munichmassacre mastermind ter...,"[school, named, munichmassacre, mastermind, te...",Real,0.310000,0.22,0.470000,2
2043,our new country women and minorities hit hardest,"[new, country, woman, minority, hit, hardest]",wars and rumors of wars russia unveils satan ...,"[war, rumor, war, russia, unveils, satan, miss...",Real,0.280000,0.30,0.420000,2


In [43]:
fn_export[['Prob_Fake','Prob_Real']] = clf.predict_proba(export_sparse)
fn_export

Unnamed: 0,title,title_tokens,text,text_tokens,label,dem_bias,neutral,rep_bias,cluster,Prob_Fake,Prob_Real
0,muslims busted they stole millions in govt ben...,"[muslim, busted, stole, million, govt, benefit]",print they should pay all the back all the mon...,"[print, pay, back, money, plus, interest, enti...",Real,0.440000,0.17,0.390000,3,0.497578,0.502422
1,re why did attorney general loretta lynch plea...,"[attorney, general, loretta, lynch, plead, fifth]",why did attorney general loretta lynch plead t...,"[attorney, general, loretta, lynch, plead, fif...",Real,0.240000,0.31,0.450000,2,0.501789,0.498211
2,breaking weiner cooperating with fbi on hillar...,"[breaking, weiner, cooperating, fbi, hillary, ...",red state \nfox news sunday reported this mor...,"[red, state, fox, news, sunday, reported, morn...",Real,0.490000,0.07,0.440000,3,0.501331,0.498669
3,pin drop speech by father of daughter kidnappe...,"[pin, drop, speech, father, daughter, kidnappe...",email kayla mueller was a prisoner and torture...,"[email, kayla, mueller, prisoner, tortured, is...",Real,0.369167,0.04,0.590833,0,0.348352,0.651648
4,fantastic trumps point plan to reform healthc...,"[fantastic, trump, point, plan, reform, health...",email healthcare reform to make america great ...,"[email, healthcare, reform, make, america, gre...",Real,0.410000,0.06,0.530000,0,0.336547,0.663453
...,...,...,...,...,...,...,...,...,...,...,...
2040,why never trumpers must reconsider,"[never, trumpers, must, reconsider]",prof canoes reek of genocide white privilege c...,"[prof, canoe, reek, genocide, white, privilege...",Real,0.450000,0.12,0.430000,3,0.499266,0.500734
2041,election crossroads socialism or capitalism,"[election, crossroad, socialism, capitalism]",teens walk free after gangrape conviction judg...,"[teen, walk, free, gangrape, conviction, judge...",Real,0.360000,0.19,0.450000,2,0.492814,0.507186
2042,reasons ill vote for trump,"[reason, ill, vote, trump]",school named for munichmassacre mastermind ter...,"[school, named, munichmassacre, mastermind, te...",Real,0.310000,0.22,0.470000,2,0.488326,0.511674
2043,our new country women and minorities hit hardest,"[new, country, woman, minority, hit, hardest]",wars and rumors of wars russia unveils satan ...,"[war, rumor, war, russia, unveils, satan, miss...",Real,0.280000,0.30,0.420000,2,0.506501,0.493499


In [45]:
fn_export.to_csv("assets/all_predictions.csv")