In [210]:
import pickle
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.cluster import KMeans

import warnings
warnings.simplefilter('ignore')

RANDOM_SEED = 42

# Cleaning The Data - dataset from Deep Blue

With our text data, we are going to apply some of the text pre-processing techniques. Since this cleaning process can go on forever. There's always an exception to every cleaning steps. So, we're going to do this process in a few rounds.

**Below are the steps we will be applying to our dataset:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Tokenize text
* Remove stop words
* Stemming & Lemmatization


In [211]:
data_bias=pd.read_excel('assets\pb_spinde.xlsx')
data_bias=data_bias[['article','type']]
data_bias.isnull().values.any()  # check if there are any missing values

True

In [212]:
data_bias.info()  # there are 100 rows has NaN value in column'article'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   article  1600 non-null   object
 1   type     1700 non-null   object
dtypes: object(2)
memory usage: 26.7+ KB


In [213]:
data_bias = data_bias.dropna().reset_index(drop=True)  # decide to drop the rows with missing value
data_bias.head()

Unnamed: 0,article,type
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,center
1,"FRISCO, Texas — The increasingly bitter disput...",left
2,Speaking to the country for the first time fro...,left
3,A professor who teaches climate change classes...,right
4,The left has a thing for taking babies hostage...,right


In [214]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

# Let's take a look at the updated text
data_clean = pd.DataFrame(data_bias.article.apply(round1))
data_clean

Unnamed: 0,article
0,youtube says no ‘deepfakes’ or ‘birther’ video...
1,frisco texas — the increasingly bitter dispute...
2,speaking to the country for the first time fro...
3,a professor who teaches climate change classes...
4,the left has a thing for taking babies hostage...
...,...
1595,the house democrats’ coronavirus recovery bill...
1596,there are many reasons that republicans and co...
1597,a man’s penis becomes a female penis once a ma...
1598,as a selfdescribed democratic socialist sen be...


In [215]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”—…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.article.apply(round2))
data_clean

Unnamed: 0,article
0,youtube says no deepfakes or birther videos wi...
1,frisco texas the increasingly bitter dispute ...
2,speaking to the country for the first time fro...
3,a professor who teaches climate change classes...
4,the left has a thing for taking babies hostage...
...,...
1595,the house democrats coronavirus recovery bill ...
1596,there are many reasons that republicans and co...
1597,a mans penis becomes a female penis once a man...
1598,as a selfdescribed democratic socialist sen be...


### Removing Stopwords

In [216]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#in case, if we need to remove more stopwords us code below
# stop_words.append('new words')

data_clean['article'] = data_clean['article'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
data_clean

Unnamed: 0,article
0,youtube says deepfakes birther videos toughene...
1,frisco texas increasingly bitter dispute ameri...
2,speaking country first time oval office tuesda...
3,professor teaches climate change classes subje...
4,left thing taking babies hostage perfect examp...
...,...
1595,house democrats coronavirus recovery bill allo...
1596,many reasons republicans conservative activist...
1597,mans penis becomes female penis man declares t...
1598,selfdescribed democratic socialist sen bernie ...


### Tokenize the sentence

In [217]:
data_clean['article'] = data_clean.article.apply(lambda y: [x for x in word_tokenize(y)])
data_clean

Unnamed: 0,article
0,"[youtube, says, deepfakes, birther, videos, to..."
1,"[frisco, texas, increasingly, bitter, dispute,..."
2,"[speaking, country, first, time, oval, office,..."
3,"[professor, teaches, climate, change, classes,..."
4,"[left, thing, taking, babies, hostage, perfect..."
...,...
1595,"[house, democrats, coronavirus, recovery, bill..."
1596,"[many, reasons, republicans, conservative, act..."
1597,"[mans, penis, becomes, female, penis, man, dec..."
1598,"[selfdescribed, democratic, socialist, sen, be..."


### Stemming

This process will chops off the ends of words in the hope of achieving this goal correctly most of the time

**(this is optional)**

In [218]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

data_clean_stemmed = data_clean.copy()

data_clean_stemmed['article'] = data_clean_stemmed['article'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
data_clean_stemmed

Unnamed: 0,article
0,"[youtub, say, deepfak, birther, video, toughen..."
1,"[frisco, texa, increas, bitter, disput, americ..."
2,"[speak, countri, first, time, oval, offic, tue..."
3,"[professor, teach, climat, chang, class, subje..."
4,"[left, thing, take, babi, hostag, perfect, exa..."
...,...
1595,"[hous, democrat, coronavirus, recoveri, bill, ..."
1596,"[mani, reason, republican, conserv, activist, ..."
1597,"[man, peni, becom, femal, peni, man, declar, t..."
1598,"[selfdescrib, democrat, socialist, sen, berni,..."


### Lemmatization

In this process, we use a vocabulary and morphological analysis of words which is aiming to remove inflectional endings only and to return the base or dictionary form of a word

In [219]:
from nltk import WordNetLemmatizer

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

data_clean_lemma = data_clean.copy()

data_clean_lemma['article']= data_clean_lemma['article'].apply(lemmatize_text)
data_clean_lemma

Unnamed: 0,article
0,"[youtube, say, deepfakes, birther, video, toug..."
1,"[frisco, texas, increasingly, bitter, dispute,..."
2,"[speaking, country, first, time, oval, office,..."
3,"[professor, teach, climate, change, class, sub..."
4,"[left, thing, taking, baby, hostage, perfect, ..."
...,...
1595,"[house, democrat, coronavirus, recovery, bill,..."
1596,"[many, reason, republican, conservative, activ..."
1597,"[man, penis, becomes, female, penis, man, decl..."
1598,"[selfdescribed, democratic, socialist, sen, be..."


In [220]:
data_cleanned = data_clean_lemma.copy()

data_bias['type'] = data_bias.type.replace({'center':0,'left':-1,'right':1}) # replace the text labels with numbers
data_cleanned['type'] = data_bias['type']  # combine with the cleaned dataset
data_cleanned

Unnamed: 0,article,type
0,"[youtube, say, deepfakes, birther, video, toug...",0
1,"[frisco, texas, increasingly, bitter, dispute,...",-1
2,"[speaking, country, first, time, oval, office,...",-1
3,"[professor, teach, climate, change, class, sub...",1
4,"[left, thing, taking, baby, hostage, perfect, ...",1
...,...,...
1595,"[house, democrat, coronavirus, recovery, bill,...",1
1596,"[many, reason, republican, conservative, activ...",-1
1597,"[man, penis, becomes, female, penis, man, decl...",1
1598,"[selfdescribed, democratic, socialist, sen, be...",1


NOTE: This data cleaning: **text pre-processing step** could go on for a while, but we are going to stop for now and try it in the modeling part. After that, if we see that the results don't make sense or could be improved, we will come back and make more edits

**More data cleaning steps after tokenization:**
* Stemming 
* Parts of speech tagging
* Create bi-grams or tri-grams (such as 'thank you' into one term)
* Deal with typos
* And more...

In [221]:
# Let's pickle it for later use
data_cleanned.to_pickle("data_cleanned_corpus.pkl")

# Continuing on the Model part (copied from Jame's code)
## Part 1a: Political Bias Modeling

First we want to build a model of political bias using features that will be available in our primary dataset. We'll import the Spinde political bias dataset and select the article text and bias rating columns. Then, we'll vectorize the article text and train the model.

In [222]:
pb_reduced = data_bias.copy()
pb_reduced['tokens'] = data_cleanned['article']
pb_reduced

Unnamed: 0,article,type,tokens
0,YouTube says no ‘deepfakes’ or ‘birther’ video...,0,"[youtube, say, deepfakes, birther, video, toug..."
1,"FRISCO, Texas — The increasingly bitter disput...",-1,"[frisco, texas, increasingly, bitter, dispute,..."
2,Speaking to the country for the first time fro...,-1,"[speaking, country, first, time, oval, office,..."
3,A professor who teaches climate change classes...,1,"[professor, teach, climate, change, class, sub..."
4,The left has a thing for taking babies hostage...,1,"[left, thing, taking, baby, hostage, perfect, ..."
...,...,...,...
1595,The House Democrats’ coronavirus recovery bill...,1,"[house, democrat, coronavirus, recovery, bill,..."
1596,There are many reasons that Republicans and co...,-1,"[many, reason, republican, conservative, activ..."
1597,A man’s penis becomes a female penis once a ma...,1,"[man, penis, becomes, female, penis, man, decl..."
1598,"As a self-described Democratic socialist, Sen....",1,"[selfdescribed, democratic, socialist, sen, be..."


In [223]:
#Now we'll train the Word2Vec model on our text tokens.
wv_mod = Word2Vec(pb_reduced['tokens'], seed = RANDOM_SEED)

In [224]:
#We'll extract the vectors from the model...
vectors = wv_mod.wv
#...and since each word is a vector of 100 numbers, we'll take the mean of all word vectors in a given article 
#to represent the article as a whole
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in pb_reduced.tokens])

In [225]:
#Finally, we'll train a Random Forest classifier on the vectorized text to predict article bias.
X_train, X_test, y_train, y_test = train_test_split(vec_frame, pb_reduced.type, test_size=0.2, random_state=RANDOM_SEED)

In [226]:
clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.809375

# Cleaning The Data - dataset from Kaggle

In [227]:
data_news=pd.read_csv('assets\train.csv')
data_news.isnull().values.any()  # check if there are any missing values

True

In [228]:
data_news.isna().any()[lambda x: x] # check which column has missing values

title     True
author    True
text      True
dtype: bool

In [229]:
print ('Total number of rows',  len(data_news))
print('Number of title with missing value', data_news['title'].isna().sum())
print('Number of author with missing value', data_news['author'].isna().sum())
print('Number of text with missing value', data_news['text'].isna().sum())

# it appears that the rows with miss value only has a small number, decide to remove
data_news = data_news.dropna().reset_index(drop=True)

Total number of rows 20800
Number of title with missing value 558
Number of author with missing value 1957
Number of text with missing value 39


In [230]:
data_news

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
18280,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
18281,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
18282,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
18283,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


### Applying the same cleaning rounds from previous code
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Tokenize text
* Remove stop words
* Stemming & Lemmatization


In [231]:
data_news_clean = pd.DataFrame(data_news.text.apply(round1))
data_news_clean = pd.DataFrame(data_news_clean.text.apply(round2))

More cleaning are required 

**After examine the 'text' columns, it appears that some characters are encoded with UTF-8 format, and ascii value
(such as Ã¼, Ð», â€“.etc)**

In [232]:
# Apply a third round of cleaning

'''Remove some UTF-8 format characters and converting the ascii value which missed from previous rounds'''
data_news_clean['text'] = data_news_clean['text'].apply(lambda x: x.encode('ascii', 'ignore').decode("ascii","ignore"))
data_news_clean

Unnamed: 0,text
0,house dem aide we didnt even see comeys letter...
1,ever get the feeling your life circles the rou...
2,why the truth might get you fired october th...
3,videos civilians killed in single us airstrik...
4,print an iranian woman has been sentenced to s...
...,...
18280,rapper t i unloaded on black celebrities who m...
18281,when the green bay packers lost to the washing...
18282,the macys of today grew from the union of seve...
18283,nato russia to hold parallel exercises in balk...


### Removing Stop, Tokenization, Lemmatization

In [233]:
# this process will run a bit longer since we are doing all three together 

data_news_clean['text'] = data_news_clean['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
data_news_clean['text'] = data_news_clean.text.apply(lambda y: [x for x in word_tokenize(y)])
data_news_lemma = data_news_clean.copy()
data_news_lemma['text']= data_news_lemma['text'].apply(lemmatize_text)

In [234]:
# Let's pickle it for later use
data_news_lemma.to_pickle("data_news_lemma_corpus.pkl")

# Continuing on the Model part (copied from Jame's code)
## Part 1b: Applying the Model

Now, we want to predict the political bias of the target fake news dataset. We'll save these predictions as probabilities, which we'll use as additional features for clustering and trustworthiness prediction.

In [235]:
fn_kag_tok = data_news.copy()
fn_kag_tok['text_tokens'] = data_news_lemma['text']

In [236]:
#Some articles have very few words, so we'll drop any rows with fewer than 30 tokens.
fn_kag_tok['tmp'] = fn_kag_tok['text_tokens'].apply(lambda x: len(x))
fn_kag_tok = fn_kag_tok[fn_kag_tok['tmp']>30]
fn_kag_tok = fn_kag_tok.drop(columns='tmp')
fn_kag_tok

Unnamed: 0,id,title,author,text,label,text_tokens
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,"[house, dem, aide, didnt, even, see, comeys, l..."
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"[ever, get, feeling, life, circle, roundabout,..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,"[truth, might, get, fired, october, tension, i..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,"[video, civilian, killed, single, u, airstrike..."
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,"[print, iranian, woman, sentenced, six, year, ..."
...,...,...,...,...,...,...
18280,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,"[rapper, unloaded, black, celebrity, met, dona..."
18281,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"[green, bay, packer, lost, washington, redskin..."
18282,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,"[macys, today, grew, union, several, great, na..."
18283,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"[nato, russia, hold, parallel, exercise, balka..."


In [237]:
#Now we'll apply the Word2Vec model we generated above to our tokens to vectorize the text.
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in fn_kag_tok.text_tokens])

In [238]:
#Now we apply the Random Forest classifier to our vectorized text and save out the predicted probabilities.
preds = pd.DataFrame(clf.predict_proba(vec_frame), columns=['dem_bias','neutral','rep_bias'])

In [239]:
#Finally, we'll rejoin the predictions to the original dataset.
fn_kag_reduced = fn_kag_tok.copy().reset_index(drop=True)
fn_kag_reduced = fn_kag_reduced.join(preds)
fn_kag_reduced

Unnamed: 0,id,title,author,text,label,text_tokens,dem_bias,neutral,rep_bias
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,"[house, dem, aide, didnt, even, see, comeys, l...",0.46,0.04,0.50
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"[ever, get, feeling, life, circle, roundabout,...",0.57,0.04,0.39
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,"[truth, might, get, fired, october, tension, i...",0.50,0.08,0.42
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,"[video, civilian, killed, single, u, airstrike...",0.19,0.25,0.56
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,"[print, iranian, woman, sentenced, six, year, ...",0.37,0.22,0.41
...,...,...,...,...,...,...,...,...,...
17487,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,"[rapper, unloaded, black, celebrity, met, dona...",0.40,0.05,0.55
17488,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"[green, bay, packer, lost, washington, redskin...",0.27,0.25,0.48
17489,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,"[macys, today, grew, union, several, great, na...",0.34,0.11,0.55
17490,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"[nato, russia, hold, parallel, exercise, balka...",0.41,0.18,0.41


# Continuing on the Model part (copied from Jame's code)
## Part 2: Clustering
Once we have all the features we want, we'll do unsupervised clustering. Ideally we'd want to do some evaluations to find an ideal number of clusters, but for now we'll just go with 4.

We'll need to re-vectorize the text, as the political bias vectors won't work here. Also, we'd probably want to vectorize both headline and article body, but for now I'll just vectorize the article body.

In [240]:
#Since we already have the tokenized text from above, we can just go ahead and train the new Word2Vec model on those tokens.
wv_mod = Word2Vec(fn_kag_reduced['text_tokens'], seed = RANDOM_SEED)

In [241]:
#Again we'll extract and average the word vectors.
vectors = wv_mod.wv
vec_frame = pd.DataFrame([vectors.get_mean_vector(x) for x in fn_kag_reduced.text_tokens])

In [242]:
#We'll join the new word vectors with the bias estimates we generate above.
all_feat_df = vec_frame.join(fn_kag_reduced).drop(columns=['id','title','author','text','label','text_tokens'])

In [243]:
#Finally we'll build our clustering model...
cls = KMeans(4, random_state=RANDOM_SEED).fit(all_feat_df)

In [244]:
#...and add the predicted clusters back into the vector dataframe.
all_feat_df['cluster'] = cls.predict(all_feat_df)
all_feat_df

AttributeError: 'NoneType' object has no attribute 'split'

## Part 3: Supervised Learning

Now that we have all of our features and clusters, and article body text is already vectorized, we can train a classifier to predict whether a given article is misinformation or not.

In [245]:
#We've already done most of the work above, we'll just split up the dataset and build the model.
X_train, X_test, y_train, y_test = train_test_split(all_feat_df, fn_kag_reduced.label, test_size=0.2, random_state=RANDOM_SEED)

In [246]:
clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8985424406973421