## Problem

Detecting whether an article is Fake News using Stance Detection. Stance detection takes a headline and body text of an article and classifies the stance of the body text relative to headline. 

In [105]:
import pandas as pd 

import os
import re
import nltk
from sklearn import feature_extraction
from tqdm import tqdm


import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns
from nltk.stem import WordNetLemmatizer

In [106]:
bodies = pd.read_csv('./train_bodies.csv')
stances = pd.read_csv('./train_stances.csv')


# EDA

In [107]:
bodies.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [108]:
stances.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [109]:
stances[stances['Body ID'] == 712]

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1787,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss
3974,Mexico police find mass grave near site 43 stu...,712,unrelated
4936,Mexico Says Missing Students Not Found In Firs...,712,unrelated
5210,New iOS 8 bug can delete all of your iCloud do...,712,unrelated
5863,Return of the Mac: Seth Rogen in talks to star...,712,discuss
6199,Seth Rogen Is Woz,712,discuss
6756,Mexico finds 4 more graves at site of suspecte...,712,unrelated
7526,Are missing students in mass graves found near...,712,unrelated
9003,Mexico prosecutor: Students not in 1st mass gr...,712,unrelated


In [110]:
stances['Body ID'].value_counts()

1921    187
1948    175
40      172
524     171
1549    166
304     154
1385    151
125     145
2367    143
220     141
1438    141
195     140
2296    136
35      131
1786    131
1883    131
2520    127
1034    127
2252    126
1574    125
2307    125
527     125
2175    124
1627    123
2404    123
1289    122
2115    121
2096    120
1040    118
1893    117
       ... 
907       1
370       1
210       1
146       1
114       1
1542      1
63        1
76        1
390       1
515       1
193       1
464       1
355       1
323       1
624       1
282       1
18        1
797       1
701       1
362       1
2311      1
6         1
915       1
70        1
151       1
376       1
140       1
307       1
1066      1
59        1
Name: Body ID, Length: 1683, dtype: int64

In [111]:
stances['Stance'].unique()

array(['unrelated', 'agree', 'disagree', 'discuss'], dtype=object)

In [112]:
print('Bodies shape:', bodies.shape)
print('Stances shape:', stances.shape)


Bodies shape: (1683, 2)
Stances shape: (49972, 3)


In [113]:
stances['Stance'].value_counts()/stances.shape[0]

unrelated    0.731310
discuss      0.178280
agree        0.073601
disagree     0.016809
Name: Stance, dtype: float64

In [114]:
#why are stances rows much larger

In [115]:
bodies.isnull().sum()

Body ID        0
articleBody    0
dtype: int64

In [116]:
stances.isnull().sum()

Headline    0
Body ID     0
Stance      0
dtype: int64

In [117]:
bodies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1683 entries, 0 to 1682
Data columns (total 2 columns):
Body ID        1683 non-null int64
articleBody    1683 non-null object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


In [118]:
stances.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49972 entries, 0 to 49971
Data columns (total 3 columns):
Headline    49972 non-null object
Body ID     49972 non-null int64
Stance      49972 non-null object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [119]:
print("Total stances: " + str(len(stances.Stance)))
print("Total article bodies: " + str(len(bodies.articleBody)))

Total stances: 49972
Total article bodies: 1683


In [120]:
#merge on bodyID 

df_all = stances.merge(bodies, on = 'Body ID')
df_all = df_all.drop('Stance', axis =1)

In [121]:
df_all.isnull().sum()

Headline       0
Body ID        0
articleBody    0
dtype: int64

In [122]:
df_all.shape

(49972, 3)

## Baseline

In [123]:
stances['Stance'].value_counts(normalize=True)

unrelated    0.731310
discuss      0.178280
agree        0.073601
disagree     0.016809
Name: Stance, dtype: float64

## NLP

In [124]:
#body
X_train = df_all[['articleBody']]
y_train = stances['Stance']
X_test = df_all[['articleBody']]
y_test = stances['Stance']

In [125]:
cvec_body = CountVectorizer(stop_words = 'english')

train_body_cvec = cvec_body.fit_transform(X_train['articleBody'])
test_body_cvec = cvec_body.transform(X_test['articleBody'])

lr = LogisticRegression()
lr.fit(train_body_cvec, y_train)
lr.score(test_body_cvec, y_test)

0.7320899703834147

In [126]:
X_train = df_all[['Headline']]
y_train = stances['Stance']
X_test = df_all[['Headline']]
y_test = stances['Stance']

In [127]:
cvec_headline= CountVectorizer(stop_words = 'english')

train_headline_cvec = cvec_headline.fit_transform(X_train['Headline'])
test_headline_cvec = cvec_headline.transform(X_test['Headline'])

lr = LogisticRegression()
lr.fit(train_headline_cvec, y_train)
lr.score(test_headline_cvec, y_test)

0.7313495557512207

In [128]:
#combine headline and body cvec and that will be x_train
#fit model on X_train 
#if model doesnt do well do truncated svd then cosine similarity between bodies and headline which will become a feature 


In [129]:
#do i still need these 
df  = pd.DataFrame(X_test_count.todense(),
                   columns=cvec.get_feature_names(), index = [bodies.articleBody]) 
df.head()


NameError: name 'X_test_count' is not defined

In [None]:
df.sort_index(axis = 0, ascending=False).head()

In [None]:
df.sum().sort_values(ascending=False).head(10)

In [None]:
df = df.reindex_axis(df.sum().sort_values(ascending=False).index, axis =1)
df

# Hashing Vectorizer

In [None]:
#body
X_train = df_all[['articleBody']]
y_train = stances['Stance']
X_test = df_all[['articleBody']]
y_test = stances['Stance']

In [None]:
hvec_body = HashingVectorizer(stop_words = 'english')

train_body_hvec = hvec_body.fit_transform(X_train['articleBody'])
test_body_hvec = hvec_body.transform(X_test['articleBody'])

lr = LogisticRegression()
lr.fit(train_body_hvec, y_train)
lr.score(test_body_hvec, y_test)

In [None]:
#headline
X_train = df_all[['Headline']]
y_train = stances['Stance']
X_test = df_all[['Headline']]
y_test = stances['Stance']

In [None]:
hvec_headline = HashingVectorizer(stop_words = 'english')

train_headline_hvec = hvec_headline.fit_transform(X_train['Headline'])
test_headline_hvec = hvec_headline.transform(X_test['Headline'])

lr = LogisticRegression()
lr.fit(train_headline_hvec, y_train)
lr.score(test_headline_hvec, y_test)

In [None]:
df_hv  = pd.DataFrame(hvec.transform(X_test).todense(), index=[bodies.articleBody])  
df_hv.head()

In [None]:
df_hv.sum().sort_values(ascending=False).head(10)

In [None]:
df_hv = df_hv.reindex_axis(df_hv.sum().sort_values(ascending=False).index, axis =1)
df_hv

In [None]:
#what can i do with this 

# TF IDF 

In [None]:
#body
X_train = df_all[['articleBody']]
y_train = stances['Stance']
X_test = df_all[['articleBody']]
y_test = stances['Stance']

In [None]:
tvec_body = TfidfVectorizer(stop_words = 'english')

train_body_tvec = tvec_body.fit_transform(X_train['articleBody'])
test_body_tvec = tvec_body.transform(X_test['articleBody'])

lr = LogisticRegression()
lr.fit(train_body_tvec, y_train)
lr.score(test_body_tvec, y_test)

In [None]:
#headline
X_train = df_all[['Headline']]
y_train = stances['Stance']
X_test = df_all[['Headline']]
y_test = stances['Stance']

In [None]:
tvec_headline = TfidfVectorizer(stop_words = 'english')

train_headline_tvec = tvec_headline.fit_transform(X_train['Headline'])
test_headline_tvec = tvec_headline.transform(X_test['Headline'])

lr = LogisticRegression()
lr.fit(train_headline_tvec, y_train)
lr.score(test_headline_tvec, y_test)

In [None]:
df_tfidf  = pd.DataFrame(tvec.transform(X_test).todense(),
                   columns=tvec.get_feature_names(),
                   index=[bodies.articleBody])
df_tfidf.head()
#df.transpose().sort_values('bodies.articleBody', ascending=False).head(10).transpose()

In [None]:
df_tfidf.sort_index(axis = 0, ascending=False).head()

In [None]:
df_tfidf.sum().sort_values(ascending=False).head(10)

In [None]:
df_tfidf = df_tfidf.reindex_axis(df_tfidf.sum().sort_values(ascending=False).index, axis =1)
df_tfidf

## LDA

In [132]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
        1. Tokenizes and removes punctuation
        2. Removes stopwords
        3. Stems
        4. Returns a list of the cleaned text
    '''
    if pd.isnull(text):
        return []
    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed=tokenizer.tokenize(text)
    
    # removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # stemming
    #porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    
    text_processed = [wordnet_lemmatizer.lemmatize(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
    except: 
        pass

    return text_processed ## <-- we're keeping our words distinct

In [133]:
bodies['txt_process'] = bodies['articleBody'].apply(text_process)

In [143]:
bodies['txt_process'].head()

0    [small, meteorite, crashed, wooded, area, nica...
1    [last, week, hinted, come, ebola, fear, spread...
2    [newser, wonder, long, quarter, pounder, chees...
3    [posting, photo, gun, toting, child, online, i...
4    [least, 25, suspected, boko, haram, insurgent,...
Name: txt_process, dtype: object

In [135]:
df_all['Headline'] = df_all['Headline'].apply(text_process)

In [136]:
df_all['Headline'].head()

0    [police, find, mass, graf, least, 15, body, ne...
1           [seth, rogen, play, apple, steve, wozniak]
2    [mexico, police, find, mass, grave, near, site...
3    [mexico, say, missing, student, found, first, ...
4          [new, io, 8, bug, delete, icloud, document]
Name: Headline, dtype: object

## Fit LDA Model

In [137]:
from gensim import corpora, models
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

np.random.seed(42)

In [144]:
dictionary = corpora.Dictionary(bodies['txt_process'])

corpus = [dictionary.doc2bow(text) for text in bodies['txt_process']]

ldamodel = models.ldamodel.LdaModel(corpus,                     # pass in our corpus
                                    id2word = dictionary,       # matches each word to its "number" or "spot" in the dictionary
                                    num_topics = 20,             # number of topics T to find
                                    passes = 5,                 # number of passes through corpus; similar to number of epochs
                                    minimum_probability = 0.01) # only include topics above this probability threshold

In [145]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

In [146]:
dictionary = corpora.Dictionary(list(df_all['Headline']))

corpus = [dictionary.doc2bow(text) for text in df_all['Headline']]

ldamodel = models.ldamodel.LdaModel(corpus,                     # pass in our corpus
                                    id2word = dictionary,       # matches each word to its "number" or "spot" in the dictionary
                                    num_topics = 20,             # number of topics T to find
                                    passes = 5,                 # number of passes through corpus; similar to number of epochs
                                    minimum_probability = 0.01) # only include topics above this probability threshold

In [147]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

### preprocessing for LSA

In [149]:
import re

In [150]:
def cleaner(text):
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('\\ufeff', '', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]','',text)
    
    return text

  text = re.sub('[\d]','',text)


In [151]:
df_all['articleBody'] = df_all['articleBody'].map(str)
df_all['articleBody'] = df_all['articleBody'].apply(cleaner)

In [152]:
df_all['articleBody'].iloc[1]

'danny boyle is directing the untitled filmseth rogen is being eyed to play apple cofounder steve wozniak in sonys steve jobs biopicdanny boyle is directing the untitled film based on walter isaacsons book and adapted by aaron sorkin which is one of the most anticipated biopics in recent yearsnegotiations have not yet begun and its not even clear if rogen has an official offer but the producers  scott rudin guymon casady and mark gordon  have set their sights on the talent and are in talksof course this may all be for naught as christian bale the actor who is to play jobs is still in the midst of closing his deal sources say that dealmaking process is in a sensitive stageinsiders say boyle will is flying to los angeles to meet with actress to play one of the female leads an assistant to jobs insiders say that jessica chastain is one of the actresses on the meeting listwozniak known as woz cofounded apple with jobs and ronald wayne he first met jobs when they worked at atari and later w

In [182]:
df_all['Headline'] = df_all['Headline'].map(str)
df_all['Headline'] = df_all['Headline'].apply(cleaner)

### tfidf vectorizer

In [153]:
tfidf_vector = TfidfVectorizer(min_df = 5, stop_words='english')

In [154]:
articleBody_matrix_sparse = tfidf_vector.fit_transform(df_all['articleBody'])
articleBody_matrix_sparse

<49972x34009 sparse matrix of type '<class 'numpy.float64'>'
	with 6942088 stored elements in Compressed Sparse Row format>

In [155]:
articleBody_df_tfdf = pd.DataFrame(articleBody_matrix_sparse.toarray(),
                                 index = df_all.index,
                                 columns = tfidf_vector.get_feature_names())

In [156]:
articleBody_df_tfdf.head()

Unnamed: 0,aafia,aamaq,aamir,aan,aapl,aaron,aback,abadam,abadi,abagnale,...,zobl,zone,zoneclick,zones,zonethis,zoning,zoology,zubeyr,zuckerberg,zulu
0,0.0,0.0,0.0,0.0,0.0,0.073827,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.073827,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.073827,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.073827,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.073827,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [184]:
headline_matrix_sparse = tfidf_vector.fit_transform(df_all['Headline'])
headline_matrix_sparse

<49972x2736 sparse matrix of type '<class 'numpy.float64'>'
	with 388147 stored elements in Compressed Sparse Row format>

In [185]:
headline_df_tfdf = pd.DataFrame(headline_matrix_sparse.toarray(),
                                 index = df_all.index,
                                 columns = tfidf_vector.get_feature_names())

In [186]:
headline_df_tfdf.head()

Unnamed: 0,aaron,abandoned,abdel,abdi,abducted,abducting,abdul,abdullah,abort,aborted,...,younger,youngest,youtube,ypg,yum,zack,zehaf,zeppelin,zero,zhejiang
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
from sklearn.decomposition import TruncatedSVD

In [159]:
n_components = 100
SVD = TruncatedSVD(n_components)
component_names = ["component_"+str(i) for i in range(n_components)]

In [160]:
articleBody_svd_matrix = SVD.fit_transform(articleBody_df_tfdf)
articleBody_svd_matrix

array([[ 0.0799979 ,  0.18888767, -0.01010505, ...,  0.00212747,
        -0.00870781,  0.00280373],
       [ 0.0799979 ,  0.18888767, -0.01010505, ...,  0.00212747,
        -0.00870781,  0.00280373],
       [ 0.0799979 ,  0.18888767, -0.01010505, ...,  0.00212747,
        -0.00870781,  0.00280373],
       ...,
       [ 0.10207265, -0.00686561, -0.00822395, ..., -0.01550385,
        -0.0019225 ,  0.01680216],
       [ 0.15893976,  0.45728715, -0.05688312, ...,  0.01927547,
        -0.004254  ,  0.00634994],
       [ 0.16686514,  0.6082468 , -0.09006886, ...,  0.029364  ,
         0.03484511,  0.02267684]])

In [187]:
headline_svd_matrix = SVD.fit_transform(headline_df_tfdf)
headline_svd_matrix

array([[ 0.01147537,  0.009357  ,  0.00474595, ..., -0.04074215,
        -0.07793695, -0.01953358],
       [ 0.00723116,  0.05075075, -0.00227577, ..., -0.01519172,
         0.01178497, -0.00019412],
       [ 0.01427182,  0.00729172,  0.00614611, ..., -0.02155259,
        -0.050534  , -0.0085029 ],
       ...,
       [ 0.01233513,  0.00230643,  0.00525987, ...,  0.06313543,
        -0.00373034, -0.00339035],
       [ 0.0173522 ,  0.01500713,  0.00583575, ...,  0.0009198 ,
         0.02221016,  0.05111865],
       [ 0.00979537,  0.0103014 ,  0.00912616, ...,  0.01380942,
         0.01814374,  0.04582914]])

In [161]:
articleBody_svd_matrix.shape

(49972, 100)

In [162]:
search_term = "islam"

In [163]:
search_term_vec = tfidf_vector.transform([search_term])

In [164]:
search_term_svd = SVD.transform(search_term_vec)

In [165]:
search_term_svd

array([[ 2.42590976e-02, -4.03931420e-03, -5.11271351e-03,
         8.06738138e-05,  2.94027089e-02, -8.02830511e-03,
         9.74977211e-03, -7.39172684e-03, -2.46146238e-03,
        -1.02124662e-02, -1.00381510e-02, -9.17303445e-03,
        -1.05823183e-02,  7.33919926e-03,  3.37367135e-05,
        -8.75444925e-03, -3.40298571e-03, -8.76318011e-03,
        -9.99637735e-03,  3.86586203e-03,  1.29864184e-02,
         1.62948096e-02, -4.43517111e-03, -6.49385603e-03,
         2.21997016e-03,  1.89709042e-02, -5.12996915e-03,
        -5.95692411e-03,  2.58299917e-02, -1.03500478e-02,
         1.50887567e-02,  3.26561926e-03, -3.12458863e-03,
         1.04703677e-02,  2.05790973e-02, -2.51725926e-02,
        -1.80483327e-02, -9.67897117e-03,  1.20294567e-02,
         1.46139018e-03,  1.24639068e-02,  9.55068281e-03,
        -1.46204345e-02,  2.75723711e-03, -1.09328952e-02,
         1.36605932e-02, -7.71905925e-04, -1.44650170e-04,
        -9.14814477e-03, -1.79633811e-03,  2.82133472e-0

In [166]:
cosine_similarities = articleBody_svd_matrix.dot(search_term_svd.T).ravel()
#do for 1 row with another row --> feature 

In [167]:
cosine_similarities.shape

(49972,)

In [168]:
cosine_similarities

array([-0.00741365, -0.00741365, -0.00741365, ..., -0.00524145,
        0.00639547,  0.00159512])

In [169]:
cosine_similarities.argsort()[:-6:-1]

array([15632, 15608, 15621, 15620, 15619])

In [181]:
df_all.iloc[884]['articleBody']

'you want a gold apple watch you say then its going to cost you a lot the vanilla variant of apples newest wristworn wearable device only costs  however if youre willing to spend big cash on something that has the apple logo on it the company would be more than willing to accommodate according to analyst john gruber the gold version of the upcoming device may cost about as much as the monthly salary of a middleclass worker in the united states gruber predicts that the gold apple watch would cost  the figure is basically just a guess however its not an uneducated one most people think im joking when i say the gold ones are going to start at  i couldnt be more serious gruber wrote in a blog post the lowest conceivable price i could see for the edition models is   but the gold alone just as scrap metal might in fact be worth more than thatgruber claims that apple told him that the gold version of the apple watch is made of solid karat gold it is not goldplated this caused him to revise hi

In [172]:
from gensim import corpora, models
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [173]:
from nltk.corpus import stopwords
def split_into_words(documents):
    docs = []
    for doc in documents:
        text_processed = doc.split(' ')
        text_processed = list(filter(lambda a: a != '', text_processed))
        text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
        docs.append(text_processed)
    return docs

In [174]:
texts = split_into_words(df_all['articleBody'])

In [177]:
dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

In [178]:
ldamodel = models.ldamodel.LdaModel(corpus,
                                   id2word = dictionary,
                                   num_topics = 50,
                                   passes = 5,
                                   minimum_probability = 0.01)

In [179]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)