# Fake News through Stance Detection

## Problem

Detecting whether an article is Fake News using Stance Detection. Stance detection takes a headline and body text of an article and classifies the stance of the body text relative to headline. 

In [20]:
import pandas as pd 

import os
import re
import nltk
from sklearn import feature_extraction
from tqdm import tqdm


import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns
from nltk.stem import WordNetLemmatizer

from gensim.models.word2vec import Word2Vec
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedShuffleSplit

In [3]:
bodies = pd.read_csv('./train_bodies.csv')
stances = pd.read_csv('./train_stances.csv')


## EDA

In [5]:
bodies.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [6]:
stances.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [7]:
stances[stances['Body ID'] == 712]

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1787,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss
3974,Mexico police find mass grave near site 43 stu...,712,unrelated
4936,Mexico Says Missing Students Not Found In Firs...,712,unrelated
5210,New iOS 8 bug can delete all of your iCloud do...,712,unrelated
5863,Return of the Mac: Seth Rogen in talks to star...,712,discuss
6199,Seth Rogen Is Woz,712,discuss
6756,Mexico finds 4 more graves at site of suspecte...,712,unrelated
7526,Are missing students in mass graves found near...,712,unrelated
9003,Mexico prosecutor: Students not in 1st mass gr...,712,unrelated


In [8]:
stances['Body ID'].value_counts()

1921    187
1948    175
40      172
524     171
1549    166
304     154
1385    151
125     145
2367    143
220     141
1438    141
195     140
2296    136
35      131
1786    131
1883    131
2520    127
1034    127
2252    126
1574    125
2307    125
527     125
2175    124
1627    123
2404    123
1289    122
2115    121
2096    120
1040    118
1893    117
       ... 
907       1
370       1
210       1
146       1
114       1
1542      1
63        1
76        1
390       1
515       1
193       1
464       1
355       1
323       1
624       1
282       1
18        1
797       1
701       1
362       1
2311      1
6         1
915       1
70        1
151       1
376       1
140       1
307       1
1066      1
59        1
Name: Body ID, Length: 1683, dtype: int64

In [9]:
stances['Stance'].unique()

array(['unrelated', 'agree', 'disagree', 'discuss'], dtype=object)

In [10]:
print('Bodies shape:', bodies.shape)
print('Stances shape:', stances.shape)


Bodies shape: (1683, 2)
Stances shape: (49972, 3)


In [11]:
stances['Stance'].value_counts()/stances.shape[0]

unrelated    0.731310
discuss      0.178280
agree        0.073601
disagree     0.016809
Name: Stance, dtype: float64

In [12]:
#why are stances rows much larger

In [13]:
bodies.isnull().sum()

Body ID        0
articleBody    0
dtype: int64

In [14]:
stances.isnull().sum()

Headline    0
Body ID     0
Stance      0
dtype: int64

In [15]:
bodies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1683 entries, 0 to 1682
Data columns (total 2 columns):
Body ID        1683 non-null int64
articleBody    1683 non-null object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


In [16]:
stances.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49972 entries, 0 to 49971
Data columns (total 3 columns):
Headline    49972 non-null object
Body ID     49972 non-null int64
Stance      49972 non-null object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [17]:
print("Total stances: " + str(len(stances.Stance)))
print("Total article bodies: " + str(len(bodies.articleBody)))

Total stances: 49972
Total article bodies: 1683


In [4]:
#merge on bodyID 

df_all = stances.merge(bodies, on = 'Body ID')
#df_all = df_all.drop('Stance', axis =1)
df_all.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss,Danny Boyle is directing the untitled film\n\n...
2,Mexico police find mass grave near site 43 stu...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
3,Mexico Says Missing Students Not Found In Firs...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
4,New iOS 8 bug can delete all of your iCloud do...,712,unrelated,Danny Boyle is directing the untitled film\n\n...


In [19]:
df_all.isnull().sum()

Headline       0
Body ID        0
Stance         0
articleBody    0
dtype: int64

In [20]:
df_all.shape

(49972, 4)

## Baseline

In [21]:
stances['Stance'].value_counts(normalize=True)

unrelated    0.731310
discuss      0.178280
agree        0.073601
disagree     0.016809
Name: Stance, dtype: float64

# NLP

## Count Vectorizer

In [22]:
#body
X_train = df_all[['articleBody']]
y_train = stances['Stance']
X_test = df_all[['articleBody']]
y_test = stances['Stance']

cvec_body = CountVectorizer(stop_words = 'english')

train_body_cvec = cvec_body.fit_transform(X_train['articleBody'])
test_body_cvec = cvec_body.transform(X_test['articleBody'])

lr = LogisticRegression()
lr.fit(train_body_cvec, y_train)
lr.score(test_body_cvec, y_test)

0.7320899703834147

In [23]:
#headline
X_train = df_all[['Headline']]
y_train = stances['Stance']
X_test = df_all[['Headline']]
y_test = stances['Stance']

cvec_headline= CountVectorizer(stop_words = 'english')

train_headline_cvec = cvec_headline.fit_transform(X_train['Headline'])
test_headline_cvec = cvec_headline.transform(X_test['Headline'])

lr = LogisticRegression()
lr.fit(train_headline_cvec, y_train)
lr.score(test_headline_cvec, y_test)

0.7313495557512207

In [24]:
#combine headline and body cvec and that will be x_train
#fit model on X_train 
#if model doesnt do well do truncated svd then cosine similarity between bodies and headline which will become a feature 


In [25]:
#do i still need these 
#df  = pd.DataFrame(X_test_count.todense(),
                   #columns=cvec.get_feature_names(), index = [bodies.articleBody]) 
#df.head()


In [26]:
#df.sort_index(axis = 0, ascending=False).head()

In [27]:
#df.sum().sort_values(ascending=False).head(10)

In [28]:
#df = df.reindex_axis(df.sum().sort_values(ascending=False).index, axis =1)
#df

# Hashing Vectorizer

In [29]:
#body
X_train = df_all[['articleBody']]
y_train = stances['Stance']
X_test = df_all[['articleBody']]
y_test = stances['Stance']

In [30]:
hvec_body = HashingVectorizer(stop_words = 'english')

train_body_hvec = hvec_body.fit_transform(X_train['articleBody'])
test_body_hvec = hvec_body.transform(X_test['articleBody'])

lr = LogisticRegression()
lr.fit(train_body_hvec, y_train)
lr.score(test_body_hvec, y_test)

0.7313095333386697

In [31]:
#headline
X_train = df_all[['Headline']]
y_train = stances['Stance']
X_test = df_all[['Headline']]
y_test = stances['Stance']

In [32]:
hvec_headline = HashingVectorizer(stop_words = 'english')

train_headline_hvec = hvec_headline.fit_transform(X_train['Headline'])
test_headline_hvec = hvec_headline.transform(X_test['Headline'])

lr = LogisticRegression()
lr.fit(train_headline_hvec, y_train)
lr.score(test_headline_hvec, y_test)

0.7313095333386697

In [33]:
#df_hv = pd.DataFrame(hvec.transform(X_test).todense(), index=[bodies.articleBody])  
#df_hv.head()

In [34]:
#df_hv.sum().sort_values(ascending=False).head(10)

In [35]:
#df_hv = df_hv.reindex_axis(df_hv.sum().sort_values(ascending=False).index, axis =1)
#df_hv

In [36]:
#what can i do with this 

# TF IDF 

In [37]:
#body
X_train = df_all[['articleBody']]
y_train = stances['Stance']
X_test = df_all[['articleBody']]
y_test = stances['Stance']

In [38]:
tvec_body = TfidfVectorizer(stop_words = 'english')

train_body_tvec = tvec_body.fit_transform(X_train['articleBody'])
test_body_tvec = tvec_body.transform(X_test['articleBody'])

lr = LogisticRegression()
lr.fit(train_body_tvec, y_train)
lr.score(test_body_tvec, y_test)

0.7313095333386697

In [39]:
#headline
X_train = df_all[['Headline']]
y_train = stances['Stance']
X_test = df_all[['Headline']]
y_test = stances['Stance']

In [40]:
tvec_headline = TfidfVectorizer(stop_words = 'english')

train_headline_tvec = tvec_headline.fit_transform(X_train['Headline'])
test_headline_tvec = tvec_headline.transform(X_test['Headline'])

lr = LogisticRegression()
lr.fit(train_headline_tvec, y_train)
lr.score(test_headline_tvec, y_test)

0.7313095333386697

In [41]:
#df_tfidf  = pd.DataFrame(tvec.transform(X_test).todense(),
                   #columns=tvec.get_feature_names(),
                   #index=[bodies.articleBody])
#df_tfidf.head()
#df.transpose().sort_values('bodies.articleBody', ascending=False).head(10).transpose()

In [42]:
#df_tfidf.sort_index(axis = 0, ascending=False).head()

In [43]:
#df_tfidf.sum().sort_values(ascending=False).head(10)

In [44]:
#df_tfidf = df_tfidf.reindex_axis(df_tfidf.sum().sort_values(ascending=False).index, axis =1)
#df_tfidf

## LDA

In [5]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
        1. Tokenizes and removes punctuation
        2. Removes stopwords
        3. Stems
        4. Returns a list of the cleaned text
    '''
    if pd.isnull(text):
        return []
    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed=tokenizer.tokenize(text)
    
    # removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # stemming
    #porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    
    text_processed = [wordnet_lemmatizer.lemmatize(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
    except: 
        pass

    return text_processed ## <-- we're keeping our words distinct

In [46]:
df_all['articleBody'] = df_all['articleBody'].apply(text_process)

In [47]:
df_all['articleBody'].head()

0    [danny, boyle, directing, untitled, film, seth...
1    [danny, boyle, directing, untitled, film, seth...
2    [danny, boyle, directing, untitled, film, seth...
3    [danny, boyle, directing, untitled, film, seth...
4    [danny, boyle, directing, untitled, film, seth...
Name: articleBody, dtype: object

In [48]:
df_all['Headline'] = df_all['Headline'].apply(text_process)

In [49]:
df_all['Headline'].head()

0    [police, find, mass, graf, least, 15, body, ne...
1           [seth, rogen, play, apple, steve, wozniak]
2    [mexico, police, find, mass, grave, near, site...
3    [mexico, say, missing, student, found, first, ...
4          [new, io, 8, bug, delete, icloud, document]
Name: Headline, dtype: object

## Fit LDA Model

In [50]:
from gensim import corpora, models
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

np.random.seed(42)

## LDA Article Body

In [51]:
dictionary = corpora.Dictionary(df_all['articleBody'])

corpus = [dictionary.doc2bow(text) for text in df_all['articleBody']]

ldamodel = models.ldamodel.LdaModel(corpus,                     # pass in our corpus
                                    id2word = dictionary,       # matches each word to its "number" or "spot" in the dictionary
                                    num_topics = 20,             # number of topics T to find
                                    passes = 5,                 # number of passes through corpus; similar to number of epochs
                                    minimum_probability = 0.01) # only include topics above this probability threshold

In [52]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

## LDA Headline

In [53]:
dictionary = corpora.Dictionary(list(df_all['Headline']))

corpus = [dictionary.doc2bow(text) for text in df_all['Headline']]

ldamodel = models.ldamodel.LdaModel(corpus,                     # pass in our corpus
                                    id2word = dictionary,       # matches each word to its "number" or "spot" in the dictionary
                                    num_topics = 20,             # number of topics T to find
                                    passes = 5,                 # number of passes through corpus; similar to number of epochs
                                    minimum_probability = 0.01) # only include topics above this probability threshold

In [54]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

### preprocessing for Latent Semantic Analysis

In [55]:
#need this cleaner to make vectorizers below work....idk why

In [56]:
import re

In [57]:
def cleaner(text):
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('\\ufeff', '', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]','',text)
    
    return text

  text = re.sub('[\d]','',text)


In [58]:
df_all['articleBody'] = df_all['articleBody'].map(str)
df_all['articleBody'] = df_all['articleBody'].apply(cleaner)

In [59]:
df_all['articleBody'].iloc[1]

'danny boyle directing untitled film seth rogen eyed play apple co founder steve wozniak sony steve job biopic danny boyle directing untitled film based walter isaacson book adapted aaron sorkin one anticipated biopics recent year negotiation yet begun even clear rogen official offer producer scott rudin guymon casady mark gordon set sight talent talk course may naught christian bale actor play job still midst closing deal source say dealmaking process sensitive stage insider say boyle flying los angeles meet actress play one female lead assistant job insider say jessica chastain one actress meeting list wozniak known woz co founded apple job ronald wayne first met job worked atari later responsible creating early apple computer'

In [60]:
df_all['Headline'] = df_all['Headline'].map(str)
df_all['Headline'] = df_all['Headline'].apply(cleaner)

In [61]:
df_all.head()
#why does this change my text processing back 

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,police find mass graf least body near mexico ...,712,unrelated,danny boyle directing untitled film seth rogen...
1,seth rogen play apple steve wozniak,712,discuss,danny boyle directing untitled film seth rogen...
2,mexico police find mass grave near site stude...,712,unrelated,danny boyle directing untitled film seth rogen...
3,mexico say missing student found first mass graf,712,unrelated,danny boyle directing untitled film seth rogen...
4,new io bug delete icloud document,712,unrelated,danny boyle directing untitled film seth rogen...


In [62]:
df_all.to_csv('./preprocessed.csv')

In [21]:
df_all = pd.read_csv('./preprocessed.csv')

In [22]:
df_all.head()

Unnamed: 0.1,Unnamed: 0,Headline,Body ID,Stance,articleBody
0,0,police find mass graf least body near mexico ...,712,unrelated,danny boyle directing untitled film seth rogen...
1,1,seth rogen play apple steve wozniak,712,discuss,danny boyle directing untitled film seth rogen...
2,2,mexico police find mass grave near site stude...,712,unrelated,danny boyle directing untitled film seth rogen...
3,3,mexico say missing student found first mass graf,712,unrelated,danny boyle directing untitled film seth rogen...
4,4,new io bug delete icloud document,712,unrelated,danny boyle directing untitled film seth rogen...


### tfidf vectorizer

In [23]:
tfidf_vector = TfidfVectorizer(min_df = 5, stop_words='english')

articleBody_matrix_sparse = tfidf_vector.fit_transform(df_all['articleBody'])
articleBody_matrix_sparse

<49972x18408 sparse matrix of type '<class 'numpy.float64'>'
	with 6480420 stored elements in Compressed Sparse Row format>

In [24]:
articleBody_df_tfdf = pd.DataFrame(articleBody_matrix_sparse.toarray(),
                                 index = df_all.index,
                                 columns = tfidf_vector.get_feature_names())

In [65]:
articleBody_df_tfdf.head()

Unnamed: 0,aafia,aamaq,aamir,aan,aapl,aaron,aback,abadam,abadi,abagnale,...,zobl,zone,zoning,zoology,zor,zubayr,zubeyr,zuckerberg,zulu,zxycaeuwb
0,0.0,0.0,0.0,0.0,0.0,0.07737,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.07737,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.07737,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.07737,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.07737,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
headline_matrix_sparse = tfidf_vector.fit_transform(df_all['Headline'])
headline_matrix_sparse

<49972x2736 sparse matrix of type '<class 'numpy.float64'>'
	with 388147 stored elements in Compressed Sparse Row format>

In [26]:
headline_df_tfdf = pd.DataFrame(headline_matrix_sparse.toarray(),
                                 index = df_all.index,
                                 columns = tfidf_vector.get_feature_names())

In [68]:
headline_df_tfdf.head()

Unnamed: 0,aaron,abandoned,abdel,abdi,abducted,abducting,abdul,abdullah,abort,aborted,...,younger,youngest,youtube,ypg,yum,zack,zehaf,zeppelin,zero,zhejiang
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Truncated SVD

In [27]:
from sklearn.decomposition import TruncatedSVD

In [28]:
n_components = 100
SVD = TruncatedSVD(n_components)
component_names = ["component_"+str(i) for i in range(n_components)]

In [71]:
#lsa = TruncatedSVD(n_components=100, n_iter=100)
#lsa.fit(articleBody_df_tfdf)

In [72]:
#lsa.components_[0]

In [73]:
#terms = vectorizer.get_feature_names()
#for i, comp in enumerate(lsa.components_):
    #termsInComp = zip (terms, comp)
    #sortedTerms = sorted(termsInComp, key=lambda X: x[1], reverse = True)[:10]
    #print "Word %d:" % i
    #for term in sortedTerms:
        #print term[0]
        #print " "

In [29]:
articleBody_svd_matrix = SVD.fit_transform(articleBody_df_tfdf)
articleBody_svd_matrix

array([[ 0.09533844,  0.20917701, -0.02623359, ...,  0.00920228,
        -0.01856594, -0.00950194],
       [ 0.09533844,  0.20917701, -0.02623359, ...,  0.00920228,
        -0.01856594, -0.00950194],
       [ 0.09533844,  0.20917701, -0.02623359, ...,  0.00920228,
        -0.01856594, -0.00950194],
       ...,
       [ 0.10926746, -0.00405863, -0.01237419, ..., -0.00385819,
         0.001803  , -0.0149248 ],
       [ 0.17433644,  0.51243698, -0.07441508, ..., -0.00582191,
         0.00681073,  0.05076078],
       [ 0.16292307,  0.66241454, -0.12880881, ..., -0.02244739,
         0.00879433,  0.06297349]])

In [30]:
headline_svd_matrix = SVD.fit_transform(headline_df_tfdf)
headline_svd_matrix

array([[ 0.01147479,  0.00935641,  0.00474512, ...,  0.02327761,
        -0.04932604,  0.03435446],
       [ 0.00723131,  0.05075058, -0.00227545, ...,  0.04016022,
         0.01293309,  0.06570908],
       [ 0.01427152,  0.0072916 ,  0.00614599, ...,  0.01756991,
        -0.02915877,  0.00694797],
       ...,
       [ 0.01233547,  0.00230589,  0.00526141, ...,  0.00475152,
        -0.05665834, -0.02388791],
       [ 0.01735225,  0.01500701,  0.00583595, ...,  0.01591702,
        -0.02312003,  0.00565127],
       [ 0.00979544,  0.01030163,  0.00912464, ...,  0.03620044,
        -0.00023186, -0.01525898]])

In [76]:
articleBody_svd_matrix.shape

(49972, 100)

In [77]:
#search_term = "islam"

#use headline to compare to the boody 
#make a list forbody and article and loop thru each row 

In [78]:
#search_term_vec = tfidf_vector.transform([search_term])

In [79]:
#search_term_svd = SVD.transform(search_term_vec)

In [80]:
#search_term_svd

## Cosine Similarity

In [81]:
#cosine_similarities = articleBody_svd_matrix.dot(search_term_svd.T).ravel()
#do for 1 row with another row --> feature 

In [82]:
#cosine_similarities.shape

In [83]:
#cosine_similarities

In [84]:
#cosine_similarities.argsort()[:-6:-1]

In [85]:
#cosine_similarities_headline = headline_svd_matrix.dot(search_term_svd.T).ravel()

In [86]:
from gensim import corpora, models
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [87]:
#from nltk.corpus import stopwords
#def split_into_words(documents):
    #docs = []
    #for doc in documents:
        #text_processed = doc.split(' ')
        #text_processed = list(filter(lambda a: a != '', text_processed))
        #text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
        #docs.append(text_processed)
    #return docs

In [88]:
#texts = split_into_words(df_all['articleBody'])

## Cosine Similarity

In [31]:
import scipy.spatial as sp

In [32]:
#distances = 1 - sp.distance.cdist(articleBody_svd_matrix, headline_svd_matrix, 'cosine')

In [33]:
distances = []
for i, body in enumerate(articleBody_svd_matrix): 
    distances.append(sp.distance.cdist([body], [headline_svd_matrix[i]], 'cosine')[0][0])

In [45]:
df_all['distances'] = pd.Series(distances)

In [117]:
df_all.shape

(49972, 6)

### Eliminating Nulls

In [None]:
#df_fix = pd.DataFrame(df_all['distances'], df_all['Body ID'], df_all['Stance'])

In [95]:
df_fix.T.head()

Body ID,712,712,712,712,712,712,712,712,712,712,...,797,74,135,175,553,464,362,915,407,1066
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
unrelated,,,,,,,,,,,...,,,,,,,,,,
discuss,,,,,,,,,,,...,,,,,,,,,,
unrelated,,,,,,,,,,,...,,,,,,,,,,
unrelated,,,,,,,,,,,...,,,,,,,,,,
unrelated,,,,,,,,,,,...,,,,,,,,,,


In [96]:
#2 null values 
df_all.shape

(49972, 5)

In [35]:
df_all = df_all.dropna(axis=0)

#why does this change my df to not a df 

In [98]:
df_all.shape

(49970, 5)

In [36]:
df_all = stances.merge(bodies, on = 'Body ID')

In [46]:
df_all.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,stance_dummy,distances
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3,0.848807
1,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss,Danny Boyle is directing the untitled film\n\n...,2,0.954702
2,Mexico police find mass grave near site 43 stu...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3,0.841187
3,Mexico Says Missing Students Not Found In Firs...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3,0.768969
4,New iOS 8 bug can delete all of your iCloud do...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3,1.033323


In [101]:
df_fix.head()

Stance,unrelated,discuss,unrelated,unrelated,unrelated,discuss,discuss,unrelated,unrelated,unrelated,...,agree,discuss,agree,discuss,agree,agree,agree,agree,discuss,discuss
Body ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
712,,,,,,,,,,,...,,,,,,,,,,
712,,,,,,,,,,,...,,,,,,,,,,
712,,,,,,,,,,,...,,,,,,,,,,
712,,,,,,,,,,,...,,,,,,,,,,
712,,,,,,,,,,,...,,,,,,,,,,


  _marks = attr.ib(convert=list)


KeyError: 'distances'

In [103]:
df_fix = df_fix.dropna(axis=0)

In [104]:
df_fix.shape

(0, 49972)

In [105]:
df_all[df_all['distances'].isnull()==True]

KeyError: 'distances'

### Adding labels to stances

In [37]:
from sklearn import preprocessing

In [38]:
le = preprocessing.LabelEncoder()
le.fit(df_all['Stance'])
print("Classes:", list(le.classes_))
stance_label = le.transform(df_all['Stance']) 


Classes: ['agree', 'disagree', 'discuss', 'unrelated']


In [39]:
stance_label

array([3, 2, 3, ..., 0, 2, 2])

In [40]:
df_all['stance_dummy'] = pd.Series(stance_label)

In [41]:
df_all.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,stance_dummy
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3
1,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss,Danny Boyle is directing the untitled film\n\n...,2
2,Mexico police find mass grave near site 43 stu...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3
3,Mexico Says Missing Students Not Found In Firs...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3
4,New iOS 8 bug can delete all of your iCloud do...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,3


In [42]:
stance_dummy = pd.get_dummies(df_all['Stance'], drop_first=False)
stance_dummy.head()

Unnamed: 0,agree,disagree,discuss,unrelated
0,0,0,0,1
1,0,0,1,0
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [48]:
df_fix = pd.DataFrame([df_all['distances'], df_all['stance_dummy']]).T
df_fix = df_fix.dropna(axis=0)
df_fix.head()

Unnamed: 0,distances,stance_dummy
0,0.848807,3.0
1,0.954702,2.0
2,0.841187,3.0
3,0.768969,3.0
4,1.033323,3.0


In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X = df_fix['distances'].reshape(df_fix['distances'].shape[0], 1)
y = df_fix['stance_dummy']

X_train, X_test, y_train, y_test = train_test_split(X, y)

  if __name__ == '__main__':


In [51]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((37477, 1), (12493, 1), (37477,), (12493,))

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

In [None]:
lr.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

In [None]:
rf.score(X_test, y_test)

In [None]:
np.unique(y_train)

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
y_probas = lr.predict_proba(X_train)
y_true = y_train

import scikitplot as skplt
import matplotlib.pyplot as plt

skplt.metrics.plot_roc(y_true, y_probas, figsize=(7,7))
plt.show()

In [None]:
y_true.unique(), y_true.shape

In [None]:
np.unique(y_probas), y_probas.shape

In [None]:
y_probas

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X, y)
from collections import Counter
print(sorted(Counter(y_resampled).items()))

In [None]:
lr = LogisticRegression()
lr.fit(X_resampled, y_resampled)
lr.score(X_resampled, y_resampled)

# Model with cvec body, cvec head, and cosine similarity

In [None]:
cvec_vector = CountVectorizer(stop_words='english')

cvec_articleBody_matrix_sparse = cvec_vector.fit_transform(df_all['articleBody'])
cvec_articleBody_matrix_sparse

In [None]:
cvec_body_array = cvec_articleBody_matrix_sparse.toarray()

In [None]:
df_cvec = pd.DataFrame(cvec_body_array)

In [None]:
df_cvec.head()

In [None]:
cvec_vector = CountVectorizer(stop_words='english')

cvec_headline_matrix_sparse = cvec_vector.fit_transform(df_all['Headline'])

cvec_headline_matrix_sparse

In [None]:
cvec_vector.get_feature_names

In [None]:
cvec_headline_df = pd.DataFrame(cvec_headline_matrix_sparse.todense(),
                       columns = cvec_vector.get_feature_names())


In [None]:
cvec_headline_df.shape

In [None]:
df_fix.shape

In [None]:
df_headline_fix = pd.concat([df_fix, cvec_headline_df], axis=1)

In [None]:
df_headline_fix.fillna(value=0, inplace=True)

In [None]:
features = [col for col in df_headline_fix.columns if col != 'stance_dummy']

In [None]:
X = df_headline_fix[features]
y = df_headline_fix['stance_dummy']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

In [None]:
lr.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

In [None]:
rf.score(X_test, y_test)

## GloVe


In [69]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

max_sequence_length = 50   # max number of words to consider in a review (i.e. the first 50...)
max_num_words = 4000       # maximum number of words to include in the vocabulary (i.e. top 10000 only)

# Instantiate the Tokenizer
tokenizer = Tokenizer(num_words=max_num_words)

# Fit the tokenizer, i.e. learn the vocab and id the most frequently occuring words
tokenizer.fit_on_texts(df_all['Headline'].values)

# Turn our texts to sequences of word indices
sequences = tokenizer.texts_to_sequences(df_all['Headline'].values)

# Save the look-up dictionary for words to indices (will need this later)
word_index = tokenizer.word_index

# Pad out our sequences by prepending zeros to all of our text sequences
padded_sequences_headline = pad_sequences(sequences, maxlen=max_sequence_length)

print(f'Found {len(word_index)} unique tokens/words')
print(f'The maximum word index is {padded_sequences.max()}')

Found 3879 unique tokens/words
The maximum word index is 24999


In [70]:
padded_sequences_headline.shape

(49972, 50)

In [71]:
padded_sequences_headline[0,:]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,  248,  540,
        133,  361,   14,   12,  173, 2481, 2482,  206,  118, 1223, 1341,
        674,  143, 2483,   13,  248, 2484], dtype=int32)

In [79]:
max_sequence_length = 200   # max number of words to consider in a review (i.e. the first 50...)
max_num_words = 4000       # maximum number of words to include in the vocabulary (i.e. top 10000 only)

# Instantiate the Tokenizer
tokenizer = Tokenizer(num_words=max_num_words)

# Fit the tokenizer, i.e. learn the vocab and id the most frequently occuring words
tokenizer.fit_on_texts(df_all['articleBody'].values)

# Turn our texts to sequences of word indices
sequences = tokenizer.texts_to_sequences(df_all['articleBody'].values)

# Save the look-up dictionary for words to indices (will need this later)
word_index = tokenizer.word_index

# Pad out our sequences by prepending zeros to all of our text sequences
padded_sequences_articleBody = pad_sequences(sequences, maxlen=max_sequence_length)

print(f'Found {len(word_index)} unique tokens/words')
print(f'The maximum word index is {padded_sequences.max()}')

Found 27427 unique tokens/words
The maximum word index is 24999


In [80]:
padded_sequences_articleBody.shape

(49972, 200)

In [81]:
padded_sequences_articleBody[0,:]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
       1961, 1800,    8,    1,  473, 2509, 1702,    8,   83,    2,  688,
         45,  528, 1102,  524, 2033,    5,  524,  381, 1542, 1961, 1800,
          8,    1,  473,  291,   10, 2300, 3475,  912,    6,   19, 1626,
        935,   42,    8,   40,    4,    1,  169, 2848,    5,  312,  133,
       1155,   17,   25,  317, 3047,    6,  186,   25,  115,  521,   64,
       1702,   20,   24,  145,  988,   29,    1, 3300,   87, 3533,    6,
        565, 2432,   87,   17,  314,   50,   10,    1,    6,   26,    5,
        729,    4,  996,   30,  110,   57,   23,   11,   18,  868,  564,
          1,  677,   32,    8,    2,  688,  381,    8,  142,    5,    1,
          4,   16,  645,  246,  132,    7, 1624,    8,    5,    3, 1772,
        132, 1800,   38,    8, 1287,    2, 2897, 30

In [82]:
padded_sequences = np.concatenate((padded_sequences_headline, padded_sequences_articleBody), axis=1)
padded_sequences.shape

(49972, 250)

In [83]:
df_all['stance_dummy'].shape

(49972,)

In [84]:
# Train, test split, baby...
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df_all['stance_dummy'], test_size=0.33, random_state=42)

X_train.shape, y_train.shape

((33481, 250), (33481,))

In [85]:
import numpy as np

glove_dir = './glove.6B'

embeddings_index = {}

with open(os.path.join(glove_dir, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print(f'Found {len(embeddings_index)} word vectors')

Found 400000 word vectors


In [86]:
embedding_dim = 100

embedding_matrix = np.zeros((max_num_words, embedding_dim))
for word, index in word_index.items():
    if index < max_num_words:
        # Using "get" with the dict is a safe way to avoid missing key errors
        embedding_vector = embeddings_index.get(word) 
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [87]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(max_num_words, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(20))
model.add(Dense(1, activation='sigmoid'))

In [88]:
model.layers[0].set_weights([embedding_matrix])

In [89]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 100)          400000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 20)                9680      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 21        
Total params: 409,701
Trainable params: 409,701
Non-trainable params: 0
_________________________________________________________________


In [94]:
history = model.fit(X_train, y_train, epochs=5, batch_size=300, validation_data=(X_test, y_test))

ValueError: Error when checking input: expected embedding_3_input to have shape (200,) but got array with shape (250,)

In [None]:
# What's baseline?# What's 
np.unique(y_train, return_counts=True)[1]/len(y_train)

In [None]:
X_test[0:1,:].shape

In [None]:
model.predict(X_test[0:1,:])

### body

In [None]:
from gensim.models.word2vec import Word2Vec
model = Word2Vec(df_all['articleBody'], size=100, window=5, min_count=1, workers=4)
body_w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

In [None]:
df_body_w2v = pd.DataFrame(body_w2v)
df_body_w2v.head()

## Model with word2vec articleBody + headline 

In [None]:
#from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedShuffleSplit

# TRAIN_SET_PATH = "20ng-no-stop.txt"
# TRAIN_SET_PATH = "r52-all-terms.txt"
TRAIN_SET_PATH = "r8-no-stop.txt"

GLOVE_6B_50D_PATH = "glove.6B.50d.txt"
GLOVE_840B_300D_PATH = "glove.840B.300d.txt"
encoding="utf-8"

In [None]:
ls = []
sentences = df_glove.split(".")
for i in sentences:
    ls.append(i.split())
model = Word2Vec(df_glove, min_count=1, size = 4)
words = list(model.wv.vocab)
print(words)
vectors = []
for word in words:
    vectors.append(model[word].tolist())
data = np.array(vectors)
data

In [None]:
df_w2v_fix = pd.concat([df_body_w2v, df_w2v], axis=1)
df_w2v_fix.head()

In [None]:
df_w2v_fix.shape

In [None]:
df_fix['Unnamed 0'].shape

In [None]:
X = df_w2v_fix
y = df_fix['Unnamed 0'] #need to create binary for classes #predict the prob of each class and highest use as predictions

X_train, X_test, y_train, y_test = train_test_split(X, y)