In [2]:
import numpy as np
import pandas as pd

import gensim.downloader as api
from gensim.models import LdaModel, LdaMulticore
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS, strip_punctuation, strip_short, stem_text, preprocess_string

In [3]:
info_datasets = api.info()
dataset_info = api.info("fake-news") 
dataset = api.load("fake-news")



In [5]:
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

list_of_text = []
for text in dataset:
    t = " ".join(w for w in nltk.wordpunct_tokenize(text['title']) \
         if w.lower() in words or not w.isalpha())
    if t.strip() == '':
        continue
    list_of_text.append(t)

[nltk_data] Downloading package words to
[nltk_data]     /Users/hasanenesguray/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [6]:
df = pd.DataFrame({'text':list_of_text}).reset_index()
df.head(2)

Unnamed: 0,index,text
0,0,BUSTED : They Stole Millions In ’ t
1,1,Re : Why Did Attorney General Lynch Plead The ...


In [7]:
def preprocess(text):
    
    # clean text based on given filters
    CUSTOM_FILTERS = [lambda x: x.lower(), 
                                remove_stopwords, 
                                strip_punctuation, 
                                strip_short]
    text = preprocess_string(text, CUSTOM_FILTERS) 
    return text

# apply function to all reviews 
df['clean_text'] = df['text'].apply(lambda x: preprocess(x))
df = df[df['clean_text'].apply(lambda x:len(x)) != 0]
df.head(2)

Unnamed: 0,index,text,clean_text
0,0,BUSTED : They Stole Millions In ’ t,"[busted, stole, millions]"
1,1,Re : Why Did Attorney General Lynch Plead The ...,"[attorney, general, lynch, plead, fifth]"


In [8]:
corpus = df['clean_text']
dictionary = Dictionary(corpus)

In [9]:
# Create a bag-of-words representation of the text data
#print(word for word in text if word not in STOPWORDS)
bow_corpus = [dictionary.doc2bow(text) for text in corpus]

In [10]:
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# find the coherence score with a different number of topics

num_of_topics = -1
max_coherence = -np.inf
for i in range(2,11):
    lsi = LsiModel(bow_corpus, num_topics=i, id2word=dictionary)
    
    coherence_model = CoherenceModel(model=lsi, texts=df['clean_text'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    if coherence_score > max_coherence:
        max_coherence = coherence_score
        num_of_topics = i
        
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

Coherence score with 2 clusters: 0.3394307202451518
Coherence score with 3 clusters: 0.37095705752747704
Coherence score with 4 clusters: 0.30514886763456617
Coherence score with 5 clusters: 0.3192822878254116
Coherence score with 6 clusters: 0.38925242341108657
Coherence score with 7 clusters: 0.3032831880957911
Coherence score with 8 clusters: 0.32978931458353294
Coherence score with 9 clusters: 0.4156364066494435
Coherence score with 10 clusters: 0.3884930051810219


In [11]:
print(num_of_topics)

9


In [12]:
# perform SVD on the bag of words with the LsiModel to extract 5 topics
lsi = LsiModel(bow_corpus, num_topics=num_of_topics, id2word=dictionary)

In [13]:
for topic_num, words in lsi.print_topics(num_words=num_of_topics):
    print('Words in {}: {}.'.format(topic_num, words))

Words in 0: 0.967*"trump" + 0.103*"video" + 0.091*"election" + 0.073*"new" + 0.062*"news" + 0.054*"president" + 0.043*"world" + 0.041*"russia" + 0.039*"breaking".
Words in 1: 0.614*"election" + 0.351*"new" + 0.350*"news" + 0.251*"russia" + 0.248*"war" + 0.211*"world" + 0.200*"2016" + -0.191*"trump" + 0.153*"video".
Words in 2: 0.702*"election" + -0.398*"new" + -0.324*"war" + -0.273*"russia" + -0.254*"world" + -0.220*"news" + 0.097*"day" + -0.078*"comment" + 0.043*"presidential".
Words in 3: -0.752*"new" + 0.402*"news" + 0.328*"russia" + 0.222*"war" + -0.134*"video" + -0.112*"comment" + 0.111*"world" + 0.105*"source" + 0.103*"onion".
Words in 4: 0.618*"news" + -0.563*"war" + -0.370*"world" + 0.196*"source" + 0.193*"onion" + 0.122*"new" + -0.119*"election" + 0.096*"breaking" + -0.083*"2016".
Words in 5: -0.944*"video" + 0.191*"new" + 0.157*"election" + 0.081*"trump" + -0.056*"2016" + 0.054*"news" + -0.053*"watch" + -0.046*"comment" + -0.044*"black".
Words in 6: 0.648*"russia" + -0.614*"c

In [14]:
corpus_lsi = lsi[bow_corpus]
from tqdm.notebook import tqdm
corpus_lsi = lsi[bow_corpus]
topic = []

for doc in tqdm(corpus_lsi):
    try:
        topic.append((np.array([(round(doc[i][1],2)) for i in range(num_of_topics)]).argmax()))
    except:
        topic.append(-1)

print(np.unique(np.array(topic)))
# create data frame that shows scores assigned for both topics for each review
df_topic = pd.DataFrame()
df_topic['Text'] = df['text']
df_topic['Topic']= topic
df_topic.head(5)

  0%|          | 0/11822 [00:00<?, ?it/s]

[-1  0  1  2  3  4  6  7  8]


Unnamed: 0,Text,Topic
0,BUSTED : They Stole Millions In ’ t,0
1,Re : Why Did Attorney General Lynch Plead The ...,1
2,BREAKING : With On Investigation,1
3,PIN DROP SPEECH BY FATHER OF DAUGHTER And By :...,0
4,FANTASTIC ! TRUMP ' S 7 POINT PLAN To Reform W...,0


In [15]:
# find a sample review from each topic
for i in range(-1, num_of_topics):
    df_topic0 = df_topic[df_topic['Topic'] == i]
    print(df_topic0.shape)
    if df_topic0.shape[0] == 0:
        continue
    print('Sample text from topic {}:\n {}'.format(i, df_topic0.sample(1)['Text'].values))

(68, 2)
Sample text from topic -1:
 [': 44 الـ24 -']
(4636, 2)
Sample text from topic 0:
 [': Manipulate']
(4063, 2)
Sample text from topic 1:
 ['The Media Got Into The ’ s Bogus Tale']
(331, 2)
Sample text from topic 2:
 ['Federal Will Be Watching You Vote on Election Day']
(80, 2)
Sample text from topic 3:
 ['are accused of breaking military oath - Russia News Now']
(387, 2)
Sample text from topic 4:
 ["Beauty Queen Told To Lose Weight Quits Pageant - The Onion - ' s News Source"]
(0, 2)
(270, 2)
Sample text from topic 6:
 ["' Lost ' in the as for Alliance with Russia and China -"]
(1642, 2)
Sample text from topic 7:
 ['“ I Can ’ t Believe I ’ m Doing this ’ — Man Himself Stealing a Cop Car on']
(345, 2)
Sample text from topic 8:
 ['It ’ s Not Over Yet : “ They May Be Trying To Steal And Michigan ” | For Electoral College To Ignore Will Of People']


In [16]:
# find the coherence score with a different number of topics

num_of_topics = -1
max_coherence = -np.inf
for i in range(2,11):
    lda_model = LdaMulticore(corpus=bow_corpus, id2word=dictionary, num_topics=i, passes=4)
    coherence_model = CoherenceModel(model=lda_model, texts=df['clean_text'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    if coherence_score > max_coherence:
        max_coherence = coherence_score
        num_of_topics = i
        
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

Coherence score with 2 clusters: 0.2573091821538854
Coherence score with 3 clusters: 0.23459051792096172
Coherence score with 4 clusters: 0.3132240726157674
Coherence score with 5 clusters: 0.3232233126732518
Coherence score with 6 clusters: 0.3086890497985302
Coherence score with 7 clusters: 0.29165481099933294
Coherence score with 8 clusters: 0.3329460794491356
Coherence score with 9 clusters: 0.33039455846244464
Coherence score with 10 clusters: 0.3301432552510535


In [17]:
print(num_of_topics)

8


In [18]:
lda_model = LdaMulticore(corpus=bow_corpus, id2word=dictionary, num_topics=num_of_topics, passes=4)

In [19]:
for topic_num, words in lda_model.print_topics(num_words=num_of_topics):
    print('Words in {}: {}.'.format(topic_num, words))

Words in 0: 0.018*"trump" + 0.016*"video" + 0.014*"world" + 0.013*"new" + 0.012*"war" + 0.011*"news" + 0.009*"investigation" + 0.007*"coming".
Words in 1: 0.035*"trump" + 0.018*"campaign" + 0.015*"new" + 0.011*"news" + 0.008*"breaking" + 0.007*"source" + 0.007*"onion" + 0.006*"world".
Words in 2: 0.070*"trump" + 0.030*"election" + 0.013*"2016" + 0.013*"president" + 0.007*"media" + 0.006*"state" + 0.006*"day" + 0.005*"win".
Words in 3: 0.027*"trump" + 0.016*"comment" + 0.012*"election" + 0.009*"black" + 0.007*"rock" + 0.006*"standing" + 0.006*"2016" + 0.006*"day".
Words in 4: 0.026*"trump" + 0.016*"war" + 0.010*"new" + 0.006*"white" + 0.006*"man" + 0.006*"news" + 0.006*"money" + 0.006*"truth".
Words in 5: 0.027*"trump" + 0.017*"election" + 0.015*"video" + 0.013*"vote" + 0.008*"voting" + 0.007*"government" + 0.007*"war" + 0.007*"news".
Words in 6: 0.017*"new" + 0.017*"comment" + 0.012*"news" + 0.010*"world" + 0.009*"people" + 0.008*"information" + 0.007*"war" + 0.006*"russia".
Words in 7

In [20]:
from tqdm.notebook import tqdm
corpus_lda = lda_model[bow_corpus]
topic = []

for doc in tqdm(corpus_lda):
    try:
        topic.append((np.array([(round(doc[i][1],2)) for i in range(num_of_topics)]).argmax()))
    except:
        topic.append(-1)

print(np.unique(np.array(topic)))
# create data frame that shows scores assigned for both topics for each review
df_topic = pd.DataFrame()
df_topic['Text'] = df['text']
df_topic['Topic']= topic
df_topic.head(5)

  0%|          | 0/11822 [00:00<?, ?it/s]

[-1  0  1  2  3  4  5  6  7]


Unnamed: 0,Text,Topic
0,BUSTED : They Stole Millions In ’ t,0
1,Re : Why Did Attorney General Lynch Plead The ...,1
2,BREAKING : With On Investigation,0
3,PIN DROP SPEECH BY FATHER OF DAUGHTER And By :...,3
4,FANTASTIC ! TRUMP ' S 7 POINT PLAN To Reform W...,2


In [21]:
# find a sample review from each topic
for i in range(-1, num_of_topics):
    df_topic0 = df_topic[df_topic['Topic'] == i]
    print(df_topic0.shape)
    if df_topic0.shape[0] == 0:
        continue
    print('Sample text from topic {}:\n {}'.format(i, df_topic0.sample(1)['Text'].values))

(93, 2)
Sample text from topic -1:
 ['. Willie : ‘ The Western central bank franchise system is totally broken , totally insolvent , and totally corrupt ’… : ‘ If debt to increase at twice the rate of , financial engineering can only smooth over for so long .’']
(1346, 2)
Sample text from topic 0:
 ['What If I Told You Is Great for Reversing ’ s ?']
(1458, 2)
Sample text from topic 1:
 ['had every right to take over US embassy | Today']
(1727, 2)
Sample text from topic 2:
 ['Review : Brit ’ s “ The ”']
(1505, 2)
Sample text from topic 3:
 ['Kim Could Pay As Much As $ 225 , 000 In Legal For Her Publicity Stunt']
(1469, 2)
Sample text from topic 4:
 ['The Brain From Traumatic And , Study']
(1360, 2)
Sample text from topic 5:
 ["Are White And Forcing Them Into Sex Slavery , ' s Charity"]
(1365, 2)
Sample text from topic 6:
 ['Re : Don ’ t CLICK that , stupid ! Is this from March 2016 where # PodestaEmails21 and ?']
(1499, 2)
Sample text from topic 7:
 ['Re : End Time Persecution Is Here :