In [6]:
# LDA for topic modeling
# https://stackabuse.com/python-for-nlp-topic-modeling
import pandas as pd
import numpy as np
import All_Functions as af

# Load the reviews and keep only the first 20,000 rows to keep the run small.
reviews_datasets = pd.read_csv('Reviews.csv').head(20000)
# DataFrame.dropna() returns a new frame and leaves the caller untouched, so the
# result has to be assigned for the call to have any effect (the bare call was a
# no-op). Only rows lacking the 'Text' column are dropped, because 'Text' is the
# only column fed to the vectoriser; rows missing other fields stay usable.
reviews_datasets = reviews_datasets.dropna(subset=['Text'])
reviews_datasets.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
# My test
# Raw strings keep the Windows-style backslashes literal: '\V' is not a valid
# escape sequence and triggers a warning on recent Python versions. The path
# values passed to the helper are unchanged.
# NOTE(review): combine_months_to_df lives in All_Functions (not visible here);
# presumably it loads the three monthly CSVs into one DataFrame — confirm.
vegas_df = af.combine_months_to_df(
    r'newspaper-text\Vegas_Text-first-month.csv',
    r'newspaper-text\Vegas_Text-second-month.csv',
    r'newspaper-text\Vegas_Text-third-month.csv',
)

vegas_df.head()

Unnamed: 0.1,Unnamed: 0,url,text,sentiment,publish_date,title,themes,media_id,media_url
0,0,https://www.realclearpolitics.com/articles/201...,las vegas ap rapid-fire popping sounded like f...,"{'neg': 0.212, 'neu': 0.727, 'pos': 0.061, 'co...",2017-10-01 20:00:00,At Least 50 Killed as Gunman Opens Fire at Las...,,1040,http://realclearpolitics.com/
1,1,http://www.marketwatch.com/news/story.asp?guid...,shares gun makers rallied monday wake describe...,"{'neg': 0.117, 'neu': 0.749, 'pos': 0.133, 'co...",2017-10-02 08:35:09,Gun-maker stocks surge after mass shooting in ...,,1150,https://www.wsj.com/
2,2,http://feedproxy.google.com/~r/newsy-allvideos...,least 58 people dead 500 injured gunman opened...,"{'neg': 0.284, 'neu': 0.659, 'pos': 0.057, 'co...",2017-10-02 07:47:00,"At Least 50 Dead, 400 Injured After Las Vegas ...",,85364,http://www.newsy.com/#spider
3,3,http://feedproxy.google.com/~r/time/topstories...,information spreads quickly mass shootings peo...,"{'neg': 0.255, 'neu': 0.687, 'pos': 0.059, 'co...",2017-10-02 10:51:55,Beware of These Hoaxes Being Spread About the ...,,40362,http://www.time.com/time/
4,4,http://www.eastbaytimes.com/2017/10/02/watch-l...,trending president donald trump spoke morning ...,"{'neg': 0.154, 'neu': 0.789, 'pos': 0.057, 'co...",2017-10-02 10:35:10,Watch: President Trump speaks about Las Vegas ...,,27512,http://www.ibabuzz.com/insider/


In [8]:
# create a vocabulary of all the words in our data 
from sklearn.feature_extraction.text import CountVectorizer
# Build the document-term matrix with CountVectorizer:
#   max_df=0.8           -> ignore words that occur in more than 80% of the documents
#   min_df=2             -> ignore words that occur in fewer than 2 documents
#   stop_words='english' -> drop the built-in English stop words
count_vect = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.8,
)
# astype('U') coerces every cell of the Text column to a unicode string before
# it is handed to the vectoriser.
doc_term_matrix = count_vect.fit_transform(
    reviews_datasets['Text'].values.astype('U')
)

In [10]:
# My test
# Vectorise the Vegas article text with the same document-frequency bounds
# (max_df=0.8, min_df=2); no stop-word list is passed for this corpus.
vegas_count_vect = CountVectorizer(
    min_df=2,
    max_df=0.8,
)
vegas_doc_term_matrix = vegas_count_vect.fit_transform(
    vegas_df['text'].values.astype('U')
)

In [5]:
# Each of the 20000 documents is represented as a vector with 14546 dimensions,
# i.e. our vocabulary has 14546 words (see the matrix shape displayed below).
# This note is a comment rather than a trailing string literal so that the
# matrix itself - not the note - is the value the cell displays.
doc_term_matrix

<20000x14546 sparse matrix of type '<class 'numpy.int64'>'
	with 594703 stored elements in Compressed Sparse Row format>

In [11]:
# My test
# Display the sparse document-term matrix produced by vegas_count_vect
# (one row per article, one column per vocabulary term).
vegas_doc_term_matrix

<1820x22025 sparse matrix of type '<class 'numpy.int64'>'
	with 563274 stored elements in Compressed Sparse Row format>

In [12]:
# use LDA to create topics along the probability distribution for each word in our vocab for each topic
from sklearn.decomposition import LatentDirichletAllocation
# n_components -> the number of topics we want our text to be divided into
# random_state -> the seed, so the results can be replicated
LDA = LatentDirichletAllocation(random_state=42, n_components=5)

# Fit on the reviews document-term matrix; the fitted estimator is the cell output.
LDA.fit(X=doc_term_matrix)

LatentDirichletAllocation(n_components=5, random_state=42)

In [18]:
# My test
# Five-topic LDA over the Vegas articles, seeded for repeatability.
vegas_LDA = LatentDirichletAllocation(random_state=42, n_components=5)
vegas_LDA.fit(X=vegas_doc_term_matrix)

LatentDirichletAllocation(n_components=5, random_state=42)

In [19]:
# randomly fetch 10 words from the vocab
import random

# Fetch the vocabulary once instead of rebuilding it twice per iteration.
# NOTE: get_feature_names() is kept for compatibility with the scikit-learn
# version this notebook was run on; newer releases expose
# get_feature_names_out() instead.
vocabulary = count_vect.get_feature_names()

for _ in range(10):
    # random.choice always picks a valid position. The previous
    # random.randint(0, len(vocabulary)) includes its upper bound, so it could
    # produce an index one past the end and raise IndexError.
    print(random.choice(vocabulary))

rivalries
charity
singapore
consist
stein
worsham
romanucci
riskier
publicists
rough


In [20]:
# Find the 10 highest-weighted words of the first topic.
# Row 0 of components_ holds one weight per vocabulary word for topic #0
# (the larger the weight, the more strongly the word belongs to the topic).
first_topic = LDA.components_[0]

# argsort orders positions by ascending weight, so the last ten entries are
# the vocabulary indexes of the ten largest weights.
top_topic_words = np.argsort(first_topic)[-10:]
top_topic_words

array([14106,  5892,  7088,  4290, 12596,  5771,  5187, 12888,  7498,
       12921], dtype=int64)

In [21]:
# Use the indexes to retrieve the words from the count_vect object.
# The vocabulary list is looked up once, outside the loop, rather than being
# rebuilt by get_feature_names() for every printed word.
feature_names = count_vect.get_feature_names()
for word_id in top_topic_words:
    print(feature_names[word_id])

parity
dido
espn
compares
mingled
despite
current
motorist
fall
movies


In [22]:
# Print the ten highest-weighted words of every topic in the reviews model.
# The vocabulary is fetched once (the original called get_feature_names() for
# each printed word), and the topic index and word index now have distinct
# names instead of both being called `i`.
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(LDA.components_):
    # argsort is ascending, so the last ten positions are the top ten words.
    top_word_ids = topic.argsort()[-10:]
    print(f'Top 10 words for topic #{topic_idx}:')
    print([feature_names[word_id] for word_id in top_word_ids])
    print('\n')

Top 10 words for topic #0:
['parity', 'dido', 'espn', 'compares', 'mingled', 'despite', 'current', 'motorist', 'fall', 'movies']


Top 10 words for topic #1:
['atlantis', 'bitcoin', 'ferocious', 'current', 'blackhawks', 'espn', 'dido', 'motorist', 'despite', 'fall']


Top 10 words for topic #2:
['espn', 'compares', 'gulflive', 'mingled', 'maintains', 'parity', 'fall', 'erstwhile', 'increasing', 'atlantis']


Top 10 words for topic #3:
['derpity', 'concocted', 'decades', 'increasing', 'fall', 'colossally', 'north', 'color', 'atlantis', 'dao']


Top 10 words for topic #4:
['casualties', 'inaccurate', 'dido', 'fall', 'accord', 'despite', 'atlantis', 'increasing', 'castillo', 'bouts']




In [23]:
# My test
# Print the ten highest-weighted words of every topic in the Vegas model.
# The vocabulary is fetched once (the original called get_feature_names() for
# each printed word), and the topic index and word index now have distinct
# names instead of both being called `i`.
vegas_feature_names = vegas_count_vect.get_feature_names()
for topic_idx, topic in enumerate(vegas_LDA.components_):
    # argsort is ascending, so the last ten positions are the top ten words.
    top_word_ids = topic.argsort()[-10:]
    print(f'Top 10 words for topic #{topic_idx}:')
    print([vegas_feature_names[word_id] for word_id in top_word_ids])
    print('\n')

Top 10 words for topic #0:
['team', 'night', 'season', 'could', 'use', 'vegas', 'new', 'game', 'said', 'first']


Top 10 words for topic #1:
['216', 'trump', 'mobile', 'arena', '27', 'new', 'round', 'first', 'fight', 'ufc']


Top 10 words for topic #2:
['sunday', 'photo', 'oct', 'people', 'paddock', 'music', '2017', 'festival', 'las', 'vegas']


Top 10 words for topic #3:
['also', 'news', 'time', 'like', 'one', 'would', 'trump', 'people', 'said', 'gun']


Top 10 words for topic #4:
['killed', 'kelley', 'sunday', 'first', 'sutherland', 'springs', 'people', 'texas', 'church', 'said']




In [24]:
# Score every document against every topic. The result has one row per
# document and five columns, where each column corresponds to the probability
# value of a particular topic.
topic_values = LDA.transform(X=doc_term_matrix)
topic_values.shape

(20000, 5)

In [25]:
# Label each review with the column index of its largest topic value,
# then preview the first five rows with the new 'Topic' column.
reviews_datasets['Topic'] = np.argmax(topic_values, axis=1)
reviews_datasets.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Topic
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,3
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,1
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,0
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,1


Following this guide: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24


In [1]:
import pandas as pd
import All_Functions as af


In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable between runs.
np.random.seed(2018)
import nltk
# Download the WordNet corpus into the local nltk_data directory.
# NOTE(review): presumably required by WordNetLemmatizer imported above -
# confirm it is actually used further down.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khahn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [18]:
# Raw strings keep the Windows-style backslashes literal: '\V' is not a valid
# escape sequence and triggers a warning on recent Python versions. The path
# values passed to the helper are unchanged.
# NOTE(review): combine_months_to_df lives in All_Functions (not visible here);
# presumably it loads the three monthly CSVs into one DataFrame — confirm.
vegas_df = af.combine_months_to_df(
    r'newspaper-text\Vegas_Text-first-month.csv',
    r'newspaper-text\Vegas_Text-second-month.csv',
    r'newspaper-text\Vegas_Text-third-month.csv',
)

In [19]:
def split_string(text):
    """Tokenise ``text`` on whitespace.

    Args:
        text: The document to tokenise. Expected to be a ``str``; any other
            value (for example ``None`` or the float NaN that pandas uses for
            a missing cell) is treated as an empty document.

    Returns:
        list[str]: The whitespace-separated tokens in their original order.
        Runs of spaces, tabs and newlines count as a single separator, so the
        result never contains empty-string tokens.
    """
    # Missing cells arrive as non-string values, which have no .split().
    if not isinstance(text, str):
        return []
    # split() with no argument collapses repeated whitespace; split(" ") would
    # emit '' for every extra space and for an empty document.
    return text.split()

In [20]:
# Tokenise every article: each entry of the resulting Series is the list
# returned by split_string for that row's text.
processed_text = vegas_df['text'].apply(split_string)

In [25]:
# Build the token <-> integer-id mapping from the tokenised articles.
dictionary = gensim.corpora.Dictionary(documents=processed_text)

# (Debug aid, disabled: loop over dictionary.iteritems() and print the first
# ~100 id/token pairs to eyeball the mapping.)

# Prune the vocabulary in place:
#   no_below=15   -> drop tokens found in fewer than 15 documents
#   no_above=0.5  -> drop tokens found in more than 50% of the documents
#   keep_n=100000 -> after that, keep at most the 100000 most frequent tokens
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)

In [30]:
# Convert every tokenised article to its bag-of-words form via dictionary.doc2bow.
bow_corpus = list(map(dictionary.doc2bow, processed_text))

In [32]:
# bow_doc_1800 = bow_corpus[1800]
# for i in range(len(bow_doc_1800)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_1800[i][0], 
#                                                dictionary[bow_doc_1800[i][0]], 
# bow_doc_1800[i][1]))

In [34]:
from gensim import corpora, models
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus to obtain the re-weighted view.
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# (Debug aid, disabled: pprint the first document of corpus_tfidf to inspect
# its weighted terms.)

In [35]:
# Train a 10-topic LDA model on the raw bag-of-words corpus (not the TF-IDF
# view), making 2 passes over the corpus with 2 worker processes.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=2,
    workers=2,
)

In [36]:
# num_topics=-1 asks for every topic; each entry is a (topic id, weighted
# top-words string) pair that is unpacked straight into the template.
for topic_entry in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(*topic_entry))

Topic: 0 
Words: 0.008*"gun" + 0.006*"use" + 0.005*"would" + 0.005*"2017" + 0.004*"trump" + 0.004*"''" + 0.003*"october" + 0.003*"may" + 0.003*"paddock" + 0.003*"new"
Topic: 1 
Words: 0.005*"fight" + 0.005*"two" + 0.005*"could" + 0.004*"night" + 0.004*"would" + 0.004*"new" + 0.004*"win" + 0.004*"t-mobile" + 0.004*"left" + 0.004*"killed"
Topic: 2 
Words: 0.010*"church" + 0.009*"texas" + 0.008*"''" + 0.008*"kelley" + 0.006*"gunman" + 0.006*"killed" + 0.006*"sunday" + 0.005*"gun" + 0.005*"2017" + 0.004*"springs"
Topic: 3 
Words: 0.007*"paddock" + 0.006*"''" + 0.006*"festival" + 0.005*"like" + 0.005*"2017" + 0.005*"police" + 0.004*"music" + 0.004*"told" + 0.004*"stephen" + 0.004*"gun"
Topic: 4 
Words: 0.014*"''" + 0.008*"trump" + 0.005*"gun" + 0.005*"new" + 0.004*"paddock" + 0.004*"would" + 0.004*"year" + 0.004*"president" + 0.003*"us" + 0.003*"like"
Topic: 5 
Words: 0.012*"gun" + 0.009*"''" + 0.006*"would" + 0.005*"new" + 0.004*"like" + 0.004*"trump" + 0.004*"guns" + 0.004*"--" + 0.003*"h