# News Article and Title NLP and LDA

# 1. Natural Language Processing
Adapted NLP portions from https://github.com/GarrettHoffman/digital_media_shares_optimization

In [79]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [42]:
# import data
inputData = pd.read_csv('HWNLP.csv')

In [43]:
inputData.shape

(141, 18)

In [44]:
# check length of final data set
len(inputData)

141

In [45]:
inputData.head()

Unnamed: 0,ResearchUpdate,NewsletterMonth,NewsletterYear,NewsletterIssueDate,NewsletterIssueTime,NewsletterIssueAmPm,Link,Title,ArticleText,Images,NumberImages,Tools,NumberTools,Topic1,Topic2,Extra topics,headline,content
0,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/energy/best-pr...,Long-Term Energy Development Has Negative Impa...,This paper demonstrates that when fossil fuel ...,1,3,1,1,Best Practices,Trends & Performance,,long term energy development negative impact c...,paper demonstrate fossil fuel development play...
1,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/dataviz/west-w...,New Atlas Shows Diversifying Western Economy,"Download socioeconomic reports of communities,...",0,0,1,1,,,,new atlas show diversify western economy,download socioeconomic report community county...
2,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/public-lands/c...,Winners & Losers from Proposed County Payments...,This report compares how North Dakota provides...,1,3,1,1,Best Practices,,,winner loser propose county payment reform,report compare north dakota provide local gove...
3,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/energy/best-pr...,North Dakota Not Returning Adequate Oil Revenu...,FEBRUARY 2014 / SERIES: STATE ENERGY POLICIES\...,1,2,1,1,Best Practices,,,north dakota return adequate oil revenue local...,february series state energy policy report com...
4,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/public-lands/s...,Owyhee Canyonlands Hold Economic Potential,This report provides an initial analysis of th...,1,1,1,1,Local Studies,pecific Places,,owyhee canyonlands hold economic potential,report provide initial analysis potential econ...


# feature engineering

In [46]:
from __future__ import division
import numpy as np
import pymongo
import nltk
from textblob import TextBlob
import string
from nltk.corpus import stopwords
from textstat.textstat import textstat

In [47]:
stop = stopwords.words('english')


def _as_text(value):
    """Return ``value`` as a string, mapping a missing cell (None/NaN) to ''."""
    # NaN is the only float that is not equal to itself. Without this guard
    # str(nan) would produce the literal word "nan" and be scored as text.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _punctuation_free_tokens(text):
    """Delete ``string.punctuation`` characters from ``text`` and split on whitespace."""
    return "".join(c for c in text if c not in string.punctuation).split()


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when ``denominator`` is 0."""
    return float(numerator) / denominator if denominator else 0


def _mean_or_zero(values):
    """Return ``np.mean(values)``, or 0 for an empty list (avoids NaN + warning)."""
    return np.mean(values) if values else 0


def engineer_NLP_features(inputData):

    """
    Generate NLP features (related to language and sentiment)
    for one article, to be used in predicting no. of reads.

    Arguments:
    inputData: mapping-like record (e.g. one DataFrame row) that provides
               the article headline under 'Title' and the article body
               under 'ArticleText'. A missing value (None/NaN) is treated
               as empty text.

    Output:
    A single-row pandas DataFrame holding the original headline/content
    plus one column per engineered feature. Features that are undefined
    for the given text (no tokens, no positive words, ...) are reported
    as 0.
    """

    # get article headline and article content
    headline = _as_text(inputData['Title'])
    content = _as_text(inputData['ArticleText'])

    # ---- headline features ----

    # number of words in title
    n_tokens_title = len(headline.split())

    # subjectivity / polarity (one sentiment analysis instead of two)
    headline_sentiment = TextBlob(headline).sentiment
    title_subjectivity = headline_sentiment.subjectivity
    title_sentiment_polarity = headline_sentiment.polarity

    # absolute value polarity
    title_sentiment_abs_polarity = abs(title_sentiment_polarity)

    # average word length
    average_token_length_title = _mean_or_zero(
        [len(w) for w in _punctuation_free_tokens(headline)])

    # ---- content features ----

    # Tokenise once; every content feature below reuses these lists.
    content_tokens = _punctuation_free_tokens(content)
    lowered_tokens = [w.lower() for w in content_tokens]

    # Stop words are compared against lower-cased, punctuation-free tokens,
    # so the stop list is normalised the same way ("don't" -> "dont").
    # It is rebuilt from the module-level ``stop`` on each call so that
    # later changes to ``stop`` are still honoured.
    stop_words = set(stop)
    stop_words.update(_punctuation_free_tokens(" ".join(stop)))
    non_stop_tokens = [w for w in lowered_tokens if w not in stop_words]

    # number of words (whitespace-separated, punctuation kept)
    n_tokens_content = len(content.split())

    # rate of unique words
    r_unique_tokens = _safe_ratio(len(set(lowered_tokens)), n_tokens_content)

    # rate of non-stop words
    r_non_stop_words = _safe_ratio(len(non_stop_tokens), n_tokens_content)

    # rate of unique non-stop words
    r_non_stop_unique_tokens = _safe_ratio(len(set(non_stop_tokens)),
                                           n_tokens_content)

    # average word length
    average_token_length_content = _mean_or_zero(
        [len(w) for w in content_tokens])

    # subjectivity / polarity of the whole article
    content_sentiment = TextBlob(content).sentiment
    global_subjectivity = content_sentiment.subjectivity
    global_sentiment_polarity = content_sentiment.polarity

    # absolute polarity
    global_sentiment_abs_polarity = abs(global_sentiment_polarity)

    # polarity of every individual word, split by sign
    word_polarities = [TextBlob(w).polarity for w in content_tokens]
    positive = [p for p in word_polarities if p > 0]
    negative = [p for p in word_polarities if p < 0]
    n_polar_words = len(positive) + len(negative)

    # global positive / negative word rate (among all words)
    global_rate_positive_words = _safe_ratio(len(positive),
                                             len(word_polarities))
    global_rate_negative_words = _safe_ratio(len(negative),
                                             len(word_polarities))

    # positive / negative word rate (among non-neutral words)
    rate_positive_words = _safe_ratio(len(positive), n_polar_words)
    rate_negative_words = _safe_ratio(len(negative), n_polar_words)

    # average / minimum / maximum polarity of positive words
    avg_positive_polarity = _mean_or_zero(positive)
    min_positive_polarity = min(positive) if positive else 0
    max_positive_polarity = max(positive) if positive else 0

    # average / minimum / maximum polarity of negative words
    avg_negative_polarity = _mean_or_zero(negative)
    min_negative_polarity = min(negative) if negative else 0
    max_negative_polarity = max(negative) if negative else 0

    # abs maximum polarity: max positive polarity plus abs of the most
    # negative polarity
    max_abs_polarity = max_positive_polarity + abs(min_negative_polarity)

    # Readability scores are only requested when there is text to score;
    # for empty content they are reported as 0 like the other features.
    if content_tokens:
        # Flesch Reading Ease
        global_reading_ease = textstat.flesch_reading_ease(content)
        # Flesch Kincaid Grade Level
        global_grade_level = textstat.flesch_kincaid_grade(content)
    else:
        global_reading_ease = 0
        global_grade_level = 0

    # Key order defines the column order of the returned frame.
    res = {"headline": inputData["Title"],
           "content": inputData["ArticleText"],
           "n_tokens_title": n_tokens_title,
           "title_subjectivity": title_subjectivity,
           "title_sentiment_polarity": title_sentiment_polarity,
           "title_sentiment_abs_polarity": title_sentiment_abs_polarity,
           "average_token_length_title": average_token_length_title,
           "n_tokens_content": n_tokens_content,
           "r_unique_tokens": r_unique_tokens,
           "r_non_stop_words": r_non_stop_words,
           "r_non_stop_unique_tokens": r_non_stop_unique_tokens,
           "average_token_length_content": average_token_length_content,
           "global_subjectivity": global_subjectivity,
           "global_sentiment_polarity": global_sentiment_polarity,
           "global_sentiment_abs_polarity": global_sentiment_abs_polarity,
           "global_rate_positive_words": global_rate_positive_words,
           "global_rate_negative_words": global_rate_negative_words,
           "rate_positive_words": rate_positive_words,
           "rate_negative_words": rate_negative_words,
           "avg_positive_polarity": avg_positive_polarity,
           "min_positive_polarity": min_positive_polarity,
           "max_positive_polarity": max_positive_polarity,
           "avg_negative_polarity": avg_negative_polarity,
           "min_negative_polarity": min_negative_polarity,
           "max_negative_polarity": max_negative_polarity,
           "max_abs_polarity": max_abs_polarity,
           "global_reading_ease": global_reading_ease,
           "global_grade_level": global_grade_level}

    # orient='index' + transpose turns the flat dict into a one-row frame
    return pd.DataFrame.from_dict(res, orient='index').transpose()

In [48]:
# Build one single-row feature frame per article. Iterating over every row
# (instead of a hard-coded head(142)) keeps this correct if the data grows.
feature_frames = [engineer_NLP_features(row)
                  for _, row in inputData.iterrows()]

# Concatenate once at the end: repeated DataFrame.append copies the whole
# frame on every iteration and no longer exists in pandas 2.x.
# NOTE: as before, every row keeps index label 0 (each single-row frame
# carries it); pass ignore_index=True here if a unique index is wanted.
if feature_frames:
    res = pd.concat(feature_frames)
else:
    # no articles: keep `res` a DataFrame so the later to_csv call still works
    res = pd.DataFrame()

print(res)

                                             headline  \
0   Long-Term Energy Development Has Negative Impa...   
0        New Atlas Shows Diversifying Western Economy   
0   Winners & Losers from Proposed County Payments...   
0   North Dakota Not Returning Adequate Oil Revenu...   
0          Owyhee Canyonlands Hold Economic Potential   
0                  Great Lakes Climate Change Impacts   
0       Determining Vulnerabilities to Climate Change   
0          Understanding Income from National Forests   
0                 Why State Energy Tax Policy Matters   
0   Migration & Population Trends in the West Vary...   
0   The Tongass Transition Framework: A New Path F...   
0            Time to Create a Natural Resources Trust   
0                            County Payments Research   
0               Reducing Wildfire Risk to Communities   
0   How Is Fracking Shaping Your Community and Eco...   
0   Lessons for Wildfire from Federal Flood Risk M...   
0   Benefits of Renewable Energ

In [9]:
res.to_csv('res_dataset731.csv')

# 2. LDA model features

In [92]:
import numpy as np
import pandas as pd
import re
import time
import nltk

In [93]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kristi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kristi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [94]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [95]:
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaModel

In [96]:
import pyLDAvis
import pyLDAvis.gensim

In [97]:
df = pd.read_csv('HWNLP.csv')

In [98]:
df.head()

Unnamed: 0,ResearchUpdate,NewsletterMonth,NewsletterYear,NewsletterIssueDate,NewsletterIssueTime,NewsletterIssueAmPm,Link,Title,ArticleText,Images,NumberImages,Tools,NumberTools,Topic1,Topic2,Extra topics,headline,content
0,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/energy/best-pr...,Long-Term Energy Development Has Negative Impa...,This paper demonstrates that when fossil fuel ...,1,3,1,1,Best Practices,Trends & Performance,,long term energy development negative impact c...,paper demonstrate fossil fuel development play...
1,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/dataviz/west-w...,New Atlas Shows Diversifying Western Economy,"Download socioeconomic reports of communities,...",0,0,1,1,,,,new atlas show diversify western economy,download socioeconomic report community county...
2,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/public-lands/c...,Winners & Losers from Proposed County Payments...,This report compares how North Dakota provides...,1,3,1,1,Best Practices,,,winner loser propose county payment reform,report compare north dakota provide local gove...
3,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/energy/best-pr...,North Dakota Not Returning Adequate Oil Revenu...,FEBRUARY 2014 / SERIES: STATE ENERGY POLICIES\...,1,2,1,1,Best Practices,,,north dakota return adequate oil revenue local...,february series state energy policy report com...
4,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/public-lands/s...,Owyhee Canyonlands Hold Economic Potential,This report provides an initial analysis of th...,1,1,1,1,Local Studies,pecific Places,,owyhee canyonlands hold economic potential,report provide initial analysis potential econ...


In [99]:
df.shape

(141, 18)

# Clean the reviews
Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma. In the cell below, each word is lemmatized first as a noun and then as a verb.

In [100]:
%%time
wordnet_lemmatizer = WordNetLemmatizer()
stopset = list(set(stopwords.words('english')))
stopword_lookup = frozenset(stopset)  # constant-time membership tests in the loop

# Compiled once, outside the loop.
link_pattern = re.compile(r"http\S+", re.IGNORECASE)
non_letter_pattern = re.compile("[^a-zA-Z]")

clean_reviews_text = []
for review in df['ArticleText']:  # one article body per iteration
    try:
        # Remove links FIRST, while the URL is still one whitespace-free
        # token. Doing it after punctuation is blanked out would only drop
        # the bare "http"/"https" word and leave host/path fragments behind.
        cleaned_review = link_pattern.sub(" ", review)
    except TypeError:
        # Non-string cell (e.g. NaN for a missing article): store an empty
        # document so the list stays aligned row-for-row with df, instead
        # of re-appending the previous article's text.
        clean_reviews_text.append("")
        continue

    # Remove numbers and punctuation, then convert to lower case.
    cleaned_review = non_letter_pattern.sub(" ", cleaned_review).lower()

    # Keep only words that are not stopwords.
    words = [word for word in cleaned_review.split()
             if word not in stopword_lookup]

    # Reduce each word to its noun lemma, then to its verb lemma.
    words = [wordnet_lemmatizer.lemmatize(
                 wordnet_lemmatizer.lemmatize(word, pos='n'), pos='v')
             for word in words]

    # Joining with single spaces also normalises white space.
    cleaned_review = ' '.join(words)
    clean_reviews_text.append(cleaned_review)

CPU times: user 958 ms, sys: 10.3 ms, total: 969 ms
Wall time: 1.12 s


In [101]:
df['cleanText'] = clean_reviews_text
df.head()

Unnamed: 0,ResearchUpdate,NewsletterMonth,NewsletterYear,NewsletterIssueDate,NewsletterIssueTime,NewsletterIssueAmPm,Link,Title,ArticleText,Images,NumberImages,Tools,NumberTools,Topic1,Topic2,Extra topics,headline,content,cleanText
0,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/energy/best-pr...,Long-Term Energy Development Has Negative Impa...,This paper demonstrates that when fossil fuel ...,1,3,1,1,Best Practices,Trends & Performance,,long term energy development negative impact c...,paper demonstrate fossil fuel development play...,paper demonstrate fossil fuel development play...
1,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/dataviz/west-w...,New Atlas Shows Diversifying Western Economy,"Download socioeconomic reports of communities,...",0,0,1,1,,,,new atlas show diversify western economy,download socioeconomic report community county...,download socioeconomic report community county...
2,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/public-lands/c...,Winners & Losers from Proposed County Payments...,This report compares how North Dakota provides...,1,3,1,1,Best Practices,,,winner loser propose county payment reform,report compare north dakota provide local gove...,report compare north dakota provide local gove...
3,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/energy/best-pr...,North Dakota Not Returning Adequate Oil Revenu...,FEBRUARY 2014 / SERIES: STATE ENERGY POLICIES\...,1,2,1,1,Best Practices,,,north dakota return adequate oil revenue local...,february series state energy policy report com...,february series state energy policy report com...
4,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/public-lands/s...,Owyhee Canyonlands Hold Economic Potential,This report provides an initial analysis of th...,1,1,1,1,Local Studies,pecific Places,,owyhee canyonlands hold economic potential,report provide initial analysis potential econ...,report provide initial analysis potential econ...


In [102]:
print('ORIGINAL: ' + df.iloc[0]['ArticleText'])
print(' ')
print('CLEANED: ' + df.iloc[0]['cleanText'])

ORIGINAL: This paper demonstrates that when fossil fuel development plays a prominent, long-term role in local western economies there are negative effects on per capita income, crime rates, and educational attainment.

For counties that participated in the early 1980s oil and natural gas boom, per capita income declines with longer industry specialization.
The longer the duration of oil and gas specialization in the county, the higher the crime rate.
For counties that participated in the early 1980s oil and gas boom, educational attainment declines with longer specialization.
maps of long-term impacts of economic dependence on fossil fuel

Prolonged oil and natural gas specialization leads to lower per capita income, more crime, and less educational attainment

This paper (PDF), “Oil and Gas Extraction as an Economic Development Strategy,” and an executive summary (PDF) call into question the assumption that long-term oil and gas development confers a clear economic advantage on host 

In [103]:
df['cleanText'].to_csv('cleanText.csv')

# Perform semantic analysis using LDA.
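Preprocess the reviews by creating a dictionary of words used and a bag-of-words corpus. Note that on a large corpus each of the steps below can take several minutes; on this 141-article data set they each finish in about a second or less (see the timings printed under each cell).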

In [104]:
%%time
# Split every cleaned document into its whitespace-separated tokens.
tokens_by_doc = []
for cleaned_doc in clean_reviews_text:
    tokens_by_doc.append(cleaned_doc.split())

# Dictionary of the words used across all documents.
dictionary = corpora.Dictionary(tokens_by_doc)

# Bag-of-words representation of each document, in document order.
bow_corpus = list(map(dictionary.doc2bow, tokens_by_doc))

CPU times: user 143 ms, sys: 9.43 ms, total: 152 ms
Wall time: 167 ms


In [105]:
%%time
# Fit a 10-topic LDA model on the bag-of-words corpus; `dictionary` is passed
# as id2word so topics can be shown as words rather than integer ids.
# random_state is pinned to 201 -- presumably so reruns give the same topics.
lda_model = LdaModel(bow_corpus, num_topics=10, id2word=dictionary, random_state=201)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

CPU times: user 1.04 s, sys: 104 ms, total: 1.14 s
Wall time: 765 ms


In [106]:
%%time
# For every document keep only the id of its most probable topic:
# order the (topic_id, probability) pairs by descending probability
# and take the topic id of the first pair.
topics_list = [
    sorted(doc_topics, key=lambda pair: pair[1], reverse=True)[0][0]
    for doc_topics in lda_model.get_document_topics(bow_corpus)
]

CPU times: user 288 ms, sys: 24.6 ms, total: 313 ms
Wall time: 192 ms


In [107]:
lda_model.get_document_topics(bow_corpus[0])

[(1, 0.687926), (4, 0.27515444), (5, 0.036141668)]

In [108]:
df['Top Topic'] = topics_list
df.head()

Unnamed: 0,ResearchUpdate,NewsletterMonth,NewsletterYear,NewsletterIssueDate,NewsletterIssueTime,NewsletterIssueAmPm,Link,Title,ArticleText,Images,NumberImages,Tools,NumberTools,Topic1,Topic2,Extra topics,headline,content,cleanText,Top Topic
0,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/energy/best-pr...,Long-Term Energy Development Has Negative Impa...,This paper demonstrates that when fossil fuel ...,1,3,1,1,Best Practices,Trends & Performance,,long term energy development negative impact c...,paper demonstrate fossil fuel development play...,paper demonstrate fossil fuel development play...,1
1,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/dataviz/west-w...,New Atlas Shows Diversifying Western Economy,"Download socioeconomic reports of communities,...",0,0,1,1,,,,new atlas show diversify western economy,download socioeconomic report community county...,download socioeconomic report community county...,5
2,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/public-lands/c...,Winners & Losers from Proposed County Payments...,This report compares how North Dakota provides...,1,3,1,1,Best Practices,,,winner loser propose county payment reform,report compare north dakota provide local gove...,report compare north dakota provide local gove...,5
3,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/energy/best-pr...,North Dakota Not Returning Adequate Oil Revenu...,FEBRUARY 2014 / SERIES: STATE ENERGY POLICIES\...,1,2,1,1,Best Practices,,,north dakota return adequate oil revenue local...,february series state energy policy report com...,february series state energy policy report com...,5
4,0,December,2013,12/10/13,11:41:58,PM,https://headwaterseconomics.org/public-lands/s...,Owyhee Canyonlands Hold Economic Potential,This report provides an initial analysis of th...,1,1,1,1,Local Studies,pecific Places,,owyhee canyonlands hold economic potential,report provide initial analysis potential econ...,report provide initial analysis potential econ...,4


In [109]:
lda_model.show_topic(topicid=5, topn=20)

[('revenue', 0.01336116),
 ('tax', 0.012346165),
 ('community', 0.0119744325),
 ('state', 0.011365761),
 ('oil', 0.010931766),
 ('production', 0.009692416),
 ('economic', 0.008466131),
 ('local', 0.007773456),
 ('use', 0.0075841295),
 ('county', 0.0075266883),
 ('wildfire', 0.0063531892),
 ('cost', 0.0060999445),
 ('government', 0.0057153883),
 ('income', 0.0055841515),
 ('coal', 0.0055741877),
 ('unconventional', 0.0054597114),
 ('federal', 0.005338116),
 ('fire', 0.0053043338),
 ('impact', 0.0052453214),
 ('report', 0.0052178716)]

# Visualize your LDA results
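Preparing the visualization can take several minutes on a large corpus; on this data set it takes only a few seconds (see the timing printed below the cell).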

In [110]:
%%time
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


CPU times: user 636 ms, sys: 199 ms, total: 835 ms
Wall time: 4.83 s


In [111]:
# NOTE(review): this only binds a notebook-level variable named `sort`.
# It is not passed to any call, so it does not appear to affect the pandas
# concat sorting warning printed by the previous cell.
sort=False

In [112]:
%%time
pyLDAvis.display(LDAvis_prepared)

CPU times: user 9.78 ms, sys: 1.27 ms, total: 11.1 ms
Wall time: 11.7 ms


In [113]:
import pandas as pd

# One {topic_id: probability} dict per document, in corpus order.
mixture = [dict(lda_model[doc_bow]) for doc_bow in bow_corpus]

# Rows are documents, columns are topic ids; write the table to disk.
mixture_frame = pd.DataFrame(mixture)
mixture_frame.to_csv("topic_mixture731.csv")

In [114]:
# Flatten the five highest-weighted words of every topic into
# (topic id, word, weight) rows.
top_words_per_topic = [
    (topic_id, word, weight)
    for topic_id in range(lda_model.num_topics)
    for word, weight in lda_model.show_topic(topic_id, topn = 5)
]

# Save the rows as a three-column table.
top_words_frame = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])
top_words_frame.to_csv("top_words731.csv")