## Topic modeling using small policy dataset

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np

In [2]:
# Create small dataframe with customer feedback
text_train = pd.DataFrame({
    "policy_views": ["I would like more funding for pollution mitigation",
                "Environmental regulation and reducing pollution is important to me",
                "Minimum wage and raising living standards",
                "Wages are so low and they need to go up whether by minimum wage increases or collective bargaining",
               "Climate change and environmental degredation are my main focus",
               "Investing in renewable fuels and environmental regulation",
               "Minimum wage and climate change, environmental policy",
               "environment, climate chamge global warming, solar power",
               "Increase federal minimum wage to a livable level to ensure nobody is in poverty"]
})

In [3]:
text_train

Unnamed: 0,policy_views
0,I would like more funding for pollution mitiga...
1,Environmental regulation and reducing pollutio...
2,Minimum wage and raising living standards
3,Wages are so low and they need to go up whethe...
4,Climate change and environmental degredation a...
5,Investing in renewable fuels and environmental...
6,"Minimum wage and climate change, environmental..."
7,"environment, climate chamge global warming, so..."
8,Increase federal minimum wage to a livable lev...


In [8]:
# Steps:
    # 1: Instantiate vectorizer with parameters: vec
    # 2: Vectorize column of text with fit_transform: X
    # 3: Instantiate LDA model with parameters: lda
    # 4: Fit LDA model to vectorized text: doc_topics

In [4]:
# 1: create vectorizer
vec = CountVectorizer(stop_words="english")

In [5]:
# 2: create dtm
X = vec.fit_transform(text_train["policy_views"])

In [6]:
# 3 create lda
lda = LatentDirichletAllocation(n_components=2)

In [7]:
# 4 fit lda
doc_topics = lda.fit_transform(X)

In [8]:
# 5
print(f"There are {lda.components_.shape[0]} topics and {lda.components_.shape[1]} words")

There are 2 topics and 40 words


### Extract top words from each topic

In [9]:
## Get feature names (vocabulary)
voc = np.array(vec.get_feature_names())

In [10]:
# Set number of top words you want
n_words=5

# Create lambda function to extra top words from voc
imp_words = lambda x: [voc[each] for each in np.argsort(x)[:-n_words-1:-1]]

In [11]:
# Use imp_words to extract words with the highest weights from our lda model
words_in_topic = ([imp_words(x) for x in lda.components_])

In [12]:
# Examine words
words_in_topic

[['wage', 'minimum', 'bargaining', 'need', 'collective'],
 ['environmental', 'climate', 'change', 'pollution', 'minimum']]

In [13]:
# Join words into single string
main_topics = [' '.join(x) for x in words_in_topic]

In [14]:
# Create dataframe with top words and set index to topic number
main_topics_df = pd.DataFrame({"top_words": main_topics},
                             index=["Topic_" + str(each) for each in range(lda.n_components)])

In [15]:
# Examine top words
main_topics_df

Unnamed: 0,top_words
Topic_0,wage minimum bargaining need collective
Topic_1,environmental climate change pollution minimum


In [16]:
# Examine topic 1
print(f"Top words in topic 1: {main_topics_df['top_words'][0]}")

Top words in topic 1: wage minimum bargaining need collective


In [17]:
# Loop to look at multiple topics' top words
for i, each in enumerate(main_topics_df["top_words"]):
    print(f"Top words in topic {i}: {each}\n")

Top words in topic 0: wage minimum bargaining need collective

Top words in topic 1: environmental climate change pollution minimum



### Extract top words from topics (as a function)

In [None]:
def top_words_f(n_words):
    n_words=n_words
    imp_words = lambda x: [voc[each] for each in np.argsort(x)[:-n_words-1:-1]]
    words_in_topic = ([imp_words(x) for x in lda.components_])
    main_topics = [' '.join(x) for x in words_in_topic]
    df = pd.DataFrame(main_topics).T
    df.columns = columns=["Topic_" + str(each) for each in range(lda.n_components)]
    return df

In [None]:
# Call function and specify number of top words
main_topics_df = top_words_f(3)

In [None]:
# Print top words in topic 1
print(f"Top words in topic 1: {main_topics_df['top_words'][0]}")

In [None]:
# Print top words in each topic
for i, each in enumerate(main_topics_df["top_words"]):
    print(f"Top words in topic {i}: {each}\n")

### Create a document topic matrix from results

In [18]:
cols = ["Topic_" + str(each) for each in range(lda.n_components)]
docs = ["Document_" + str(each) for each in range(X.shape[0])]

In [20]:
docs

['Document_0',
 'Document_1',
 'Document_2',
 'Document_3',
 'Document_4',
 'Document_5',
 'Document_6',
 'Document_7',
 'Document_8']

In [25]:
# Our input weight vector is doc_topics
# This is a 2d array that is the proportion of the words in each document generated from that topic
doc_topics

array([[0.89704343, 0.10295657],
       [0.88051073, 0.11948927],
       [0.90459607, 0.09540393],
       [0.93906499, 0.06093501],
       [0.07536681, 0.92463319],
       [0.09754112, 0.90245888],
       [0.08922875, 0.91077125],
       [0.06556875, 0.93443125],
       [0.0658709 , 0.9341291 ]])

In [21]:
# Create dataframe with term weights and document # and topic # as rows, columns
df_topics = pd.DataFrame(np.round(doc_topics, 2),
                        columns=cols,
                        index=docs)

In [22]:
# Extract most important topics from those values
imp_topic = np.argmax(df_topics.values, axis=1)

In [24]:
text_train

Unnamed: 0,policy_views
0,I would like more funding for pollution mitiga...
1,Environmental regulation and reducing pollutio...
2,Minimum wage and raising living standards
3,Wages are so low and they need to go up whethe...
4,Climate change and environmental degredation a...
5,Investing in renewable fuels and environmental...
6,"Minimum wage and climate change, environmental..."
7,"environment, climate chamge global warming, so..."
8,Increase federal minimum wage to a livable lev...


In [23]:
df_topics

Unnamed: 0,Topic_0,Topic_1
Document_0,0.1,0.9
Document_1,0.09,0.91
Document_2,0.9,0.1
Document_3,0.94,0.06
Document_4,0.08,0.92
Document_5,0.84,0.16
Document_6,0.09,0.91
Document_7,0.93,0.07
Document_8,0.06,0.94


In [25]:
df_topics["top_topic"] = imp_topic

In [27]:
df_topics

Unnamed: 0,Topic_0,Topic_1,top_topic
Document_0,0.1,0.9,1
Document_1,0.09,0.91,1
Document_2,0.9,0.1,0
Document_3,0.94,0.06,0
Document_4,0.08,0.92,1
Document_5,0.84,0.16,0
Document_6,0.09,0.91,1
Document_7,0.93,0.07,0
Document_8,0.06,0.94,1


In [28]:
# Assign name based on domain
df_topics["topic_name"] = np.where(df_topics["top_topic"] == 1,
                                   "env",
                                   "econ")

In [29]:
df_topics

Unnamed: 0,Topic_0,Topic_1,top_topic,topic_name
Document_0,0.1,0.9,1,env
Document_1,0.09,0.91,1,env
Document_2,0.9,0.1,0,econ
Document_3,0.94,0.06,0,econ
Document_4,0.08,0.92,1,env
Document_5,0.84,0.16,0,econ
Document_6,0.09,0.91,1,env
Document_7,0.93,0.07,0,econ
Document_8,0.06,0.94,1,env


In [32]:
# How dominant is the topic in each document. 
# Fundamentally: the proportion of the words in each document generated from that topic
df_topics

Unnamed: 0,Topic_0,Topic_1,top_topic
Document_0,0.9,0.1,0
Document_1,0.88,0.12,0
Document_2,0.9,0.1,0
Document_3,0.94,0.06,0
Document_4,0.08,0.92,2
Document_5,0.1,0.9,2
Document_6,0.09,0.91,2
Document_7,0.07,0.93,2
Document_8,0.07,0.93,2


In [34]:
text_train

Unnamed: 0,policy_views
0,I would like more funding for pollution mitiga...
1,Environmental regulation and reducing pollutio...
2,Minimum wage and raising living standards
3,Wages are so low and they need to go up whethe...
4,Climate change and environmental degredation a...
5,Investing in renewable fuels and environmental...
6,"Minimum wage and climate change, environmental..."
7,"environment, climate chamge global warming, so..."
8,Increase federal minimum wage to a livable lev...


In [35]:
# Assign name based on domain
df_topics["topic_name"] = np.where(df_topics["top_topic"] == 0,
                                   "environment",
                                   "economic")

In [36]:
df_topics

Unnamed: 0,Topic_0,Topic_1,top_topic,topic_name
Document_0,0.9,0.1,0,environment
Document_1,0.88,0.12,0,environment
Document_2,0.9,0.1,0,environment
Document_3,0.94,0.06,0,environment
Document_4,0.08,0.92,2,economic
Document_5,0.1,0.9,2,economic
Document_6,0.09,0.91,2,economic
Document_7,0.07,0.93,2,economic
Document_8,0.07,0.93,2,economic


### PyLDAvis

In [30]:
import pyLDAvis.sklearn
lda_viz = pyLDAvis.sklearn.prepare(lda_model=lda,
                                   dtm=X,
                                   vectorizer=vec,
                                      sort_topics=False)

  default_term_info = default_term_info.sort_values(


In [31]:
pyLDAvis.display(lda_viz)

# pyLDAvis
# Left panel: 
    # global view of topic model
    # centers of circle are distance between topics then projected onto two dimensions
    # area of circles is the overall prevalence of the topic in the whole topic model
    # examine how prevalent each topic is
    # examine how topics relate to each other
# Right panel:
    # Bars represent individual terms that are most useful for interpreting selected topic on left
    # Blue bar represents corpus wide frequencies
    # Red bar represents topic-specific frequencies
    # examine the meaning of each topic

  and should_run_async(code)
