In [1]:
import joblib
import pandas as pd
import numpy as np
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

%matplotlib inline

tweets = pd.read_csv('../tweets/tweets_clean.csv',
                     header=0,
                     parse_dates=['date'])
tweets.dropna(subset=['lemmas'], inplace=True)

# Load vectorizer
with open('../scripts/topic_modeling_objects/sklearn_vect.joblib', 'rb') as f:
    cv = joblib.load(f)

# Load term frequency matrix
with open('../scripts/topic_modeling_objects/sklearn_CV.joblib', 'rb') as f:
    tf = joblib.load(f)

# Load feature names
with open('../scripts/topic_modeling_objects/sklearn_feature_names.joblib', 'rb') as f:
    tf_names = joblib.load(f)

# Load fitted LDA model
with open('../scripts/topic_modeling_objects/sklearn_LDA_model.joblib', 'rb') as f:
    lda_model = joblib.load(f)


In [2]:
tweets_orig = tweets.text.tolist()
tweet_docs = tweets.lemmas.tolist()

# Topic to document matrix (W) for LDA model
lda_W = lda_model.transform(tf)
# Word to topics matrix (H) for LDA model
lda_H = lda_model.components_


def display_topics(H, W, feature_names, orig_docs, n_words=15, n_docs=25):
    for i, topic in enumerate(H):
        print('Topic {}: '.format(i) + ' '.join([feature_names[word]
                                                 for word in topic.argsort()[: (-n_words - 1): -1]]))
        print()
        top_doc_ids = np.argsort(W[:, i])[:: -1][0: n_docs]
        for doc_id in top_doc_ids:
            print('Tweet:', repr(orig_docs[doc_id]))
        print('-' * 80)


display_topics(lda_H, lda_W, tf_names, tweets_orig, n_docs=0)

Topic 0: trump clinton year show life bill attack potus change donald ask job plan rally dem

--------------------------------------------------------------------------------
Topic 1: news tell people break police really play refugee everyone hear meet school wait name high

--------------------------------------------------------------------------------
Topic 2: think would call woman right need thank let pjnet way help bad ccot realdonaldtrump mean

--------------------------------------------------------------------------------
Topic 3: make trump politic tcot love state big may money usa face sign conservative bring government

--------------------------------------------------------------------------------
Topic 4: good black see trump video watch world first debate could start say must happen people

--------------------------------------------------------------------------------
Topic 5: go man never medium back pay ever trump tweet old many twitter girl believe tax

-----------

In [13]:
topic_names = ['Topic' + str(i) for i in range(len(lda_model.components_))]
doc_names = ['Doc' + str(i) for i in range(len(tweet_docs))]
word_names = ['Word ' + str(i) for i in range(len(tf_names))]

In [14]:
# Create df with the topic probabilities (cols) for each doc (rows)
doc_topic_df = pd.DataFrame(np.round(lda_W, 2), columns=topic_names, index=doc_names)

# Add column with dominant topic for each doc
doc_topic_df['dominant_topic'] = np.argmax(doc_topic_df.values, axis=1)
print(doc_topic_df.head(10))

       Topic0  Topic1  Topic2  Topic3  Topic4  Topic5  Topic6  Topic7  Topic8  \
Doc0     0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02   
Doc1     0.07    0.07    0.07    0.07    0.07    0.07    0.07    0.07    0.07   
Doc2     0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc3     0.03    0.03    0.03    0.03    0.03    0.03    0.03    0.03    0.03   
Doc4     0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.68    0.01   
Doc5     0.17    0.01    0.01    0.60    0.01    0.01    0.01    0.01    0.01   
Doc6     0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.61    0.16   
Doc7     0.03    0.03    0.53    0.03    0.03    0.03    0.03    0.03    0.03   
Doc8     0.68    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc9     0.03    0.03    0.03    0.03    0.53    0.03    0.03    0.03    0.03   
Doc10    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.77    0.02   
Doc11    0.01    0.01    0.0

In [15]:
# Create df with document topic distribution (num docs per topic)
topic_dist_df = doc_topic_df['dominant_topic'].value_counts().reset_index(name='Num docs')
topic_dist_df.columns = ['Topic_num', 'Num_docs']
topic_dist_df['Proportion'] = topic_dist_df.Num_docs.apply(lambda x: x / len(tweet_docs))
print()
print(topic_dist_df)


    Topic_num  Num_docs  Proportion
0           0     33299    0.164379
1           2     21810    0.107664
2           1     18944    0.093516
3           3     17015    0.083994
4           4     16211    0.080025
5           8     12821    0.063290
6           9     12168    0.060067
7           6     11281    0.055688
8           5     10857    0.053595
9           7      9963    0.049182
10         10      9570    0.047242
11         11      9312    0.045968
12         12      7546    0.037251
13         14      6396    0.031574
14         13      5381    0.026563


In [16]:
def top_keywords(feature_names, H):
    keywords = np.array(feature_names)
    topic_keywords = []
    for weights in H:
        topic_keywords_locs = (-weights).argsort()
        topic_keywords.append(keywords.take(topic_keywords_locs))

    return pd.DataFrame(topic_keywords)


# Create df with top words (cols) per topic (rows)
topic_words_df = top_keywords(tf_names, lda_H)
topic_words_df.columns = word_names
topic_words_df.index = topic_names
print(topic_words_df.iloc[:, :5])

           Word 0           Word 1    Word 2          Word 3    Word 4
Topic0      trump          clinton      year            show      life
Topic1       news             tell    people           break    police
Topic2      think            would      call           woman     right
Topic3       make            trump   politic            tcot      love
Topic4       good            black       see           trump     video
Topic5         go              man     never          medium      back
Topic6       know        president   clinton           leave      find
Topic7       want            trump      stop           great      time
Topic8        get              say       day           trump   country
Topic9      white  realdonaldtrump   blicqer             die      keep
Topic10      vote         american      maga           trump  midnight
Topic11     obama          america       gop  hillaryclinton      give
Topic12       new            trump  campaign            post      real
Topic1

In [18]:
doc_topic_df.to_csv('../scripts/topic_modeling_objects/topics_per_doc_LDA.csv', index=True)
topic_dist_df.to_csv('../scripts/topic_modeling_objects/docs_per_topic_LDA.csv', index=False)
topic_words_df.to_csv('../scripts/topic_modeling_objects/words_per_topic_LDA.csv', index=True)

In [5]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, tf, cv, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
# Construct k-means clusters
clusters = KMeans(n_clusters=15,
                  random_state=100).fit_predict(lda_W)
colors = ['#E53935', '#D81B60', '#8E24AA', '#5E35B1', '#3949AB',
          '#1976D2', '#0288D1', '#0097A7', '#00796B', '#8D6E63',
          '#689F38', '#AFB42B', '#FBC02D', '#90A4AE', '#F57C00']

# Build Singluar Value Decomposition (SVD) model
svd_model = TruncatedSVD(n_components=2)
lda_output_svd = svd_model.fit_transform(lda_W)

x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]

In [None]:
fig = plt.figure(figsize=(12, 12))
for topic, color in zip(np.unique(clusters), colors):
    i = np.where(clusters == topic)
    plt.scatter(x[i], y[i], c=color, label=topic, alpha=0.2)

plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.title('Segregation of Topic Clusters')
leg = plt.legend(title='Topic', loc='best', ncol=3)

for lh in leg.legendHandles:
    lh.set_alpha(1)

fig.savefig('../visuals/LDA_topic_clusters.png')
plt.show()