In [1]:
import joblib
import pandas as pd
import numpy as np
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

%matplotlib inline

tweets = pd.read_csv('../tweets/tweets_clean.csv',
                     header=0,
                     parse_dates=['date'])
tweets.dropna(subset=['lemmas'], inplace=True)
tweets.reset_index(drop=True, inplace=True)

# Load vectorizer
with open('../scripts/topic_modeling_objects/sklearn_vect.joblib', 'rb') as f:
    cv = joblib.load(f)

# Load term frequency matrix
with open('../scripts/topic_modeling_objects/sklearn_CV.joblib', 'rb') as f:
    tf = joblib.load(f)

# Load feature names
with open('../scripts/topic_modeling_objects/sklearn_feature_names.joblib', 'rb') as f:
    tf_names = joblib.load(f)

# Load fitted LDA model
with open('../scripts/topic_modeling_objects/sklearn_LDA_model.joblib', 'rb') as f:
    lda_model = joblib.load(f)

In [2]:
tweets_orig = tweets.text.tolist()
tweet_docs = tweets.lemmas.tolist()

# Topic to document matrix (W) for LDA model
lda_W = lda_model.transform(tf)
# Word to topics matrix (H) for LDA model
lda_H = lda_model.components_


def display_topics(H, W, feature_names, orig_docs, n_words=15, n_docs=25):
    for i, topic in enumerate(H):
        print('Topic {}: '.format(i) + ' '.join([feature_names[word]
                                                 for word in topic.argsort()[: (-n_words - 1): -1]]))
        print()
        top_doc_ids = np.argsort(W[:, i])[:: -1][0: n_docs]
        for doc_id in top_doc_ids:
            print('Tweet:', repr(orig_docs[doc_id]))
        print('-' * 80)


display_topics(lda_H, lda_W, tf_names, tweets_orig, n_docs=0)

Topic 0: trump great lie first play donald always job word thing child anti check wait music

--------------------------------------------------------------------------------
Topic 1: support try much use put become hear god city away sure anyone blacklivesmatter be record

--------------------------------------------------------------------------------
Topic 2: realdonaldtrump campaign blicqer die keep big may free dnc liberal law nothing something merkelmussbleiben tax

--------------------------------------------------------------------------------
Topic 3: make new love white live really lose house read speech care leave remember everyone fact

--------------------------------------------------------------------------------
Topic 4: hillary see go people man never come gop back start ever black old happen racist

--------------------------------------------------------------------------------
Topic 5: call video break hillaryclinton debate police must girl candidate russian name re

In [3]:
topic_names = ['Topic' + str(i) for i in range(len(lda_model.components_))]
doc_names = ['Doc' + str(i) for i in range(len(tweet_docs))]
word_names = ['Word ' + str(i) for i in range(len(tf_names))]

In [4]:
# Create df with the topic probabilities (cols) for each doc (rows)
doc_topic_df = pd.DataFrame(np.round(lda_W, 2), columns=topic_names, index=doc_names)

# Add column with dominant topic for each doc
doc_topic_df['dominant_topic'] = np.argmax(doc_topic_df.values, axis=1)
print(doc_topic_df.head(10))

      Topic0  Topic1  Topic2  Topic3  Topic4  Topic5  Topic6  Topic7  Topic8  \
Doc0    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02   
Doc1    0.07    0.07    0.07    0.07    0.07    0.07    0.07    0.07    0.07   
Doc2    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc3    0.03    0.53    0.03    0.03    0.03    0.03    0.03    0.03    0.03   
Doc4    0.01    0.01    0.01    0.01    0.01    0.01    0.18    0.01    0.01   
Doc5    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.17    0.01   
Doc6    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.18    0.01   
Doc7    0.03    0.03    0.03    0.03    0.03    0.03    0.03    0.03    0.03   
Doc8    0.01    0.01    0.01    0.18    0.01    0.01    0.01    0.68    0.01   
Doc9    0.03    0.03    0.03    0.53    0.03    0.03    0.03    0.03    0.03   

      Topic9  Topic10  Topic11  Topic12  Topic13  Topic14  dominant_topic  
Doc0    0.02     0.02     0.69     0.02    

In [5]:
# Create df with document topic distribution (num docs per topic)
topic_dist_df = doc_topic_df['dominant_topic'].value_counts().reset_index(name='Num docs')
topic_dist_df.columns = ['Topic_num', 'Num_docs']
topic_dist_df['Proportion'] = topic_dist_df.Num_docs.apply(lambda x: x / len(tweet_docs))
print()
print(topic_dist_df)


    Topic_num  Num_docs  Proportion
0           0     29039    0.143359
1           2     24763    0.122250
2           1     17709    0.087426
3           3     15079    0.074442
4           4     14414    0.071159
5           7     13285    0.065585
6           8     11923    0.058861
7          12     11727    0.057894
8           9     11439    0.056472
9           5     10331    0.051002
10          6      9857    0.048662
11         14      9310    0.045961
12         11      8464    0.041785
13         10      8217    0.040566
14         13      7004    0.034577


In [6]:
def top_keywords(feature_names, H):
    keywords = np.array(feature_names)
    topic_keywords = []
    for weights in H:
        topic_keywords_locs = (-weights).argsort()
        topic_keywords.append(keywords.take(topic_keywords_locs))

    return pd.DataFrame(topic_keywords)


# Create df with top words (cols) per topic (rows)
topic_words_df = top_keywords(tf_names, lda_H)
topic_words_df.columns = word_names
topic_words_df.index = topic_names
print(topic_words_df.iloc[:, :5])

                  Word 0    Word 1    Word 2          Word 3    Word 4
Topic0             trump     great       lie           first      play
Topic1           support       try      much             use       put
Topic2   realdonaldtrump  campaign   blicqer             die      keep
Topic3              make       new      love           white      live
Topic4           hillary       see        go          people       man
Topic5              call     video     break  hillaryclinton    debate
Topic6              stop     trump      post          attack     potus
Topic7             trump      want   politic          medium       win
Topic8              take      news     trump            tell     today
Topic9               say      good       day             get     trump
Topic10             year     woman      life           thank       way
Topic11             know   america  midnight           world      work
Topic12            trump      vote      tcot       president  american
Topic1

In [7]:
doc_topic_df.to_csv('../scripts/topic_modeling_objects/topics_per_doc_LDA.csv', index=True)
topic_dist_df.to_csv('../scripts/topic_modeling_objects/docs_per_topic_LDA.csv', index=False)
topic_words_df.to_csv('../scripts/topic_modeling_objects/words_per_topic_LDA.csv', index=True)