In [1]:
import joblib
import pandas as pd
import numpy as np
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

%matplotlib inline

tweets = pd.read_csv('../tweets/tweets_clean.csv',
                     header=0,
                     parse_dates=['date'])
tweets.dropna(subset=['lemmas'], inplace=True)
tweets.reset_index(drop=True, inplace=True)

# Load vectorizer
with open('../scripts/topic_modeling_objects/sklearn_vect.joblib', 'rb') as f:
    cv = joblib.load(f)

# Load term frequency matrix
with open('../scripts/topic_modeling_objects/sklearn_CV.joblib', 'rb') as f:
    tf = joblib.load(f)

# Load feature names
with open('../scripts/topic_modeling_objects/sklearn_feature_names.joblib', 'rb') as f:
    tf_names = joblib.load(f)

# Load fitted LDA model
with open('../scripts/topic_modeling_objects/sklearn_LDA_model.joblib', 'rb') as f:
    lda_model = joblib.load(f)

In [2]:
tweets_orig = tweets.text.tolist()
tweet_docs = tweets.lemmas.tolist()

# Topic to document matrix (W) for LDA model
lda_W = lda_model.transform(tf)
# Word to topics matrix (H) for LDA model
lda_H = lda_model.components_


def display_topics(H, W, feature_names, orig_docs, n_words=15, n_docs=25):
    for i, topic in enumerate(H):
        print('Topic {}: '.format(i) + ' '.join([feature_names[word]
                                                 for word in topic.argsort()[: (-n_words - 1): -1]]))
        print()
        top_doc_ids = np.argsort(W[:, i])[:: -1][0: n_docs]
        for doc_id in top_doc_ids:
            print('Tweet:', repr(orig_docs[doc_id]))
        print('-' * 80)


display_topics(lda_H, lda_W, tf_names, tweets_orig, n_docs=0)

Topic 0: trump want news tell politic win break try post way attack pay donald tweet twitter

--------------------------------------------------------------------------------
Topic 1: vote american trump right campaign people big watch muslim job family happy illegal true pjnet

--------------------------------------------------------------------------------
Topic 2: realdonaldtrump call give debate trump money make hate become bring question school hope high order

--------------------------------------------------------------------------------
Topic 3: trump clinton america election great come may voter politic war mean poll russia republican news

--------------------------------------------------------------------------------
Topic 4: obama hillaryclinton gop bill isis dnc clinton care terrorist neverhillary game check kid music claim

--------------------------------------------------------------------------------
Topic 5: black president state first trump talk real potus start ba

In [3]:
topic_names = ['Topic' + str(i) for i in range(len(lda_model.components_))]
doc_names = ['Doc' + str(i) for i in range(len(tweet_docs))]
word_names = ['Word ' + str(i) for i in range(len(tf_names))]

In [4]:
# Create df with the topic probabilities (cols) for each doc (rows)
doc_topic_df = pd.DataFrame(np.round(lda_W, 2), columns=topic_names, index=doc_names)

# Add column with dominant topic for each doc
doc_topic_df['dominant_topic'] = np.argmax(doc_topic_df.values, axis=1)
print(doc_topic_df.head(10))

      Topic0  Topic1  Topic2  Topic3  Topic4  Topic5  Topic6  Topic7  Topic8  \
Doc0    0.03    0.03    0.03    0.53    0.03    0.03    0.03    0.03    0.03   
Doc1    0.07    0.07    0.07    0.07    0.07    0.07    0.07    0.07    0.07   
Doc2    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc3    0.03    0.03    0.03    0.03    0.03    0.03    0.53    0.03    0.03   
Doc4    0.18    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc5    0.38    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc6    0.01    0.01    0.01    0.72    0.15    0.01    0.01    0.01    0.01   
Doc7    0.53    0.03    0.03    0.03    0.03    0.03    0.03    0.03    0.03   
Doc8    0.84    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc9    0.03    0.03    0.03    0.03    0.03    0.03    0.03    0.53    0.03   

      Topic9  Topic10  Topic11  Topic12  Topic13  Topic14  dominant_topic  
Doc0    0.03     0.03     0.03     0.03    

In [5]:
# Create df with document topic distribution (num docs per topic)
topic_dist_df = doc_topic_df['dominant_topic'].value_counts().reset_index(name='Num docs')
topic_dist_df.columns = ['Topic_num', 'Num_docs']
topic_dist_df['Proportion'] = topic_dist_df.Num_docs.apply(lambda x: x / len(tweet_docs))
print()
print(topic_dist_df)


    Topic_num  Num_docs  Proportion
0           0     38583    0.190499
1           1     20961    0.103492
2           3     17749    0.087633
3           2     17081    0.084335
4           4     13792    0.068096
5           5     11587    0.057209
6           7     10370    0.051201
7           9     10341    0.051057
8           6     10131    0.050020
9          10      9860    0.048682
10          8      9697    0.047878
11         11      8996    0.044417
12         12      8578    0.042353
13         14      7659    0.037815
14         13      7152    0.035312


In [6]:
def top_keywords(feature_names, H):
    keywords = np.array(feature_names)
    topic_keywords = []
    for weights in H:
        topic_keywords_locs = (-weights).argsort()
        topic_keywords.append(keywords.take(topic_keywords_locs))

    return pd.DataFrame(topic_keywords)


# Create df with top words (cols) per topic (rows)
topic_words_df = top_keywords(tf_names, lda_H)
topic_words_df.columns = word_names
topic_words_df.index = topic_names
print(topic_words_df.iloc[:, :5])

                  Word 0          Word 1   Word 2    Word 3    Word 4
Topic0             trump            want     news      tell   politic
Topic1              vote        american    trump     right  campaign
Topic2   realdonaldtrump            call     give    debate     trump
Topic3             trump         clinton  america  election     great
Topic4             obama  hillaryclinton      gop      bill      isis
Topic5             black       president    state     first     trump
Topic6              love           trump      law      lead      late
Topic7              take            time     stop      live     pjnet
Topic8              know          medium     work     email      even
Topic9             video         blicqer    thank       die      play
Topic10              new           trump     show      year      maga
Topic11          hillary             day    think       see   country
Topic12             make            need    white  midnight   support
Topic13            w

In [7]:
doc_topic_df.to_csv('../scripts/topic_modeling_objects/topics_per_doc_LDA.csv', index=True)
topic_dist_df.to_csv('../scripts/topic_modeling_objects/docs_per_topic_LDA.csv', index=False)
topic_words_df.to_csv('../scripts/topic_modeling_objects/words_per_topic_LDA.csv', index=True)