In [7]:
import joblib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

tweets = pd.read_csv('../tweets/tweets_clean.csv',
                     header=0,
                     parse_dates=['date'])
tweets.dropna(subset=['lemmas'], inplace=True)

# Load term frequency matrix
with open('../scripts/topic_modeling_objects/sklearn_CV.joblib', 'rb') as f:
    tf = joblib.load(f)

# Load feature names
with open('../scripts/topic_modeling_objects/sklearn_feature_names.joblib', 'rb') as f:
    tf_names = joblib.load(f)

# Load fitted LDA model
with open('../scripts/topic_modeling_objects/sklearn_LDA_model.joblib', 'rb') as f:
    lda_model = joblib.load(f)


In [12]:
tweets_orig = tweets.text.tolist()
tweet_docs = tweets.lemmas.tolist()

# Topic to document matrix (W) for LDA model
lda_W = lda_model.transform(tf)
# Word to topics matrix (H) for LDA model
lda_H = lda_model.components_


def display_topics(H, W, feature_names, orig_docs, n_words=15, n_docs=25):
    for i, topic in enumerate(H):
        print('Topic {}: '.format(i) + ' '.join([feature_names[word]
                                                 for word in topic.argsort()[: (-n_words - 1): -1]]))
        print()
        top_doc_ids = np.argsort(W[:, i])[:: -1][0: n_docs]
        for doc_id in top_doc_ids:
            print('Tweet:', repr(orig_docs[doc_id]))
        print('-' * 80)


display_topics(lda_H, lda_W, tf_names, tweets_orig)

202574
(202574, 15)
Topic 0: trump clinton year show life bill attack potus change donald ask job plan rally dem

Tweet: 'RT @MattaAbraham1: Donald Trump offers retired Lt. Gen. Michael Flynn job https://t.co/g1zLTtflhj via @MailOnline'
Tweet: 'RT @mortreport: Leonard Fournette showing up at 240 lbs, then having a weak 28.5 inch vertical jump are more red flags after a season with…'
Tweet: 'To change your life, you have to change yourself. To change yourself, you have to change your mindset.'
Tweet: '#MichelleObama addresses Melania Trump’s plagiarism and does a killer impression of Barack \r\n#blackgirlsmagic https://t.co/6hygCFL8SH'
Tweet: 'RT @TrayneshaCole: #MichelleObama addresses Melania Trump’s plagiarism and does a killer impression of Barack \r\n#blackgirlsmagic https://t.c…'
Tweet: 'RT @WorldTruthTV: After 56 Years This Woman Learns That One Simple Mistake In The Hospital Changed The Course Of Her Entire Life https://t.…'
Tweet: 'RT @warriorwithaF: Tim Kaine is only about Tim

Topic 7: want trump stop great time voter always poll national cnn order fake miss little trumppence16

Tweet: 'RT @rapstationradio: Playing now on #RAPStationRadio - "Little Bit of This (Original Mix) by GTA feat. Vince Staples @HotWorldPowered https…'
Tweet: 'RT @rapstationradio: #NowPlaying: "Little Bit of This (Original Mix)" by GTA feat. Vince Staples https://t.co/c7qtiDL3Dx'
Tweet: 'RT @THETXEMBASSY: It is time we stopped sacrificing our national security on the spurious altar of “diversity.”\xa0\r\r\nhttps://t.co/TxpctcJ7Yz\r\r\n#B…'
Tweet: 'RT @relaxingview: I miss you I miss you I miss you I miss you I miss you'
Tweet: 'RT @zoeychmabers: I want\r\r\nI want\r\r\nI want\r\r\nI want\r\r\nI want https://t.co/mdbFU4mX96'
Tweet: 'RT @ofccadjust: #HillaryRottenClinton  Donald Trump NUKE AD @Mr_Pinko ORIGINAL\r\n\r\nhttps://t.co/QhpjdVpWOo'
Tweet: 'RT @rapstationradio: #NowPlaying on #RAPStationRadio "Attention Deficit Disorder" Apathy https://t.co/cNjdrJVLGd'
Tweet: 'RT @rapstationra

Tweet: 'RT @FPRNRadio: By The Rundown Live Lily Allen threatened Tommy Robinson with legal action because she lost an argument on... https://t.co/p…'
Tweet: 'RT @justenoughtrope: JET turns 200; come join the party! Plus #RogueOne &amp; #BladeRunner2049 talk! #podernfamily https://t.co/C48sFvkfpP http…'
Tweet: 'Trust your coach. Trust your team. Trust yourself.'
Tweet: 'Trust your coach. — Trust your team. — Trust yourself.'
Tweet: 'Trust your coach. Trust your team. Trust yourself.'
Tweet: 'RT @Cernovich: They are young, they are horny, and they are coming to the West. https://t.co/PHZyNMSVdV'
Tweet: 'RT @dmataconis: Ready To Feel Like A Failure? Joan Of Arc Was Only 19 When She Was Burned At The Stake http://t.co/S2j1IFm4y9'
Tweet: 'RT @dmataconis: Ready To Feel Like A Failure? Joan Of Arc Was Only 19 When She Was Burned At The Stake http://t.co/S2j1IFm4y9'
Tweet: "RT @ManualMystique: #SometimesItsOkTo but don't forget ~ Not if I haven't had my coffee yet. ☕😉"
Tweet: 'RT @emenogu_phil

In [13]:
topic_names = ['Topic' + str(i) for i in range(len(lda_model.components_))]
doc_names = ['Doc' + str(i) for i in range(len(tweet_docs))]
word_names = ['Word ' + str(i) for i in range(len(tf_names))]

In [14]:
# Create df with the topic probabilities (cols) for each doc (rows)
doc_topic_df = pd.DataFrame(np.round(lda_W, 2), columns=topic_names, index=doc_names)

# Add column with dominant topic for each doc
doc_topic_df['dominant_topic'] = np.argmax(doc_topic_df.values, axis=1)
print(doc_topic_df.head(25))

       Topic0  Topic1  Topic2  Topic3  Topic4  Topic5  Topic6  Topic7  Topic8  \
Doc0     0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02   
Doc1     0.07    0.07    0.07    0.07    0.07    0.07    0.07    0.07    0.07   
Doc2     0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc3     0.03    0.03    0.03    0.03    0.03    0.03    0.03    0.03    0.03   
Doc4     0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.68    0.01   
Doc5     0.17    0.01    0.01    0.60    0.01    0.01    0.01    0.01    0.01   
Doc6     0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.61    0.16   
Doc7     0.03    0.03    0.53    0.03    0.03    0.03    0.03    0.03    0.03   
Doc8     0.68    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
Doc9     0.03    0.03    0.03    0.03    0.53    0.03    0.03    0.03    0.03   
Doc10    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.77    0.02   
Doc11    0.01    0.01    0.0

In [15]:
# Create df with document topic distribution (num docs per topic)
topic_dist_df = doc_topic_df['dominant_topic'].value_counts().reset_index(name='Num docs')
topic_dist_df.columns = ['Topic_num', 'Num_docs']
topic_dist_df['Proportion'] = topic_dist_df.Num_docs.apply(lambda x: x / len(tweet_docs))
print()
print(topic_dist_df)


    Topic_num  Num_docs  Proportion
0           0     33299    0.164379
1           2     21810    0.107664
2           1     18944    0.093516
3           3     17015    0.083994
4           4     16211    0.080025
5           8     12821    0.063290
6           9     12168    0.060067
7           6     11281    0.055688
8           5     10857    0.053595
9           7      9963    0.049182
10         10      9570    0.047242
11         11      9312    0.045968
12         12      7546    0.037251
13         14      6396    0.031574
14         13      5381    0.026563


In [16]:
def top_keywords(feature_names, H):
    keywords = np.array(feature_names)
    topic_keywords = []
    for weights in H:
        topic_keywords_locs = (-weights).argsort()
        topic_keywords.append(keywords.take(topic_keywords_locs))

    return pd.DataFrame(topic_keywords)


# Create df with top words (cols) per topic (rows)
topic_words_df = top_keywords(tf_names, lda_H)
topic_words_df.columns = word_names
topic_words_df.index = topic_names
print(topic_words_df.iloc[:, :5])

           Word 0           Word 1    Word 2          Word 3    Word 4
Topic0      trump          clinton      year            show      life
Topic1       news             tell    people           break    police
Topic2      think            would      call           woman     right
Topic3       make            trump   politic            tcot      love
Topic4       good            black       see           trump     video
Topic5         go              man     never          medium      back
Topic6       know        president   clinton           leave      find
Topic7       want            trump      stop           great      time
Topic8        get              say       day           trump   country
Topic9      white  realdonaldtrump   blicqer             die      keep
Topic10      vote         american      maga           trump  midnight
Topic11     obama          america       gop  hillaryclinton      give
Topic12       new            trump  campaign            post      real
Topic1

In [18]:
doc_topic_df.to_csv('../scripts/topic_modeling_objects/topics_per_doc_LDA.csv', index=True)
topic_dist_df.to_csv('../scripts/topic_modeling_objects/docs_per_topic_LDA.csv', index=False)
topic_words_df.to_csv('../scripts/topic_modeling_objects/words_per_topic_LDA.csv', index=True)