# Create a topic model for each community per period

In [1]:
import nttc

# Change data_path to the folder on your own system where these files reside
data_path = 'test-data'
# Plain variable name for the CSV file. This cell previously assigned the name
# to __file__, a double-underscore name Python reserves for the path of the
# running module; rebinding it can mislead any code that inspects __file__.
csv_filename = 'test.csv'

# Column dtypes for the corpus; passed to nttc.get_csv() below.
# NOTE(review): 'link' is declared here but is not among the columns shown in
# the preview output, and 'user_id' has no declared dtype - confirm against the CSV.
dtype_dict = {
    'community': str,
    'tweets': str,
    'retweets_count': int,
    'link': str,
    'username': str
}

# 1. Load CSV
df_tweets = nttc.get_csv(data_path, csv_filename, dtype_dict)
# Preview the first five rows
df_tweets.head()

Unnamed: 0,community,tweets,retweets_count,username,user_id
0,0,Ivewib bo joshuv zuwfu jo ju pahma afozadcem k...,1540,akgifur,372359695
1,0,Fed lo wa ihitfol fom new rubedawi izu jompe k...,735,manrij,591230
2,0,Rasveca fokat tebtag tewteg jifazasih ipu wew ...,491,okajuhe,9019686939508610000
3,0,Iwfaz vudfosna cesaj boigab rel op zuve tolake...,465,niegeki,5543039845159
4,0,Dujfaw bufsis fo ami uh huf lottu kip izura ac...,454,umu,39622197606155


In [2]:
# 2. Collect the community numbers into a list
## get_comm_nums accepts either period_obj or dft_comm_col; the DataFrame
## column is supplied here and period_obj is left as None
comm_list = nttc.get_comm_nums(
    dft_comm_col=df_tweets['community'],
    period_obj=None,
)
comm_list

['0', '1', '2', '5', '6', '7', '8', '9', '10', '12']

In [3]:
# 3. Build a dictionary of tweets organized per community
dict_all_comms = nttc.comm_dict_writer(
    comm_list=comm_list,
    df_content=df_tweets,
    comm_col='community',
    content_col='tweets',
)

# The result is a dict of communitiesObject() instances keyed by community label
dict_all_comms

{'0': <nttc.nttc.communitiesObject at 0x10498fdd8>,
 '1': <nttc.nttc.communitiesObject at 0x10498fe48>,
 '10': <nttc.nttc.communitiesObject at 0x1049ac860>,
 '12': <nttc.nttc.communitiesObject at 0x1049ac9e8>,
 '2': <nttc.nttc.communitiesObject at 0x10498ffd0>,
 '5': <nttc.nttc.communitiesObject at 0x10498fb38>,
 '6': <nttc.nttc.communitiesObject at 0x1049ac0b8>,
 '7': <nttc.nttc.communitiesObject at 0x1049ac240>,
 '8': <nttc.nttc.communitiesObject at 0x1049ac550>,
 '9': <nttc.nttc.communitiesObject at 0x1049ac6d8>}

In [4]:
# 4. Process the tweets of every community
split_dict_all_comms = nttc.split_community_tweets(
    dict_comm_obj=dict_all_comms,
    col_name='tweets',
    sample_size_percentage=1,
)

# Show the resulting dict
split_dict_all_comms

Length of community 0 data set: 180
Sample size:  180
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrisl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Length of community 1 data set: 500
Sample size:  500
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrisl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Length of community 2 data set: 358
Sample size:  358
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrisl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Length of community 5 data set: 172
Sample size:  172
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrisl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Length of community 6 data set: 500
Sample size:  500
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrisl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Length of commu

{'0': <nttc.nttc.communitiesObject at 0x10498fdd8>,
 '1': <nttc.nttc.communitiesObject at 0x10498fe48>,
 '10': <nttc.nttc.communitiesObject at 0x1049ac860>,
 '12': <nttc.nttc.communitiesObject at 0x1049ac9e8>,
 '2': <nttc.nttc.communitiesObject at 0x10498ffd0>,
 '5': <nttc.nttc.communitiesObject at 0x10498fb38>,
 '6': <nttc.nttc.communitiesObject at 0x1049ac0b8>,
 '7': <nttc.nttc.communitiesObject at 0x1049ac240>,
 '8': <nttc.nttc.communitiesObject at 0x1049ac550>,
 '9': <nttc.nttc.communitiesObject at 0x1049ac6d8>}

In [6]:
# 5. Build the topic model for every community
# The keyword arguments from num_topics through per_word_topics are, per the
# original note, gensim LDA model arguments - presumably forwarded by
# nttc.tm_maker; verify against its documentation.
tms_full_dict = nttc.tm_maker(
    random_seed=2018,
    split_comms=split_dict_all_comms,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
    # Set single to False when passing the full dict of community objects
    single=False,
)


[nltk_data] Downloading package wordnet to /Users/chrisl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

 0  Perplexity:  -7.113020190861234

 0  Coherence Score:  0.5255200614387542

 1  Perplexity:  -6.466233739788675

 1  Coherence Score:  0.5402824795126752

 2  Perplexity:  -6.6606385798741945

 2  Coherence Score:  0.4995265929071183

 5  Perplexity:  -7.144529900432735

 5  Coherence Score:  0.43154799544780864

 6  Perplexity:  -6.469372752611412

 6  Coherence Score:  0.48103056449877

 7  Perplexity:  -8.125527086721654

 7  Coherence Score:  0.4460353962196401

 8  Perplexity:  -7.306125270072641

 8  Coherence Score:  0.44865751535447257

 9  Perplexity:  -7.295336417372814

 9  Coherence Score:  0.4483180592307649

 10  Perplexity:  -7.659357865604488

 10  Coherence Score:  0.6121089929075394

 12  Perplexity:  -6.351716878389683

 12  Coherence Score:  0.39957790713822516

 Modeling complete.


In [7]:
# Display the dict returned by nttc.tm_maker() in the previous cell
tms_full_dict

{'0': <nttc.nttc.communitiesObject at 0x10498fdd8>,
 '1': <nttc.nttc.communitiesObject at 0x10498fe48>,
 '10': <nttc.nttc.communitiesObject at 0x1049ac860>,
 '12': <nttc.nttc.communitiesObject at 0x1049ac9e8>,
 '2': <nttc.nttc.communitiesObject at 0x10498ffd0>,
 '5': <nttc.nttc.communitiesObject at 0x10498fb38>,
 '6': <nttc.nttc.communitiesObject at 0x1049ac0b8>,
 '7': <nttc.nttc.communitiesObject at 0x1049ac240>,
 '8': <nttc.nttc.communitiesObject at 0x1049ac550>,
 '9': <nttc.nttc.communitiesObject at 0x1049ac6d8>}

## Visualize the topic model (optional)

In [8]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable pyLDAvis rendering inside the notebook, then prepare the view
# for community '12' from its model, corpus and id2word attributes
pyLDAvis.enable_notebook()
comm12 = tms_full_dict['12']
c12_vis = pyLDAvis.gensim.prepare(
    comm12.model,
    comm12.corpus,
    comm12.id2word,
)
c12_vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [9]:
# Model a single community at a time (community '1' in this example)
tm_dict_c1 = nttc.tm_maker(
    random_seed=2018,
    split_comms=split_dict_all_comms['1'],
    # Topic count changed to 3 for this example
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
    # Set single to True when sending only one community object from the dict
    single=True,
)


[nltk_data] Downloading package wordnet to /Users/chrisl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Perplexity:  -6.76696529989689

Coherence Score:  0.645373800191739

 Modeling complete.


In [10]:
# Prepare the pyLDAvis view for community '1', using 'mmds' scaling
# NOTE(review): this reads tms_full_dict['1'], not the tm_dict_c1 result
# assigned by the single-community run - confirm which model should be plotted.
pyLDAvis.enable_notebook()
comm1 = tms_full_dict['1']
c1_vis = pyLDAvis.gensim.prepare(
    comm1.model,
    comm1.corpus,
    comm1.id2word,
    mds='mmds',
)
c1_vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Topic model printouts per community

In [11]:
from pprint import pprint

# Print the topics of each community's model, one community per block
for comm_label, comm_obj in tms_full_dict.items():
    topics = comm_obj.model.print_topics()
    print('Community ', comm_label, ':\n\n', topics, '\n\n')

Community  0 :

 [(0, '0.006*"za" + 0.006*"zo" + 0.006*"ce" + 0.006*"dom" + 0.006*"er" + 0.004*"na" + 0.004*"ba" + 0.003*"le" + 0.003*"ti" + 0.003*"viikeni"'), (1, '0.009*"le" + 0.006*"mew" + 0.003*"kaasle" + 0.003*"pa" + 0.003*"cu" + 0.003*"opni" + 0.003*"nuhuip" + 0.003*"ce" + 0.003*"dus" + 0.003*"buvtip"'), (2, '0.010*"tu" + 0.008*"lo" + 0.008*"zu" + 0.005*"wu" + 0.005*"ce" + 0.005*"pa" + 0.005*"za" + 0.005*"rel" + 0.005*"wa" + 0.004*"gid"'), (3, '0.008*"lu" + 0.005*"vo" + 0.005*"hi" + 0.005*"ro" + 0.005*"ra" + 0.004*"ju" + 0.004*"zi" + 0.004*"wu" + 0.003*"da" + 0.003*"ze"'), (4, '0.009*"ha" + 0.008*"te" + 0.007*"ci" + 0.006*"hu" + 0.005*"zu" + 0.005*"ze" + 0.004*"wa" + 0.004*"ru" + 0.004*"ho" + 0.004*"su"')] 


Community  1 :

 [(0, '0.008*"ju" + 0.006*"ce" + 0.006*"va" + 0.006*"le" + 0.004*"zu" + 0.004*"zi" + 0.004*"lo" + 0.004*"hi" + 0.004*"ha" + 0.004*"pa"'), (1, '0.005*"zu" + 0.005*"ha" + 0.005*"te" + 0.005*"wu" + 0.004*"ce" + 0.004*"su" + 0.004*"cos" + 0.004*"ru" + 0.004*"dom"