In [7]:
import pandas as pd
#import modules and test files

train_set = pd.read_csv("./tweets_data.csv")
train_set.head()
train_set = train_set[:12000]

In [8]:
import pandas as pd
import spacy
import re
import nltk

from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from nltk.corpus import stopwords

In [9]:
# Helper function    
def lemmatize(word_list, ptags):
    '''Lemmatizes words based on allowed postags, input format is list of sublists 
       with strings'''
    spC = spacy.load('en_core_web_sm')
    lem_lists =[]
    for vec in word_list:
        sentence = spC(" ".join(vec))
        lem_lists.append([token.lemma_ for token in sentence if token.pos_ in ptags])
    
    return lem_lists

In [10]:
#txt requirements
# pandas
# numpy
# spacy>=2.2.4
# nltk>=3.4.5
# gensim>=3.8.3
# plotnine>=0.6.0
# tomotopy>=0.7.1
# wordcloud>=1.7.0

import numpy as np 
import spacy
import nltk as nltk
import gensim
import plotnine
import tomotopy
import re
# import wordcloud
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)
st_words = stopwords.words('english')
extra_stops=['from','subject','re', 'edu','use']
st_words.extend(extra_stops)

In [11]:
doc_list = train_set.text.values.tolist()
word_list = [simple_preprocess(txt, deacc=True, min_len=3) for txt in doc_list]
bigram = Phrases(word_list, min_count=5, threshold=100) # use original wordlist to build model
bigram_model = Phraser(bigram)
word_list_nostops = [[word for word in txt if word not in st_words] for txt in word_list]
word_bigrams = [bigram_model[w_vec] for w_vec in word_list_nostops]
word_list_lemmatized = lemmatize(word_bigrams, ptags=['NOUN','VERB','ADV','ADJ'])
word_list_lemmatized[0][:7]


['upset', 'update', 'facebook', 'texting', 'cry', 'result', 'school']

In [None]:
len(word_list_lemmatized[0])

15

In [None]:
len(word_list_lemmatized)

654

In [6]:
import tomotopy as tp
term_weight = tp.TermWeight.ONE
hdp = tp.HDPModel(tw=term_weight, min_cf=5, rm_top=7, gamma=1,
                  alpha=0.1, initial_k=10, seed=99999)
                  

In [11]:
# for vec in word_list_lemmatized:
#     hdp.add_doc(vec)

# # Initiate sampling burn-in  (i.e. discard N first iterations)
# hdp.burn_in = 100
# hdp.train(0)
# print('Num docs:', len(hdp.docs), ', Vocab size:', hdp.num_vocabs,
#       ', Num words:', hdp.num_words)
# print('Removed top words:', hdp.removed_top_words)

# # Train model
# for i in range(0, 1000, 100):
#     hdp.train(100) # 100 iterations at a time
#     print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, hdp.ll_per_word, hdp.live_k))

In [21]:
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import sys

def train_HDPmodel(hdp, word_list, mcmc_iter, burn_in=100, quiet=False):
    '''Wrapper function to train tomotopy HDP Model object
    
    *** Inputs**
    hdp: obj -> initialized HDPModel model
    word_list: list -> lemmatized word list of lists
    mcmc_iter : int -> number of iterations to train the model
    burn_in: int -> MC burn in iterations
    quiet: bool -> flag whether to print iteration LL and Topics, if True nothing prints out
    
    ** Returns**
    hdp: trained HDP Model 
    '''
    
    # Add docs to train
    for vec in word_list:
        print(vec)
        hdp.add_doc(vec)

    # Initiate MCMC burn-in 
    hdp.burn_in = 100
    hdp.train(0)
    print('Num docs:', len(hdp.docs), ', Vocab size:', hdp.num_vocabs, ', Num words:', hdp.num_words)
    print('Removed top words:', hdp.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    # Train model
    step=round(mcmc_iter*0.10)
    for i in range(0, mcmc_iter, step):
        hdp.train(step, workers=3)
        if not quiet:
            print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, hdp.ll_per_word, hdp.live_k))
        
    print("Done\n")  
    
    return hdp
    
        
def get_hdp_topics(hdp, top_n=10):
    '''Wrapper function to extract topics from trained tomotopy HDP model 
    
    ** Inputs **
    hdp:obj -> HDPModel trained model
    top_n: int -> top n words in topic based on frequencies
    
    ** Returns **
    topics: dict -> per topic, an arrays with top words and associated frequencies 
    '''
    
    # Get most important topics by # of times they were assigned (i.e. counts)
    sorted_topics = [k for k, v in sorted(enumerate(hdp.get_count_by_topics()), key=lambda x:x[1], reverse=True)]

    topics=dict()
    
    # For topics found, extract only those that are still assigned
    for k in sorted_topics:
        if not hdp.is_live_topic(k): continue # remove un-assigned topics at the end (i.e. not alive)
        topic_wp =[]
        for word, prob in hdp.get_topic_words(k, top_n=top_n):
            topic_wp.append((word, prob))

        topics[k] = topic_wp # store topic word/frequency array
        
    return topics

In [22]:
tw_list = [tp.TermWeight.ONE, # all terms weighted equally
           tp.TermWeight.PMI, # Pointwise Mutual Information term weighting
           tp.TermWeight.IDF] # down-weights high frequency terms, upweights low freq ones

tw_names = ['one', 'pmi', 'idf']
model_topics =[]

for i, term_weight in enumerate(tw_list):
    hdp = tp.HDPModel(tw=term_weight, min_cf=5, rm_top=7, gamma=1, alpha=0.1,
                     initial_k=10, seed=99999)
    
    print("Model " + tw_names[i] )
    hdp = train_HDPmodel(hdp, word_list_lemmatized, mcmc_iter=1000)
    hdp.save(''.join(['models/hdp_model_',tw_names[i],".bin"]))
    
    model_topics.append(get_hdp_topics(hdp, top_n=10))
    

Model one
['lean', 'empty', 'guard', 'desk', 'hear', 'footstep', 'straighten', 'shove', 'hand', 'uniform', 'figure', 'emerge', 'door', 'lead', 'basement']
['nightstand', 'bed', 'analogue', 'phone', 'brochure', 'take', 'front', 'desk', 'advertise', 'free', 'steak', 'dinner', 'lounge']
['happy', 'beam', 'hardwood', 'car', 'job', 'girlfriend', 'life', 'good', 'get', 'well', 'imminent', 'release', 'societal', 'obligation', 'inconvenience', 'full', 'time', 'job', 'none', 'benefit']
['cave', 'deep', 'green', 'water', 'search', 'xibalba', 'underworld', 'alone', 'save', 'snake', 'cave', 'cricket', 'put', 'miner', 'lamp', 'begin', 'explore', 'stumble', 'swallow', 'large', 'gulp', 'water', 'spit', 'muddy', 'gritty', 'aftertaste', 'recover', 'ease', 'pair', 'flipper', 'move', 'far', 'cave', 'meander', 'tunnel']
['sophie', 'forget', 'completely', 'return', 'eventually', 'enough', 'surprise', 'easy', 'forget', 'body', 'ignore', 'wound', 'precise', 'moment', 'helper', 'mind', 'drape', 'sheet', 'pain

RuntimeError: Either `words` or `rawWords` must be filled.