# R3 - Topic Modeling

# Sample Approach

In [None]:
# Full-corpus run: vectorize the text once, then fit LDA at 5, 10 and 25
# topics, saving the artifacts of every fit under its own tag.
section_header = '\n============= Count X 22,000'

print(section_header)
tM = topicModeling(alltext, verbose=1)
vec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=2,
    max_features=50000,
)
tM.setVectorizer(vec)
tM.fitText()

# First fit: only the topics are reported (no word-frequency table).
tM.fitTopics(topic_ct=5, passes=25)
tM.printTopics()
tM.save('ALL22k05')

# Later fits reuse the vectorized corpus already held by tM; only the number
# of topics and the save tag change between them.
for topic_total, save_tag in ((10, 'ALL22k10'), (25, 'ALL22k25')):
    print(section_header)
    tM.fitTopics(topic_ct=topic_total, passes=25)

    print('\n============ word frequencies ==================')
    print(tM.wordFreq[:20])
    print('\n============ topics found === ==================')
    tM.printTopics()
    tM.save(save_tag)

# Explanation of some of the functions

### Fit Text .fitText()

Fit text accomplishes 4 main tasks:
1. Turns the complete job posting into words/counts with Count Vectorizer
2. Creates the corpus for LDA modeling
3. Creates the vocab for LDA modeling
4. Also stores the array for future usage in a Dataframe format

```
def fitText(self):
    """Vectorize ``self.sometext`` and build everything the LDA step needs.

    Sets ``self.X`` (the vectorizer's output matrix), ``self.df_X`` (the same
    matrix as a dense DataFrame with one column per term), ``self.wordFreq``
    (column totals, largest first), ``self.vocab`` (id -> term) and
    ``self.corpus`` (gensim corpus built from ``self.X``).  The elapsed time
    is printed at the end.
    """
    started = datetime.datetime.now()
    doc_count = len(self.sometext)
    self.verboseMsg('text===>word start: fitting text of size %d documents' % doc_count)

    self.X = self.vectorizer.fit_transform(self.sometext)

    self.verboseMsg('text===>word summarizing vocab and creating corpus ')

    # Dense copy of the matrix, labelled by term, kept for later use.
    dense_counts = self.X.toarray()
    term_labels = self.vectorizer.get_feature_names()
    self.df_X = pd.DataFrame(dense_counts, columns=term_labels)

    # Summing each column gives the corpus-wide weight of every term.
    totals = self.df_X.sum()
    self.wordFreq = totals.sort_values(ascending=False)

    # vocabulary_ maps term -> column id; invert it to id -> term.
    self.vocab = dict((col_id, term) for term, col_id in self.vectorizer.vocabulary_.items())
    self.corpus = matutils.Sparse2Corpus(self.X, documents_columns=False)

    self.verboseMsg('text===>word complete. ')
    elapsed = datetime.datetime.now() - started
    print(elapsed)
```

### Fit Topics .fitTopics()

Fit topics runs the multi-core version of LDA (gensim's `LdaMulticore`) on AWS; its `workers` argument (set to 4 in the class below) is similar to the `n_jobs` notation.

# Appendix A: Topic Modeling Runtime Class (to keep everything together)

```
import pandas as pd
import datetime
import numpy as np
import time
import cPickle as pickle
from gensim import corpora, models, matutils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


class topicModeling(object):
    """Keep the text -> term matrix -> LDA topics workflow in one object.

    Expected call order: construct, choose a vectorizer (``setVectorizer``,
    ``setCvec`` or ``setTvec``), ``fitText``, ``fitTopics``, then
    ``printTopics`` and/or ``save``.  ``fitTopics`` can be called again with
    other settings without re-running ``fitText``.
    """

    # Prefix written in front of every verbose progress message.
    margin = '\t\t'

    def verboseMsg(self, msg):
        """Print ``msg`` prefixed with ``margin`` when ``self.verbose`` is truthy."""
        if self.verbose:
            # One pre-formatted argument so the line is the same whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print('%s%s' % (self.margin, msg))

    def __init__(self, alltext, verbose=0):
        """Store the documents to model and the verbosity flag.

        Args:
            alltext: sized collection of documents handed to the vectorizer.
            verbose: truthy to print progress messages.
        """
        self.sometext = alltext
        self.verbose = verbose

    def _require(self, attr, hint):
        """Raise a descriptive AttributeError when ``attr`` is not set yet."""
        if getattr(self, attr, None) is None:
            raise AttributeError(
                'topicModeling.%s is not available: %s' % (attr, hint))

    def loadText(self, alltext):
        """Replace the stored documents; ``fitText`` must be re-run afterwards."""
        self.sometext = alltext

    def setVectorizer(self, vectorizer):
        """Use a caller-configured vectorizer for ``fitText``."""
        self.vectorizer = vectorizer

    def setCvec(self, ngram_range):
        """Use a CountVectorizer with English stop words and ``ngram_range``."""
        self.vectorizer = CountVectorizer(stop_words='english', ngram_range=ngram_range)

    def setTvec(self, ngram_range):
        """Use a TfidfVectorizer with English stop words and ``ngram_range``."""
        self.vectorizer = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)

    def fitText(self):
        """Vectorize the stored text and prepare the inputs for ``fitTopics``.

        Sets ``X`` (vectorizer output), ``df_X`` (dense DataFrame of ``X`` with
        one column per term), ``wordFreq`` (column totals, largest first),
        ``vocab`` (id -> term) and ``corpus`` (gensim corpus over ``X``).
        Prints the elapsed time.

        Raises:
            AttributeError: if no vectorizer has been set.
        """
        self._require('vectorizer',
                      'call setVectorizer(), setCvec() or setTvec() before fitText()')
        start = datetime.datetime.now()
        self.verboseMsg('text===>word start: fitting text of size %d documents' % (len(self.sometext)))

        self.X = self.vectorizer.fit_transform(self.sometext)

        self.verboseMsg('text===>word summarizing vocab and creating corpus ')

        # Newer scikit-learn releases replaced get_feature_names() with
        # get_feature_names_out(); use whichever this vectorizer offers.
        if hasattr(self.vectorizer, 'get_feature_names_out'):
            feature_names = self.vectorizer.get_feature_names_out()
        else:
            feature_names = self.vectorizer.get_feature_names()

        # NOTE: toarray() makes a dense copy of the whole matrix; it is kept
        # because save() pickles df_X.
        self.df_X = pd.DataFrame(self.X.toarray(), columns=feature_names)
        self.wordFreq = self.df_X.sum().sort_values(ascending=False)

        # vocabulary_ maps term -> column id; the LDA model is given the
        # inverse (id -> term) as id2word.
        self.vocab = {idx: term for term, idx in self.vectorizer.vocabulary_.items()}
        self.corpus = matutils.Sparse2Corpus(self.X, documents_columns=False)

        self.verboseMsg('text===>word complete. ')
        print(datetime.datetime.now() - start)

    def fitTopics(self, topic_ct, passes, workers=4, iterations=2500,
                  eval_every=100, chunksize=2000):
        """Fit an LdaMulticore model and tabulate per-document topic weights.

        Sets ``lda``, ``topic_vectors`` (``print_topics`` output, 8 words per
        topic), ``topic_proba`` (one dict per document mapping every topic id
        to its weight, 0.0 when the model did not report the topic) and
        ``topic_proba_df`` (the same data as a DataFrame).  Prints the elapsed
        time.

        Args:
            topic_ct: number of topics to fit.
            passes: value passed to the model as ``passes``.
            workers, iterations, eval_every, chunksize: forwarded unchanged to
                ``models.LdaMulticore``; the defaults are the values this
                class previously hard-coded.

        Raises:
            AttributeError: if ``fitText`` has not been run.
        """
        self._require('corpus', 'call fitText() before fitTopics()')
        start = datetime.datetime.now()
        self.topic_ct = topic_ct
        self.passes = passes

        self.verboseMsg('worp===>%d topics, %d passes: start ' % (topic_ct, passes))
        self.lda = models.LdaMulticore(
            self.corpus,
            num_topics=self.topic_ct,
            passes=passes,
            id2word=self.vocab,
            workers=workers,
            iterations=iterations,
            eval_every=eval_every,
            chunksize=chunksize
        )
        self.verboseMsg('worp===>%d topics, %d passes: lda model complete ' % (topic_ct, passes))

        self.topic_vectors = self.lda.print_topics(num_topics=self.topic_ct, num_words=8)

        self.topic_proba = []
        for doc in self.corpus:
            # Start every topic at 0.0 so each row has the same keys, then
            # overwrite the topics the model reports for this document.
            row = dict.fromkeys(range(self.topic_ct), float(0))
            for topic_id, weight in self.lda.get_document_topics(doc):
                row[topic_id] = weight
            self.topic_proba.append(row)

        self.verboseMsg('worp===>%d topics, %d passes: creating probabilities in dataframe ' % (topic_ct, passes))

        self.topic_proba_df = pd.DataFrame(self.topic_proba)

        self.verboseMsg('worp===>%d topics, %d passes: complete ' % (topic_ct, passes))
        print(datetime.datetime.now() - start)

    def printTopics(self):
        """Print each fitted topic: a rule, its number, then one term per line.

        Raises:
            AttributeError: if ``fitTopics`` has not been run.
        """
        self._require('topic_vectors', 'call fitTopics() before printTopics()')
        for topic in self.topic_vectors:
            print('========================================')
            print('topic number: %s' % (topic[0],))
            # Terms are separated by ' + '.  Splitting on the full separator
            # (not a bare '+') keeps terms that contain '+' intact, and strip()
            # removes any padding around a term.
            for term in topic[1].split(' + '):
                print('\t%s' % term.strip())

    def save(self, prefix='_'):
        """Pickle the topic weights, topic descriptions and dense term table.

        Writes three files in the current directory named
        ``z003_<prefix>_<label>_<suffix>.p`` where ``suffix`` is the last six
        digits of the current epoch time in seconds.

        Raises:
            AttributeError: if ``fitText``/``fitTopics`` have not been run.
        """
        # Check everything up front so a missing piece cannot leave a partial
        # set of files behind.
        self._require('topic_proba_df', 'call fitTopics() before save()')
        self._require('topic_vectors', 'call fitTopics() before save()')
        self._require('df_X', 'call fitText() before save()')

        suffix = str(int(time.mktime(datetime.datetime.now().timetuple())))[-6:]
        artifacts = (
            ('topic_proba_df_dict', self.topic_proba_df),
            ('topic_vectors', self.topic_vectors),
            ('X', self.df_X),
        )
        for label, obj in artifacts:
            filename = 'z003_%s_%s_%s.p' % (prefix, label, suffix)
            with open(filename, 'wb') as f:
                pickle.dump(obj, f)
```

# Appendix B: AWS Run Code 

```
import pandas as pd
import datetime
import numpy as np
import cPickle as pickle
from gensim import corpora, models, matutils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from topicModelingClass import topicModeling


#=========================================================================================================
#=========================================================================================================
#=========================================================================================================
# main run time

# Load the pickled master DataFrame and take the raw job-description text.
# NOTE(review): pickle.load can execute code embedded in the file; only point
# this at a pickle produced by this project.
with open('../jNotebooks/master_total_df.p', 'rb') as f:
    master_total_df = pickle.load(f)
alltext = master_total_df['jobdesc'].values


def _print_summary(model):
    """Print the 20 heaviest terms and then the fitted topics of ``model``."""
    print('\n============ word frequencies ==================')
    print(model.wordFreq[:20])
    print('\n============ topics found === ==================')
    model.printTopics()


# --- Smoke test on the first 100 documents: raw counts -----------------------
print('\n============= Count Vec')
tM = topicModeling(alltext[:100], verbose=1)
vec = CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=2, max_features=50000)
tM.setVectorizer(vec)
tM.fitText()
tM.fitTopics(topic_ct=3, passes=20)
_print_summary(tM)


# --- Smoke test on the first 100 documents: tf-idf weights -------------------
print('\n============= Tfidf')
tM = topicModeling(alltext[:100], verbose=1)
vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=2, max_features=50000)
tM.setVectorizer(vec)
tM.fitText()
tM.fitTopics(topic_ct=3, passes=3)
_print_summary(tM)


# --- Full corpus, unigrams: vectorize once, fit 5 / 10 / 25 topics -----------
print('\n============= Count X 22,000')
tM = topicModeling(alltext, verbose=1)
vec = CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=2, max_features=50000)
tM.setVectorizer(vec)
tM.fitText()
tM.fitTopics(topic_ct=5, passes=25)
tM.printTopics()
tM.save('ALL22k05')

# These fits reuse the full-corpus model object above, so the header names the
# full corpus (the earlier "10,000" label did not match the data being fitted).
for topic_total, save_tag in ((10, 'ALL22k10'), (25, 'ALL22k25')):
    print('\n============= Count X 22,000')
    tM.fitTopics(topic_ct=topic_total, passes=25)
    _print_summary(tM)
    tM.save(save_tag)


# --- Full corpus, bigrams only -----------------------------------------------
print('\n============= Count X 22,000 ngram 2')
tM = topicModeling(alltext, verbose=1)
vec = CountVectorizer(stop_words='english', ngram_range=(2, 2), min_df=2, max_features=50000)
tM.setVectorizer(vec)
tM.fitText()
tM.fitTopics(topic_ct=10, passes=25)
_print_summary(tM)
# NOTE(review): the tag ends in "25" although this run fits 10 topics; it is
# left unchanged so previously saved files keep matching - confirm before renaming.
tM.save('ALL22k2G25')
```

# Appendix C: AWS Console Response


```
============= Count Vec
		text===>word start: fitting text of size 100 documents
		text===>word summarizing vocab and creating corpus
		text===>word complete.
0:00:00.056617
		worp===>3 topics, 20 passes: start
		worp===>3 topics, 20 passes: lda model complete
		worp===>3 topics, 20 passes: creating probabilities in dataframe
		worp===>3 topics, 20 passes: complete
0:00:28.173462

============ word frequencies ==================
data           478
experience     406
business       322
work           238
team           227
skills         224
ability        172
analytics      158
development    157
management     152
marketing      150
solutions      148
support        145
years          132
strong         128
technical      125
systems        122
knowledge      119
new            118
design         113
dtype: int64

============ topics found === ==================
========================================
topic number: 0
	0.015*experience
	 0.009*business
	 0.009*team
	 0.009*marketing
	 0.008*development
	 0.007*skills
	 0.006*solutions
	 0.006*work
========================================
topic number: 1
	0.020*data
	 0.014*business
	 0.013*experience
	 0.010*work
	 0.009*skills
	 0.008*management
	 0.007*ability
	 0.007*project
========================================
topic number: 2
	0.025*data
	 0.011*experience
	 0.008*business
	 0.008*team
	 0.007*analytics
	 0.007*work
	 0.007*product
	 0.007*skills

============= Tfidf
		text===>word start: fitting text of size 100 documents
		text===>word summarizing vocab and creating corpus
		text===>word complete.
0:00:00.061992
		worp===>3 topics, 3 passes: start
		worp===>3 topics, 3 passes: lda model complete
		worp===>3 topics, 3 passes: creating probabilities in dataframe
		worp===>3 topics, 3 passes: complete
0:00:03.488978

============ word frequencies ==================
data           9.255721
experience     6.865504
business       6.360939
marketing      5.236011
work           4.318256
skills         4.071426
team           3.899816
ability        3.454123
development    3.273708
analytics      3.246132
management     3.202498
product        3.198152
systems        3.158345
solutions      3.124353
support        3.093322
software       3.025688
client         3.024107
strong         2.862458
design         2.831874
years          2.788880
dtype: float64

============ topics found === ==================
========================================
topic number: 0
	0.002*marketing
	 0.002*data
	 0.002*business
	 0.002*systems
	 0.001*experience
	 0.001*solutions
	 0.001*team
	 0.001*skills
========================================
topic number: 1
	0.004*data
	 0.003*experience
	 0.003*business
	 0.002*marketing
	 0.002*product
	 0.002*skills
	 0.002*work
	 0.002*team
========================================
topic number: 2
	0.002*data
	 0.002*experience
	 0.002*business
	 0.002*project
	 0.002*work
	 0.001*job
	 0.001*content
	 0.001*required

============= Count X 22,000
		text===>word start: fitting text of size 22707 documents
		text===>word summarizing vocab and creating corpus
		text===>word complete.
0:00:18.749236
		worp===>5 topics, 25 passes: start
		worp===>5 topics, 25 passes: lda model complete
		worp===>5 topics, 25 passes: creating probabilities in dataframe
		worp===>5 topics, 25 passes: complete
0:25:45.134110
========================================
topic number: 0
	0.022*business
	 0.013*experience
	 0.012*management
	 0.010*requirements
	 0.009*skills
	 0.008*project
	 0.008*work
	 0.007*ability
========================================
topic number: 1
	0.023*experience
	 0.014*software
	 0.014*data
	 0.011*learning
	 0.010*systems
	 0.010*development
	 0.010*machine
	 0.008*engineering
========================================
topic number: 2
	0.057*data
	 0.019*business
	 0.016*experience
	 0.012*analytics
	 0.010*analysis
	 0.009*skills
	 0.007*ability
	 0.006*work
========================================
topic number: 3
	0.010*status
	 0.009*employment
	 0.009*work
	 0.007*opportunity
	 0.007*company
	 0.006*disability
	 0.006*information
	 0.006*equal
========================================
topic number: 4
	0.015*experience
	 0.012*team
	 0.011*data
	 0.010*work
	 0.009*product
	 0.009*marketing
	 0.006*new
	 0.006*digital
```