## Load the Libraries

In [68]:
import pandas as pd
import datetime
import numpy as np
import cPickle as pickle
from gensim import corpora, models, matutils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict

## Load the cleaned Data

In [69]:
with open('master_total_df.p','rb') as f:
    master_total_df = pickle.load(f)
master_total_df.head(2)

Unnamed: 0,index,company,jobdesc,city,state,title,sourcesite,views,days_posted,post_start_date,link,base_title,parsed_title,parsed_title_i,expanded_title,prefix_title
0,0,Advocate Health Care,"\nAs part of Advocate Health Care, Advocate Ch...",Oak Lawn,IL,Clinical Practice Specialist - 4 Hope,ind,0,,,http://www.indeed.com/viewjob?jk=6244f7f3a4861...,specialist,"[clinical, practice, specialist, , , 4, hope]",[2],practice specialist,practice
1,1,University of Washington Medical Center,\nThe University of Washington (UW) is proud t...,Seattle,WA,WEB DEVELOPER,ind,0,,,http://www.indeed.com/viewjob?jk=3d6a9f18f5301...,developer,"[web, developer]",[1],web developer,web


## Isolate the Text 

In [70]:
alltext = master_total_df['jobdesc'].values

# sample the entire text body 
sometext = alltext[:100]

## Vectorize the text

In [71]:
vectorizer = CountVectorizer(stop_words='english')
#vectorizer = TFIDFVectorizer(stop_words='english')

X = vectorizer.fit_transform(sometext)

### Get the summary total counts

In [72]:
df_X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
summary = df_X.sum().sort_values(ascending = False)
print summary[:20]

data           478
experience     406
business       322
work           238
team           227
skills         224
ability        172
analytics      158
development    157
management     152
marketing      150
solutions      148
support        145
years          132
strong         128
technical      125
systems        122
knowledge      119
new            118
design         113
dtype: int64


## Gensim Prep - setup vocab, and make Corpus from Vectorizer

In [73]:
start = datetime.datetime.now()

corpus = matutils.Sparse2Corpus(X, documents_columns=False)

vocab = {v: k for k, v in vectorizer.vocabulary_.iteritems()}
vocab.items()[:25]

print datetime.datetime.now()-start

0:00:00.003603


## Check the vocab

In [74]:
vocab

{0: u'00',
 1: u'000',
 2: u'02138',
 3: u'10',
 4: u'100',
 5: u'1000',
 6: u'11',
 7: u'12',
 8: u'120',
 9: u'13',
 10: u'139',
 11: u'14',
 12: u'15',
 13: u'150',
 14: u'1575',
 15: u'16',
 16: u'1606420',
 17: u'17',
 18: u'170',
 19: u'18',
 20: u'1828',
 21: u'18m',
 22: u'19',
 23: u'1969',
 24: u'1976',
 25: u'19801required',
 26: u'1st',
 27: u'20',
 28: u'200',
 29: u'2008',
 30: u'201',
 31: u'2010',
 32: u'2011',
 33: u'2012',
 34: u'2013',
 35: u'2014',
 36: u'2015',
 37: u'2016',
 38: u'2017',
 39: u'2020',
 40: u'206',
 41: u'21',
 42: u'210',
 43: u'215',
 44: u'21st',
 45: u'23',
 46: u'230',
 47: u'24',
 48: u'24x7',
 49: u'25',
 50: u'250',
 51: u'26',
 52: u'28',
 53: u'29',
 54: u'2b',
 55: u'2d',
 56: u'2nd',
 57: u'30',
 58: u'300',
 59: u'30m',
 60: u'31',
 61: u'320',
 62: u'35',
 63: u'360',
 64: u'365',
 65: u'380',
 66: u'39',
 67: u'3rd',
 68: u'40',
 69: u'400',
 70: u'401',
 71: u'401k',
 72: u'409',
 73: u'41',
 74: u'45',
 75: u'47',
 76: u'50',
 77: 

## transform corpus into a reference list 

In [75]:
list_corpus = []
for x in corpus:
    list_corpus.append(x)
    
print len(list_corpus), len(list_corpus[0]), len(list_corpus[1]), len(list_corpus[2])

100 309 367 187


# Setup LDA Model for topic modeling

In [119]:
topic_ct = 3
start = datetime.datetime.now()
lda = models.LdaModel(
    corpus,
    # or use the corpus object created with the dictionary in the previous frame!
    # corpus, 
    num_topics  =  topic_ct,
    passes      =  20,
    id2word     =  vocab,
    verbose = True
    # or use the gensim dictionary object!
    # id2word     =  dictionary
)
print datetime.datetime.now()-start

TypeError: __init__() got an unexpected keyword argument 'verbose'

## Summary of Topics

In [127]:
topic_vectors = lda.print_topics(num_topics=3, num_words=10)
for topic in topic_vectors:
    print '========================================'
    print 'topic number:', topic[0]
    for y in topic[1].split('+'):
        print '\t',y

topic number: 0
	0.019*data 
	 0.013*experience 
	 0.009*business 
	 0.008*work 
	 0.007*team 
	 0.006*skills 
	 0.006*development 
	 0.005*ability 
	 0.005*analytics 
	 0.005*management
topic number: 1
	0.007*team 
	 0.007*experience 
	 0.007*product 
	 0.007*solutions 
	 0.006*skills 
	 0.005*business 
	 0.005*data 
	 0.005*marketing 
	 0.005*new 
	 0.004*sales
topic number: 2
	0.013*business 
	 0.009*experience 
	 0.007*work 
	 0.007*skills 
	 0.006*ability 
	 0.005*marketing 
	 0.005*data 
	 0.005*management 
	 0.005*team 
	 0.005*strong


## Transform  Topics into columns

In [78]:
topic_proba = []
for x in corpus:
    local = lda.get_document_topics(x)
    row = { x:float(0) for x in range(topic_ct)}
    for y in local:
        row[y[0]] = y[1]
    topic_proba.append(row)
    

topic_proba_df = pd.DataFrame(topic_proba)
topic_proba_df['sum'] = topic_proba_df.apply(np.sum,axis=1)
print topic_proba_df.shape
print topic_proba_df.head(10)

(100, 4)
          0         1         2       sum
0  0.000000  0.998348  0.000000  0.998348
1  0.998646  0.000000  0.000000  0.998646
2  0.098688  0.000000  0.900039  0.998727
3  0.994124  0.000000  0.000000  0.994124
4  0.358134  0.640354  0.000000  0.998489
5  0.995060  0.000000  0.000000  0.995060
6  0.997292  0.000000  0.000000  0.997292
7  0.148182  0.000000  0.850116  0.998297
8  0.997983  0.000000  0.000000  0.997983
9  0.996809  0.000000  0.000000  0.996809


In [66]:
# check that probabilities add up:


{0: 0.9986444788586617, 1: 0.0, 2: 0.0}

# Functionalize

In [114]:
import pandas as pd
import datetime
import numpy as np
import cPickle as pickle
from gensim import corpora, models, matutils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict

class topicModeling(object):
    margin = '\t\t'
    def verboseMsg(self, msg):
        if self.verbose:
            print self.margin, msg
        
    def __init__(self, alltext, verbose=0):
        self.sometext = alltext
        self.verbose = verbose
    
    def loadText(self,alltext):
        self.sometext = alltext
    
    def setVectorizer(self,vectorizer):
        self.vectorizer = vectorizer
    
    def setCvec(self,ngram_range):
        self.vectorizer = CountVectorizer(stop_words='english', ngram_range=ngram_range)
    
    def setTvec(self,ngram_range):
        self.vectorizer = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)
        
    def fitText(self):
        self.verboseMsg('text===>word start: fitting text of size %d documents' % (len(self.sometext)))
            
        self.X = self.vectorizer.fit_transform(self.sometext)
        
        self.verboseMsg('text===>word summarizing vocab and creating corpus ')
        
        #prepping summary
        self.df_X = pd.DataFrame(self.X.toarray(), columns=self.vectorizer.get_feature_names())
        self.wordFreq = self.df_X.sum().sort_values(ascending = False)        
    
        self.vocab = {v: k for k, v in self.vectorizer.vocabulary_.iteritems()}
        self.corpus = matutils.Sparse2Corpus(self.X, documents_columns=False)    
        
        self.verboseMsg('text===>word complete. ')
            
    def fitTopics(self,topic_ct,passes):
        self.topic_ct = topic_ct
        self.passes = passes
        
        self.verboseMsg('worp===>%d topics, %d passes: start ' %(topic_ct,passes))
        self.lda = models.LdaModel(
            self.corpus,
            num_topics  =  self.topic_ct,
            passes      =  20,
            id2word     =  self.vocab
        )
        self.verboseMsg('worp===>%d topics, %d passes: lda model complete ' %(topic_ct,passes))
        
        self.topic_vectors = self.lda.print_topics(num_topics=self.topic_ct, num_words=5)

        self.topic_proba = []
        for x in self.corpus:
            local = self.lda.get_document_topics(x)
            row = { x:float(0) for x in range(self.topic_ct)}
            for y in local:
                row[y[0]] = y[1]
            self.topic_proba.append(row)

        self.verboseMsg('worp===>%d topics, %d passes: creating probabilities in dataframe ' %(topic_ct,passes))
        
        self.topic_proba_df = pd.DataFrame(self.topic_proba)
    
        self.verboseMsg('worp===>%d topics, %d passes: complete ' %(topic_ct,passes))
    


In [118]:
with open('master_total_df.p','rb') as f:
    master_total_df = pickle.load(f)
master_total_df.head(2)
alltext = master_total_df['jobdesc'].values



tM = topicModeling(alltext[:100],verbose=1)
vec = CountVectorizer(stop_words='english', ngram_range=(1,1), min_df =2,max_features=50000)
tM.setVectorizer(vec)
tM.fitText()
tM.fitTopics(topic_ct=3,passes=20)

print '\n============ word frequencies =================='
print tM.wordFreq[:20]
print '\n============ topics found === =================='
for y in tM.topic_vectors:
    print y
    
    
tM = topicModeling(alltext[:100],verbose=1)
vec = TfidfVectorizer(stop_words='english', ngram_range=(1,1), min_df =2,max_features=50000)
tM.setVectorizer(vec)
tM.fitText()
tM.fitTopics(topic_ct=3,passes=20)

print '\n============ word frequencies =================='
print tM.wordFreq[:20]
print '\n============ topics found === =================='
for y in tM.topic_vectors:
    print y


tM = topicModeling(alltext[:1000],verbose=1)
tM.setCvec((1,1))
tM.fitText()
tM.fitTopics(topic_ct=3,passes=20)

print '\n============ word frequencies =================='
print tM.wordFreq[:20]
print '\n============ topics found === =================='
for y in tM.topic_vectors:
    print y

		text===>word start: fitting text of size 100 documents
		text===>word summarizing vocab and creating corpus 
		text===>word complete. 
		worp===>3 topics, 20 passes: start 
		worp===>3 topics, 20 passes: lda model complete 
		worp===>3 topics, 20 passes: creating probabilities in dataframe 
		worp===>3 topics, 20 passes: complete 

data           478
experience     406
business       322
work           238
team           227
skills         224
ability        172
analytics      158
development    157
management     152
marketing      150
solutions      148
support        145
years          132
strong         128
technical      125
systems        122
knowledge      119
new            118
design         113
dtype: int64

(0, u'0.016*experience + 0.016*data + 0.012*business + 0.010*team + 0.008*software')
(1, u'0.013*business + 0.013*marketing + 0.011*experience + 0.009*work + 0.009*data')
(2, u'0.027*data + 0.013*experience + 0.007*skills + 0.006*systems + 0.006*solutions')
		text===>wo