In [30]:

import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer
import logging
import warnings

# Keep notebook output quiet: only log records at ERROR level or above are
# emitted, rendered as "<timestamp> : <level> : <message>".
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Silence DeprecationWarning messages raised from this point onward.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [31]:
# Load the corpus and keep only the rows without missing values.
# dropna() returns a new frame, so the cell can be re-run safely; the
# surviving rows keep their original index labels.
documents = pd.read_csv('corpus.csv').dropna()

In [32]:
# Bag-of-words features with CountVectorizer:
#   * a token is a run of three or more word characters,
#   * English stop words are removed,
#   * a token is kept only if it occurs in at least 20 documents
#     and in no more than 20% of all documents.
vect = CountVectorizer(
    token_pattern='(?u)\\b\\w\\w\\w+\\b',
    stop_words='english',
    min_df=20,
    max_df=0.2,
)

# Learn the vocabulary and build the document-term matrix in one step.
X = vect.fit_transform(documents.text)

# Wrap the sparse matrix as a gensim corpus; documents_columns=False tells
# gensim that documents are the rows of X rather than its columns.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Invert the vectorizer vocabulary (word -> id) into an id -> word mapping,
# which is what the LDA model's id2word parameter expects.
id_map = {word_id: word for word, word_id in vect.vocabulary_.items()}

# Estimate a 25-topic LDA model on the corpus with gensim's multicore
# implementation (4 worker processes, 2 passes over the corpus, fixed
# random_state) and keep it in `ldamodel`.
ldamodel = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id_map,
    num_topics=25,
    passes=2,
    workers=4,
    random_state=5,
)

In [33]:
# Show every topic (num_topics=-1 requests all of them) together with its
# weighted top words, followed by blank lines as a visual separator.
for topic_id, topic_terms in ldamodel.print_topics(num_topics=-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")


Topic: 0 
Words: 0.107*"business" + 0.062*"center" + 0.043*"code" + 0.039*"international" + 0.036*"fibo" + 0.032*"fpml" + 0.029*"ontology" + 0.017*"value" + 0.014*"fbc" + 0.012*"economic"


Topic: 1 
Words: 0.046*"government" + 0.029*"republic" + 0.019*"jurisdiction" + 0.019*"court" + 0.018*"time" + 0.014*"ontology" + 0.012*"shares" + 0.012*"courts" + 0.012*"concept" + 0.011*"constitutional"


Topic: 2 
Words: 0.039*"banks" + 0.038*"rate" + 0.037*"reference" + 0.037*"swap" + 0.027*"trust" + 0.024*"street" + 0.023*"annual" + 0.023*"state" + 0.022*"terms" + 0.016*"specific"


Topic: 3 
Words: 0.077*"share" + 0.040*"issuer" + 0.038*"registered" + 0.032*"preferred" + 0.028*"income" + 0.028*"provides" + 0.027*"confers" + 0.024*"option" + 0.023*"periodic" + 0.022*"common"


Topic: 4 
Words: 0.100*"exchange" + 0.058*"stock" + 0.052*"markets" + 0.029*"state" + 0.028*"europe" + 0.023*"unit" + 0.021*"cboe" + 0.016*"sovereign" + 0.016*"market" + 0.016*"equities"


Topic: 5 
Words: 0.031*"format" 

In [34]:
# Sample document: the `text` value of the row whose index label is 2.
# This is a label-based lookup; the frame is not re-indexed after dropna,
# so label 2 is not necessarily the third remaining row.
my_document = documents.loc[2, 'text']
my_document


'a municipal security is typically a bond, note, warrant, certificate or other similar obligation issued by a state or local government or their agencies or authorities (such as cities, towns, villages, counties or special districts or authorities).  a prime feature of most municipal securities is that interest or other investment earnings on them are generally excluded from gross income of the bondholder for federal income tax purposes. some municipal securities are subject to federal income tax, although the issuers or bondholders may receive other federal tax advantages for certain types of taxable municipal securities. some examples include build america bonds, municipal fund securities and direct pay subsidy bonds.'

In [35]:
def topic_distribution(string_input):
    """Infer the LDA topic mixture of a single raw-text document.

    Parameters
    ----------
    string_input : str
        Raw text of one document.

    Returns
    -------
    The model output for that document: the first (and only) entry that
    ``ldamodel`` yields for the one-document corpus, i.e. a list of
    ``(topic_id, probability)`` pairs.
    """
    # The already-fitted vectorizer works on a collection of documents, so the
    # single text is wrapped in a list. Only transform() is used here: the
    # vocabulary learned earlier is reused, nothing is re-fitted.
    doc_term_matrix = vect.transform([string_input])

    # Same sparse-matrix -> gensim-corpus conversion as for the training data
    # (documents are rows, hence documents_columns=False).
    bow_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)

    # Applying the model to a corpus yields one result per document; there is
    # exactly one document, so its result is the first element.
    per_document_topics = list(ldamodel[bow_corpus])
    return per_document_topics[0]



In [36]:
def topic_prediction(my_document):
    """Return the id of the most probable LDA topic for one document.

    Parameters
    ----------
    my_document : str
        Raw text of one document.

    Returns
    -------
    The topic id with the highest probability in the document's topic
    distribution. On ties, the topic listed first in the distribution wins.

    Raises
    ------
    IndexError
        If the model returns no topics at all for the document.
    """
    # Reuse the shared inference helper instead of repeating the
    # vectorize -> corpus -> model pipeline here.
    distribution = topic_distribution(my_document)

    # Keep the exception type that indexing an empty result used to raise,
    # but with a message that explains what went wrong.
    if not distribution:
        raise IndexError("the LDA model returned no topics for this document")

    # max() needs a single pass and returns the first maximal pair, which is
    # the same pair a stable descending sort would put first.
    best_topic_id, _best_probability = max(distribution, key=lambda pair: pair[1])
    return best_topic_id

In [37]:
# Id of the most probable topic for the sample document.
topic_prediction(my_document)



19

In [38]:

# Topic mixture of the same sample document, as (topic_id, probability) pairs.
topic_distribution(my_document)

[(3, 0.08708604), (19, 0.7585633), (24, 0.13805251)]