In [10]:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess 
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['aa', 'aaa','nmsa', 'article', 'section', 'section', 'shall', 'state', 
             'page', 'county', 'act', 'dollars', 'new', 
             'public', 'thousand', 'may', 'nmsa', 'services','means', 'hundred', 'chapter', 'plan', 'amended',
             'person', 'mexico', 'one', 'tax', 'laws', 'program', 'board', 'provided', 'general', 'subsection', 
             'year', 'b', 'including', 'design', 'funds', 'service', 'read', 'construct', 'federal', 'provisions',
             'sb', 'purchase', 'within', 'provide', 'sb', 'c', 'hb', 'percent', 'years', 'amount', 'two', 'law', 
            'date', 'upon', 'used', 'made', 'required', 'following', 'five', 'days', 'aamodt', 'ab', 'abaft', 'two',
               'three', 'fifty','fourth', 'included', 'svc', 'sfcsb', 'pursuant', 'department', 'act', 'facility', 'relating',
              'enabling', 'articles', 'enacted', 'legislature', 'district', 'allowing', 'enacting', 'amending', 'requiring', 
               'definitions', 'providing'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [5]:
# Import Dataset
df = pd.read_csv("SummaryText.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,file_name,BillText,BillSummary
0,0,Yr_13RHB0001.txt,An Act Relating To The Legislative Branch Of G...,An Act Relating To The Legislative Branch Of G...
1,1,Yr_13RHB0002.txt,An Act Making General Appropriations And...,An Act Making General Appropriations And...


In [8]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        #sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)  
# Convert to list
data = df.BillText.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

['An Act Relating To The Legislative Branch Of Government; Appropriating Funds For The Expense Of The Fifty-First Legislature, First Session, 2013, And For Other Legislative Expenses, Including The Legislative Council Service, The Legislative Finance Committee, The Legislative Education Study Committee, The Senate Rules Committee, The House Chief ClerkS Office And The Senate Chief ClerkS Office; Declaring An Emergency. : Section 1. Session Expenses.-- A. There Is Appropriated From The General Fund For The Expense Of The Legislative Department Of The State Of New Mexico For The First Session Of The Fifty-First Legislature For Per Diem And Mileage Of Its Members, For Salaries Of Employees And For Other Expenses Of The Legislature, Eight Million Six Hundred Thirty-Nine Thousand Seven Hundred Dollars ($8,639,700) Or So Much Thereof As May Be Necessary For Such Purposes. This Section Are As Follows: B. The Expenditures Referred To In Subsection A Of (1) Per Diem For Senators . . . $388,080;

In [34]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=500) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=500)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# !python3 -m spacy download en  # run in terminal once
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words and len(word)>3 and word.isalpha()] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words and word] for doc in texts_out]    
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!c

In [35]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=50, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=50,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           )

pprint(lda_model.print_topics())

[(28,
  '0.043*"center" + 0.041*"expenditure" + 0.039*"time" + 0.038*"extend" + '
  '0.037*"fiscal" + 0.030*"bond" + 0.027*"appropriation" + 0.026*"senior" + '
  '0.026*"purpose" + 0.026*"upgrade"'),
 (30,
  '0.160*"member" + 0.063*"retirement" + 0.052*"contribution" + 0.050*"credit" '
  '+ 0.045*"salary" + 0.038*"coverage" + 0.030*"retire" + 0.030*"fire" + '
  '0.028*"officer" + 0.024*"police"'),
 (14,
  '0.162*"health" + 0.114*"care" + 0.050*"provider" + 0.048*"medical" + '
  '0.024*"individual" + 0.017*"treatment" + 0.016*"patient" + 0.015*"therapy" '
  '+ 0.014*"physician" + 0.010*"disease"'),
 (42,
  '0.091*"student" + 0.049*"school" + 0.048*"education" + 0.029*"high" + '
  '0.021*"grade" + 0.021*"graduation" + 0.021*"course" + 0.020*"career" + '
  '0.017*"requirement" + 0.017*"academic"'),
 (43,
  '0.114*"information" + 0.055*"electronic" + 0.039*"communication" + '
  '0.035*"address" + 0.031*"identification" + 0.030*"system" + 0.022*"number" '
  '+ 0.022*"access" + 0.021*"govern

In [36]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,37.0,0.695,"emergency, legislative, treasurer, peace, expe...","[legislative, branch, government, appropriate,..."
1,1,15.0,0.4665,"appropriation, agency, target, total, purpose,...","[make, appropriation, authorize, expenditure, ..."
2,2,39.0,0.4219,"fund, fiscal, appropriation, balance, transfer...","[make, appropriation, drinking, water, system,..."
3,3,11.0,0.5332,"body, voting, connection, record, meeting, mem...","[open, meeting, agenda, available, least, hour..."
4,4,14.0,0.6029,"health, care, provider, medical, individual, t...","[health, care, coverage, health, care, purchas..."
5,5,20.0,0.9688,"division, land, grant, survey, merce, liquid, ...","[land, grant, include, tecolote, land, grant, ..."
6,6,20.0,0.5084,"division, land, grant, survey, merce, liquid, ...","[land, grant, land, grant, private, assist, co..."
7,7,8.0,0.4207,"bond, issue, authority, revenue, interest, aut...","[taxation, taxation, revenue, conduct, delinqu..."
8,8,48.0,0.4669,"business, corporation, trust, security, intere...","[corporation, voluntary, designation, benefit,..."
9,9,22.0,0.3593,"insurance, benefit, superintendent, coverage, ...","[insurance, implement, constitutional, mandate..."


In [20]:
!pip install wordcloud --user

Collecting wordcloud
  Downloading wordcloud-1.8.1-cp38-cp38-win_amd64.whl (155 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.1




In [58]:
df_dominant_topic.to_csv('DoinantTopic.csv')

  and should_run_async(code)
