# COLAB PACKAGE SETUP

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
!pip install --upgrade gensim
import os       #importing os to set environment variable
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip
!pip install pyLDAvis==3.2.2

openjdk version "11.0.13" 2021-10-19
OpenJDK Runtime Environment (build 11.0.13+8-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.13+8-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
--2022-01-28 00:26:25--  http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
Resolving mallet.cs.umass.edu (mallet.cs.umass.edu)... 128.119.246.70
Connecting to mallet.cs.umass.edu (mallet.cs.umass.edu)|128.119.246.70|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://mallet.cs.umass.edu/dist/mallet-2.0.8.zip [following]
--2022-01-28 00:26:25--  https://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
Connecting to mallet.cs.umass.edu (mallet.cs.umass.edu)|128.119.246.70|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16184794 (15M) [application/zip]
Saving to: ‘mallet-2.0.8.zip.6’


2022-01-28 00:26:27 (9.97 MB/s) - ‘mallet-2.0.8.zip.6’ saved [16184794/16184794]

Archive:  mallet-2.0.8.zip
replace mallet-2.0.8/bin/classifier2info? [y]es, [n]o, [A]

# LOAD PACKAGES

In [9]:
#reference: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
#In terminal, type: conda activate nlp2

import nltk; nltk.download('stopwords')

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
from gensim import matutils, corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Add stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
### CHOOSE TO ADD NEW STOPWORDS ####
newStopWords = ['section','division','department','ha','le','subdivision','bill','chapter','000','pursuant','ii','iii','article','existing law','whereas','purpose','act','title', 'sb', 'ab', 'acr', 'ajr', 'hr', 'purpose', 'digest', 'key', 'fiscal', 'committee', 'no', 'yes', 'vote', 'majority', 'appropriation', 'local', 'program', 'scr', 'sjr', 'sca', 'aca']
stop_words.extend(newStopWords)

#Sklearn for bag of words
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ModuleNotFoundError: ignored

# CREATE TEXT CONTENT

In [None]:
### NAME CHANGE HERE 
df = pd.read_csv('/content/drive/Shared drives/Legislative Text Analysis/relevant.csv')
df.head(10)

Unnamed: 0,year,txt_path
0,2011,./drive/Shared drives/Legislative Text Analysi...
1,2001,./drive/Shared drives/Legislative Text Analysi...
2,2003,./drive/Shared drives/Legislative Text Analysi...
3,2009,./drive/Shared drives/Legislative Text Analysi...
4,2019,./drive/Shared drives/Legislative Text Analysi...
5,2015,./drive/Shared drives/Legislative Text Analysi...
6,2013,./drive/Shared drives/Legislative Text Analysi...
7,2005,./drive/Shared drives/Legislative Text Analysi...
8,2007,./drive/Shared drives/Legislative Text Analysi...
9,2017,./drive/Shared drives/Legislative Text Analysi...


In [None]:
content = []
for row in df.txt_path:
  with open(row, 'r') as f:
    s = f.read()
    s = s.lower()
    s = re.sub(r'\b\d+\b', '', s) #clean standalone numbers
    s = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', s, flags=re.MULTILINE) #clean https
    s = re.sub(r'[^\x00-\x7F]+','', s) #clean non ascii
    s = re.sub('\W', ' ', s) #clean special characters 
    s = [substr for substr in s.split() if len(substr)>1]
    s = ' '.join(s)
    content.append(s)
print(content[0])



# LDA

In [None]:
#source: https://nkharche.github.io/nsf_awards_gensim_v3/#topic=21&lambda=0.51&term=
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
tf_vectorizer = CountVectorizer(#max_features=3000, 
                                ngram_range=(1,3),
                                max_df=0.95,
                                min_df=0.01,
                                stop_words=stop_words,
                                tokenizer=LemmaTokenizer())
tf = tf_vectorizer.fit_transform(content)
tf_feature_names = tf_vectorizer.get_feature_names()

  'stop_words.' % sorted(inconsistent))


In [None]:
vocab = tf_vectorizer.get_feature_names()
bag_id2word = dict([(i, s) for i, s in enumerate(vocab)])
dictionary = corpora.Dictionary([list(bag_id2word.values())])
post_proc_bows = tf_vectorizer.inverse_transform(tf)
bag_corpus = [dictionary.doc2bow(list(text)) for text in post_proc_bows]

In [None]:
os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
mallet_path = '/content/mallet-2.0.8/bin/mallet' 

In [None]:
### CHOOSE NUMBER OF TOPICS HERE 
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=bag_corpus, num_topics=15, id2word=bag_id2word)

In [None]:
pprint(ldamallet.show_topics(num_topics=10, formatted=False))

[(4,
  [('firefighter', 0.02164617851416355),
   ('vehicle', 0.012560128273650455),
   ('disaster', 0.011491181186531267),
   ('federal', 0.00935328701229289),
   ('result', 0.00935328701229289),
   ('fee', 0.00774986638161411),
   ('san', 0.0074826296098343134),
   ('tax', 0.007215392838054517),
   ('senate', 0.006680919294494923),
   ('october', 0.006413682522715126)]),
 (14,
  [('water', 0.019432370237790847),
   ('greenhouse', 0.018409613909486065),
   ('energy', 0.015341344924571721),
   ('loan', 0.01482996676041933),
   ('public', 0.012273075939657376),
   ('approved', 0.01150600869342879),
   ('improvement', 0.01048325236512401),
   ('bond', 0.010227563283047815),
   ('gas', 0.009971874200971618),
   ('council', 0.009971874200971618)]),
 (0,
  [('flood', 0.019915254237288134),
   ('result', 0.012146892655367232),
   ('federal', 0.012146892655367232),
   ('government', 0.010169491525423728),
   ('control', 0.010169491525423728),
   ('san', 0.0096045197740113),
   ('levee', 0.0096

In [None]:
pyLDAvis.enable_notebook()
lda_conv_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

vis = pyLDAvis.gensim.prepare(lda_conv_model, bag_corpus, dictionary)
vis

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)


In [None]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_conv_model, texts=bag_corpus, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)