In [5]:
import nltk

In [6]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Load Data

In [7]:
import json
with open('nv_leg_nram.json', 'r') as file:
    data = json.load(file)

In [8]:
type(data)

dict

In [9]:
raw = json.dumps(data) # convert json to string

In [10]:
type(raw)

str

### Counting

In [20]:
length = len(raw) # the length of the text including punctuation

In [19]:
len(set(raw)) # number of distinct words

82

In [16]:
def lexical_diversity(text):
    '''the number of distinct words / the total number of words'''
    return len(set(text)) / len(text)

In [17]:
lexical_diversity(raw)

0.0004281581887864326

In [23]:
def percentage(count, total):
    '''count how often a word occurs in a text'''
    return 100 * count / total

In [24]:
percentage(raw.count('the'), length) # count the frequency of 'the'

1.0166146263014442

## Processing Raw Text

### Tokenization

Break up the string into words and punctuation, and create a list of words and punctuation.

In [44]:
from nltk import word_tokenize

In [41]:
text = [word.lower() for word in word_tokenize(raw)]

In [42]:
type(text)

list

### Stopwords

Stopwords are non-content words that primarily has only grammatical function

In [43]:
from nltk.corpus import stopwords

In [46]:
stopwords_en = set(stopwords.words('english')) # set checking is faster than list
print(stopwords_en)

{'at', 'ourselves', "doesn't", 'under', "wasn't", 'most', "didn't", "won't", 'has', 'couldn', 'if', 'mustn', 'shouldn', 'does', 'and', "needn't", 'yourselves', 'aren', 'both', 'over', "you're", 'had', 'a', 'nor', 'having', 'again', 'haven', 'am', 'as', 'they', 'are', 'been', 'that', 'yours', 'these', 'your', 'other', 'now', "mustn't", "couldn't", 'm', "isn't", 'my', 'his', "it's", 'd', 'be', 'there', 'against', 'own', 'all', 'we', 'doing', 'where', 'mightn', 'hadn', "don't", 'between', 'by', "shan't", 'before', 'such', 'an', 'who', 'them', 'this', 'because', 'off', 'myself', 'not', 'too', "hadn't", 'in', 'won', 'our', 'o', "you've", 'wasn', 'few', 'some', 'ain', 'you', 'she', 's', 'herself', 'into', 'no', 'weren', 'while', 'll', 'do', "shouldn't", 'itself', 'those', 'same', 't', "hasn't", 'themselves', 'hers', 'the', "should've", 'himself', 'until', 'for', 'i', 'why', "she's", 'below', 'did', 'more', 'of', 'once', 'being', 'when', 'it', 'further', 'so', 'isn', 'have', "wouldn't", 'woul

Ofen, we want to remove stopwords when we want to capture only the gist of the documemt

In [48]:
text_no_stopwords = [word for word in text if word not in stopwords_en]

Often, we want to remove the punctuations from the documents too.

In [49]:
from string import punctuation
print('From string.punctuation:', type(punctuation), punctuation)

From string.punctuation: <class 'str'> !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [50]:
text_no_stopwords_punc = [word for word in text_no_stopwords if word not in punctuation]

### Lemmatization
Trying to find the root word with linguistics rules (with the use of regexes)

In [51]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

In [52]:
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n' 
    
def lemmatize_sent(text): 
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(text)]

In [54]:
lemmatize_sent(text_no_stopwords_punc)

['``',
 '0',
 "''",
 '``',
 '\\nminutes',
 'meet',
 '\\nof',
 '\\nassembly',
 'committee',
 'education',
 '\\n',
 '\\nseventy-eighth',
 'session',
 '\\nfebruary',
 '23',
 '2015',
 '\\n',
 '\\nthe',
 'committee',
 'education',
 'call',
 'order',
 'vice',
 'chair',
 'lynn',
 'd.',
 'stewart',
 '\\nat',
 '3:15',
 'p.m.',
 'monday',
 'february',
 '23',
 '2015',
 'room',
 '3142',
 '\\nlegislative',
 'build',
 '401',
 'south',
 'carson',
 'street',
 'carson',
 'city',
 'nevada',
 '\\nmeeting',
 'videoconferenced',
 'room',
 '4406',
 'grant',
 'sawyer',
 'state',
 'office',
 '\\nbuilding',
 '555',
 'east',
 'washington',
 'avenue',
 'la',
 'vega',
 'nevada',
 'copy',
 '\\nminutes',
 'include',
 'agenda',
 'exhibit',
 'attendance',
 'roster',
 'exhibit',
 'b',
 '\\nand',
 'substantive',
 'exhibit',
 'available',
 'file',
 'research',
 'library',
 '\\nof',
 'legislative',
 'counsel',
 'bureau',
 'nevada',
 'legislature',
 "'s",
 'website',
 '\\nwww.leg.state.nv.us/app/nelis/rel/78th2015',
 'add

In [37]:
nltk.pos_tag(text)

[('{', '('),
 ('``', '``'),
 ('0', 'CD'),
 ("''", "''"),
 (':', ':'),
 ('``', '``'),
 ('\\nminutes', 'NNS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('meeting', 'NN'),
 ('\\nof', 'VBD'),
 ('the', 'DT'),
 ('\\nassembly', 'NNP'),
 ('committee', 'NN'),
 ('on', 'IN'),
 ('education', 'NN'),
 ('\\n', 'NNP'),
 ('\\nseventy-eighth', 'JJ'),
 ('session', 'NN'),
 ('\\nfebruary', 'JJ'),
 ('23', 'CD'),
 (',', ','),
 ('2015', 'CD'),
 ('\\n', 'SYM'),
 ('\\nthe', 'NNP'),
 ('committee', 'NN'),
 ('on', 'IN'),
 ('education', 'NN'),
 ('was', 'VBD'),
 ('called', 'VBN'),
 ('to', 'TO'),
 ('order', 'NN'),
 ('by', 'IN'),
 ('vice', 'NN'),
 ('chair', 'NN'),
 ('lynn', 'JJ'),
 ('d.', 'JJ'),
 ('stewart', 'NN'),
 ('\\nat', 'VBZ'),
 ('3:15', 'CD'),
 ('p.m.', 'NN'),
 ('on', 'IN'),
 ('monday', 'NN'),
 (',', ','),
 ('february', 'JJ'),
 ('23', 'CD'),
 (',', ','),
 ('2015', 'CD'),
 (',', ','),
 ('in', 'IN'),
 ('room', 'NN'),
 ('3142', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('\\nlegislative', 'JJ'),
 ('building', 'NN'),
 (',', ','