# Common code to read and process the corpus

This shared code ensures that all models preprocess the data in the same way.

In [119]:
# %load budget_corpus.py


# # Common code to read and process the corpus
# 
# This shared code ensures that all models preprocess the data in the same way.


import pandas as pd
import numpy as np 
import pdb
import json
import re
import pickle

from pathlib import Path

import gensim
from gensim.utils import simple_preprocess
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS as gs_stopwords

docdir = Path('../data/docs')
datadir = Path('../data')


def read_raw_corpus(doc_dir=None):
    """Read every ``*.body`` file in the document directory into memory.

    Args:
        doc_dir: Directory to scan for ``*.body`` files (a ``Path`` or a
            string). When omitted, the module-level ``docdir`` is used.

    Returns:
        A list holding the full text of each file, ordered by file path so
        that document indices are identical on every run.
    """
    root = docdir if doc_dir is None else Path(doc_dir)

    raw_corpus = []
    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps document positions stable (e.g. cached corpus vs. a rebuilt one).
    for body in sorted(root.glob('*.body')):
        # Explicit encoding so the result does not depend on the platform
        # locale -- assumes the .body files are UTF-8; TODO confirm.
        raw_corpus.append(body.read_text(encoding='utf-8'))

    return raw_corpus


def tokenize_raw_budget(raw_corpus):
    """Turn raw document strings into lists of lemmatized, filtered tokens.

    For each document: strip all-caps runs of three or more letters, lemmatize
    what is left (lemmas shorter than 3 characters are excluded via
    ``min_length=3``), and drop every lemma that is a stopword.

    Args:
        raw_corpus: Iterable of document strings.

    Returns:
        A list with one list of token strings per input document.

    Side effects:
        Prints the complete stopword set to stdout on every call.
        NOTE(review): this looks like leftover debugging output -- confirm
        before removing.
    """
    # Domain stopwords. Some of them (grant/grants) would be less repetitive
    # with stemming, but stemming produces 'nonwords' that cannot be found in
    # word2vec searches, so lemmatization is used instead.
    stopwords = set( """subsection amount state grant 
        fund  cost expense expend expenditure purchase account 
        administration administrative
        budget purposes united states national general government
        office regulations act title code 
        specified provided available further including herein
        enactment program service operation operation activity
        appropriation appropriated provision agency""".split() )
    stopwords.update(gs_stopwords)

    print(stopwords)

    # Three-(or more)-letter all-capitalized acronyms, plus any non-space
    # characters trailing them up to the next word boundary.
    acronym_pattern = re.compile(r'[A-Z]{3,}\S*?\b')

    def _lemmas(text):
        # Lemmatization approach taken from this tutorial:
        # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
        # NOTE(review): gensim.utils.lemmatize appears to be unavailable in
        # gensim 4.x -- confirm the pinned gensim version.
        without_acronyms = acronym_pattern.sub('', text)
        tagged = lemmatize(without_acronyms, min_length=3)
        # Each item is bytes; keep only the part before the first '/'
        # (presumably the lemma, with a part-of-speech tag after the slash).
        return [item.decode('utf-8').split('/')[0] for item in tagged]

    return [
        [lemma for lemma in _lemmas(doc) if lemma not in stopwords]
        for doc in raw_corpus
    ]


def read_documents():
    """Return the tokenized corpus, using a pickle cache when one exists.

    If ``corpus.pkl`` is present in ``datadir`` it is unpickled and returned.
    Otherwise the raw corpus is read and tokenized, the result is written to
    ``corpus.pkl`` for later calls, and then returned.

    Returns:
        The tokenized corpus as produced by ``tokenize_raw_budget``.
    """
    cache_path = datadir / 'corpus.pkl'

    if not cache_path.exists():
        # Cache miss: build from the raw files and persist for next time.
        tokenized = tokenize_raw_budget(read_raw_corpus())
        with open(cache_path, 'wb') as sink:
            pickle.dump(tokenized, sink)
        return tokenized

    # Cache hit. The pickle is a locally generated file; never point this at
    # untrusted data, since unpickling can execute arbitrary code.
    with open(cache_path, 'rb') as source:
        return pickle.load(source)



---

The code above this line is in budget_corpus.py.
The rest of the file is just testing.

---

In [121]:
# Load the tokenized corpus (read from the pickle cache when it exists).
my_corpus = read_documents()

In [122]:
# Rebuild the tokenized corpus straight from the raw files, bypassing the cache.
raw_corpus = read_raw_corpus()
token_corpus = tokenize_raw_budget(raw_corpus)

In [123]:
# Sanity check: the cached corpus should match a fresh tokenization.
# zip() stops at the shorter input, so compare the lengths first; otherwise
# a cache with missing or extra documents would pass unnoticed.
if len(my_corpus) != len(token_corpus):
    pdb.set_trace()
for version1, version2 in zip(my_corpus, token_corpus):
    if version1 != version2:
        # Drop into the debugger on the first mismatching document.
        pdb.set_trace()

In [115]:
# Collect every distinct heading string found in the *.heading JSON files.
unique_headings = set()

files = list(docdir.glob('*.heading'))
for file in files:
    # Use a context manager so each handle is closed right away instead of
    # being leaked by a bare open().
    with open(file) as fp:
        d = json.load(fp)
    # Each file is expected to carry all five heading levels as keys.
    for key in ['division', 'title', 'major', 'inter', 'small']:
        unique_headings.add(d[key])
        
    

In [116]:
# Display the set of distinct heading strings.
unique_headings

{'',
 '(INCLUDING RESCISSIONS AND TRANSFERS OF FUNDS)',
 '(Including transfer of funds)',
 '(Rescissions)',
 '(including rescission of funds)',
 '(including rescissions) (including transfer of funds)',
 '(including transfer of funds)',
 '(including transfer of funds) (including rescissions)',
 '(including transfers and rescission of funds)',
 '(including transfers and rescissions of funds)',
 '(including transfers of funds)',
 '(including transfers of funds) restriction on use of funds',
 '(liquidation of contract authorization) (highway trust fund)',
 '(rescission)',
 '(rescissions)',
 'AFGHAN SPECIAL IMMIGRANT VISAS',
 'AGRICULTURAL PROGRAMS',
 'ALLOWANCES AND DIFFERENTIALS',
 'ASSET PROCEEDS AND SPACE MANAGEMENT FUND',
 'Abandoned mine reclamation fund',
 'Access board',
 'Acquisition of lands for national forests special acts',
 'Acquisition of lands to complete land exchanges',
 'Additional appropriations',
 'Administration of foreign affairs',
 'Administrative Conference of the U

In [117]:
# Number of distinct heading strings.
len(unique_headings)

837