See this page [https://nbviewer.jupyter.org/github/rare-technologies/gensim/blob/develop/docs/notebooks/atmodel_tutorial.ipynb] for reference. 
Extracting data from XML files

In [18]:
import os, re, glob
import xml.etree.ElementTree as ET

def extract_text(file):
    tree = ET.parse(file)
    root = tree.getroot()
    namespaces = {'dc': 'http://purl.org/dc/elements/1.1/'}
    title = root.find('.//form/official-title').text
    return title + ' ' + ''.join(root.find('.//legis-body').itertext())

def identifier(filename):
    m = re.search('.*-(\d+s\d+).*', filename)
    return m.group(1)
    
docs = []
ids = []
files = glob.glob('data/bills/s/BILLS-115*.xml')
for file in files:    
    id = identifier(file)
    if 'eah.xml' in file or id in ids:
        continue
    ids.append(identifier(file))
    txt = extract_text(file)
    txt = re.sub('\s+', ' ', txt).strip()
    docs.append(txt)  
print(len(docs))    

2212


Creating sponsor-bill mapping

In [19]:
def extract_sponsors(file):
    tree = ET.parse(file)
    sponsors = []
    root = tree.getroot()
    sponsors = [n.text for n in root.findall('.//sponsors/item/bioguideId')]
    sponsors.extend([n.text for n in root.findall(".//cosponsors/item/bioguideId")])
    return sponsors

sponsor2docs = dict() 
status_files = os.listdir('data/status/s')
for file in status_files:
    sponsors = extract_sponsors('data/status/s/' + file)
    id = identifier(file)
    if not id in ids:
        continue
    for sponsor in sponsors:
        if not sponsor2docs.get(sponsor):
            # This is a new sponsor.
            sponsor2docs[sponsor] = []
        sponsor2docs[sponsor].append(id)

# Use an integer ID in author2doc, instead of the IDs provided in the NIPS dataset.
# Mapping from ID of document in NIPS datast, to an integer ID.
id_dict = dict(zip(ids, range(len(ids))))
# Replace NIPS IDs by integer IDs.
for a, a_doc_ids in sponsor2docs.items():
    for i, doc_id in enumerate(a_doc_ids):
        sponsor2docs[a][i] = id_dict[doc_id]        

Use spacy to preprocess files

In [21]:
%%time
import spacy
nlp = spacy.load('en')
processed_docs = []    
for doc in nlp.pipe(docs, n_threads=6, batch_size=100):
    # Process document using Spacy NLP pipeline.
    
    ents = doc.ents  # Named entities.

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list.
    #doc = [token for token in doc if token not in STOPWORDS]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    
    processed_docs.append(doc)

CPU times: user 31min 7s, sys: 7min 33s, total: 38min 41s
Wall time: 19min 41s


In [22]:
docs = processed_docs
del processed_docs

In [23]:
# Compute bigrams.
from gensim.models import Phrases
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)



In [24]:
from gensim.corpora import Dictionary
dictionary = Dictionary(docs)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 20
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.

In [25]:
# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [28]:
print('Number of sponsors: %d' % len(sponsor2docs))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))


Number of sponsors: 103
Number of unique tokens: 4877
Number of documents: 2212


In [49]:
from gensim.models import AuthorTopicModel
%time model = AuthorTopicModel(corpus=corpus, num_topics=20, id2word=dictionary.id2token, \
                author2doc=sponsor2docs, chunksize=2000, passes=1, eval_every=0, \
                iterations=1, random_state=1)

CPU times: user 9.18 s, sys: 655 ms, total: 9.83 s
Wall time: 3.44 s


In [50]:
%%time
model_list = []
for i in range(5):
    model = AuthorTopicModel(corpus=corpus, num_topics=20, id2word=dictionary.id2token, \
                    author2doc=sponsor2docs, chunksize=2000, passes=100, gamma_threshold=1e-10, \
                    eval_every=0, iterations=1, random_state=i)
    top_topics = model.top_topics(corpus)
    tc = sum([t[1] for t in top_topics])
    model_list.append((model, tc))

CPU times: user 57min 14s, sys: 4min 58s, total: 1h 2min 13s
Wall time: 15min 54s


In [51]:
model, tc = max(model_list, key=lambda x: x[1])
print('Topic coherence: %.3e' %tc)

Topic coherence: -2.880e+01


In [52]:
model.save('s115-topic20.atmodel')

In [54]:
print(model)

AuthorTopicModel(num_terms=4877, num_topics=20, num_authors=103, decay=0.5, chunksize=2000)


In [56]:
model.print_topics()

[(0,
  '0.035*"plan" + 0.027*"tax" + 0.021*"taxable" + 0.019*"activity" + 0.018*"service" + 0.017*"income" + 0.016*"transfer" + 0.014*"business" + 0.013*"taxable_year" + 0.012*"excess"'),
 (1,
  '0.027*"Fiscal Year 2017" + 0.000*"improvements" + 0.000*"in_addition" + 0.000*"reportsnot_later" + 0.000*"reportsnot" + 0.000*"the Committee on Appropriations of the House of Representatives" + 0.000*"the Committee on Appropriations of the Senate" + 0.000*"l" + 0.000*"gift" + 0.000*"in_general"'),
 (2,
  '0.008*"program" + 0.007*"individual" + 0.007*"service" + 0.006*"national" + 0.006*"agency" + 0.006*"grant" + 0.005*"strike" + 0.005*"information" + 0.005*"public" + 0.005*"entity"'),
 (3,
  '0.072*"fee" + 0.068*"drug" + 0.049*"fiscal" + 0.046*"fiscal_year" + 0.029*"food" + 0.027*"cosmetic" + 0.026*"drug_cosmetic" + 0.026*"federal_food" + 0.025*"the Federal Food, Drug" + 0.024*"strike"'),
 (4,
  '0.000*"Fiscal Year 2017" + 0.000*"record" + 0.000*"strike" + 0.000*"department" + 0.000*"public" +

In [66]:
from pprint import pprint

def show_author(name):
    print('\n%s' % name)
    print('Docs:', model.author2doc[name])
    print('Topics:')
    pprint([topic for topic in model[name]])

In [67]:
show_author('E000285')


E000285
Docs: [1109, 2, 26, 43, 109, 99, 101, 102, 103, 200, 254, 294, 315, 316, 343, 355, 387, 569, 617, 640, 645, 655, 682, 719, 712, 886, 809, 885, 876, 906, 941, 972, 1022, 1066, 1107, 1164, 1175, 1174, 1331, 1299, 1342, 1347, 1377, 1381, 1399, 1409, 1412, 1444, 1448, 1459, 1460, 1496, 1486, 1507, 1524, 1525, 1527, 1537, 1545, 1602, 1671, 1681, 1711, 1712, 1749, 1800, 1856, 1892, 1920, 1921, 1935, 1946, 1993, 2004, 2078, 2146, 2185]
Topics:
[(2, 0.3728403769175701), (5, 0.12001910223605643), (6, 0.50704268494309901)]


In [68]:
print(len(docs))

2212


In [None]:
print(len(sponsors
         ))