In [2]:
import pandas as pd
import numpy as np
import pickle
import re
from data.helpers import constants, utils, vocabulary

# Bokeh for plotting.
import bokeh.plotting as bp
from bokeh.models import HoverTool
# Route Bokeh output to inline notebook cells so bp.show() renders below the calling cell.
bp.output_notebook()

# Basic EDA

In [3]:
# Load the pickled opinions table.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager guarantees the file handle is closed once loading finishes.
with open("./data/data.p", "rb") as data_file:
    data = pickle.load(data_file)

In [3]:
# Preview the first five rows to sanity-check the columns.
data.head(n=5)

Unnamed: 0,date_filed,year_filed,name_first,name_last,political_party,tokens
104247,1946-02-25,1946,Stanley,Reed,d,"[<s>, DGDGDG, u.s., DGDGDG, (1946)social, secu..."
92550,1888-10-22,1888,Joseph,Bradley,r,"[<s>, DGDGDG, u.s., DGDGDG, (1888)williamsv.co..."
103918,1944-01-10,1944,William,Douglas,d,"[<s>, DGDGDG, u.s., DGDGDG, (1944)city, of, yo..."
94471,1896-05-18,1896,Melville,Fuller,d,"[<s>, DGDGDG, u.s., DGDGDG, (1896)united, stat..."
100084,1923-01-02,1923,William,Taft,r,"[<s>, DGDGDG, u.s., DGDGDG, (1923)blamberg, br..."


In [4]:
# Count opinions per party by summing boolean masks instead of materialising filtered frames.
print("Total number of Republican opinions:", int((data.political_party == 'r').sum()))
print("Total number of Democrat opinions:", int((data.political_party == 'd').sum()))
# "Other" covers every row whose party is neither 'd' nor 'r'.
print("Total number of other party opinions:", int((~data.political_party.isin(['d', 'r'])).sum()))
print("Total number of opinions:", data.shape[0])

Total number of Republican opinions: 14064
Total number of Democrat opinions: 7227
Total number of other party opinions: 941
Total number of opinions: 22232


In [5]:
# Words per opinion: every token except the '<s>' marker counts as a word.
print(
    "Average number of words per opinion:",
    data.tokens.apply(lambda toks: sum(1 for tok in toks if tok != '<s>')).mean(),
)
# Sentences per opinion: number of '<s>' markers minus one.
print(
    "Average number of sentences per opinion:",
    data.tokens.apply(lambda toks: sum(1 for tok in toks if tok == '<s>') - 1).mean(),
)

Average number of words per opinion: 3981.8442785174525
Average number of sentences per opinion: 148.50427311982727


# Top Words (10,000-word vocabulary)

In [4]:
# Flatten the per-opinion token lists into one corpus-wide token stream.
token_feed = [tok for opinion_tokens in data.tokens for tok in opinion_tokens]

In [6]:
# Drop the '<s>' markers so that only word tokens remain in the stream.
token_feed = list(filter(lambda tok: tok != '<s>', token_feed))

In [7]:
# Build the vocabulary from the token stream (size=10000 is passed to Vocabulary).
vocab = vocabulary.Vocabulary(token_feed, size=10000)
print("Vocabulary size: {:,}".format(vocab.size))

# Sanity check: list the ten most frequent unigrams with their counts.
print("Most common unigrams:")
for entry in vocab.unigram_counts.most_common(10):
    # Each entry is a (word, count) pair; unpack it straight into the format call.
    print("\"{:s}\": {:,}".format(*entry))

# Other handy entry points on the vocabulary object:
#   vocab.word_to_id                  -> vocab dictionary
#   vocab.words_to_ids(token_feed)    -> convert tokens to ids
#   vocab.ids_to_words(array_of_ids)  -> convert ids to tokens

Vocabulary size: 10,000
Most common unigrams:
"the": 7,554,705
"of": 4,093,832
"to": 2,558,723
"and": 1,977,562
"in": 1,900,256
"that": 1,467,956
"a": 1,460,173
"DGDGDG": 1,005,411
"is": 902,176
"for": 824,400


In [8]:
# Plot the 20 most frequent tokens as an interactive bar chart.
words, counts = zip(*vocab.unigram_counts.most_common(20))

# The hover tooltip reads the bar's category ("x") and height ("top") data columns.
hover = HoverTool(tooltips=[("word", "@x"), ("count", "@top")], mode="vline")
# NOTE(review): plot_width/plot_height are deprecated in newer Bokeh releases in favour of
# width/height — confirm against the installed Bokeh version before upgrading.
fig = bp.figure(
    x_range=words,
    plot_width=800,
    plot_height=400,
    tools=[hover],
    title="20 most frequent tokens",  # a title lets the figure stand alone when skimmed
)
fig.vbar(x=words, width=0.8, top=counts, hover_fill_color="firebrick")
fig.y_range.start = 0  # anchor the bars at zero so their heights are comparable
fig.xaxis.axis_label = "Token"
fig.yaxis.axis_label = "Count(w)"
bp.show(fig)

In [9]:
# Persist the vocabulary for later use.
# The context manager flushes and closes the file deterministically; a bare open() would
# leave that to garbage collection.
with open("vocab.p", "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)