# Basic EDA

In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from helpers import vocabulary

# Bokeh for plotting.
import bokeh.plotting as bp
from bokeh.models import HoverTool
bp.output_notebook()

In [2]:
# Load the pickled opinions table from disk.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until garbage collection.
# NOTE: unpickling can execute arbitrary code - only load trusted data files.
with open("./data/data.p", "rb") as data_file:
    data = pickle.load(data_file)

In [3]:
# Preview the first five rows to check the columns and the tokenised text.
data.head(n=5)

Unnamed: 0,date_filed,year_filed,name_first,name_last,political_party,text
0,1946-02-25,1946,Stanley,Reed,d,"[<unk>, paul, <unk>, sweeney, argued, the, cau..."
1,1888-10-22,1888,Joseph,Bradley,r,"[justice, bradley, delivered, the, opinion, <u..."
2,1944-01-10,1944,William,Douglas,d,"[john, j, <unk>, with, whom, mr, leonard, g, <..."
3,1896-05-18,1896,Melville,Fuller,d,"[winn, with, whom, was, mr, fh, </s>, chief, j..."
4,1923-01-02,1923,William,Taft,r,"[<unk>, mr, d, roger, <unk>, with, whom, mr, j..."


In [4]:
# Tally opinions by the label stored in the `political_party` column.
party = data.political_party
is_republican = party == 'r'
is_democrat = party == 'd'
is_other = ~(is_republican | is_democrat)  # every label that is neither 'r' nor 'd'

print("Total number of Republican opinions:", len(data[is_republican]))
print("Total number of Democrat opinions:", len(data[is_democrat]))
print("Total number of other party opinions:", len(data[is_other]))
print("Total number of opinions:", len(data))

Total number of Republican opinions: 14080
Total number of Democrat opinions: 7236
Total number of other party opinions: 916
Total number of opinions: 22232


In [5]:
def _count_words(tokens):
    """Return the number of tokens in one opinion, ignoring '</s>' markers."""
    return len([tok for tok in tokens if tok != '</s>'])


def _count_sentences(tokens):
    """Return the number of '</s>' markers in one opinion, minus one.

    NOTE(review): the -1 presumably discounts one extra marker per opinion;
    confirm against the preprocessing that produced the token lists.
    """
    return sum(1 for tok in tokens if tok == '</s>') - 1


words_per_opinion = data.text.apply(_count_words)
sentences_per_opinion = data.text.apply(_count_sentences)

print("Average number of words per opinion:", np.mean(words_per_opinion))
print("Average number of sentences per opinion:", np.mean(sentences_per_opinion))

Average number of words per opinion: 3766.1872076286436
Average number of sentences per opinion: 105.16804605973371


# Most Common Words

In [6]:
# Flatten the per-opinion token sequences into one long token stream,
# preserving the order of opinions and of tokens within each opinion.
token_feed = [token for opinion_tokens in data.text for token in opinion_tokens]

In [7]:
# Build the vocabulary from the flattened token stream, capped at 10,000
# entries (per the helper: collects token counts and assigns word ids).
vocab = vocabulary.Vocabulary(token_feed, size=10000)
print("Vocabulary size: {:,}".format(vocab.size))

# Sanity check: list the ten most frequent unigrams with their raw counts.
print("Most common unigrams:")
top_unigrams = vocab.unigram_counts.most_common(10)
for token, freq in top_unigrams:
    print("\"{:s}\": {:,}".format(token, freq))

Vocabulary size: 10,000
Most common unigrams:
"<unk>": 13,901,896
"the": 7,539,832
"</s>": 2,360,328
"in": 1,890,147
"that": 1,520,469
"is": 935,304
"by": 821,136
"for": 794,471
"it": 731,073
"be": 722,384


In [8]:
# Plot the top frequencies
words, counts = zip(*vocab.unigram_counts.most_common(20))

hover = HoverTool(tooltips=[("word", "@x"), ("count", "@top")], mode="vline")
fig = bp.figure(x_range=words, plot_width=800, plot_height=400, tools=[hover])
fig.vbar(x=words, width=0.8, top=counts, hover_fill_color="firebrick")
fig.y_range.start = 0
fig.yaxis.axis_label = "Count(w)"
bp.show(fig)

In [9]:
# Persist the vocabulary so it can be reused without rebuilding it.
# The context manager guarantees the file is flushed and closed once the
# dump completes, rather than whenever the handle is garbage-collected.
with open("vocab.p", "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)