# Generate Mean Embeddings for Confirmation Hearing Text
This python notebook generates the mean text embeddings over all the words in the confirmation hearings of each jusice. It outputs these mean text embeddings (with dimension 300) as an array, where each row corresponds to a justice.

In [90]:
# http://yaronvazana.com/2018/09/20/average-word-vectors-generate-document-paragraph-sentence-embeddings/
import os
import string
import pickle
import numpy as np
import pandas as pd

import gensim
import nltk
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
from gensim.models import word2vec
import multiprocessing

In [5]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
get_ipython().magic('matplotlib inline')

[nltk_data] Downloading package punkt to /Users/jhuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jhuang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Get file names
in_dir = "nlp_approach/data/transcripts/utterances"
files = os.listdir(in_dir)
files.sort()
new_files = []
justicenames = []
for f in files:
    if f.endswith(".txt"):
        new_files.append(os.path.join(in_dir,f))
        justicenames.append(f[:-4])

print(justicenames)

['AJGoldberg', 'AMKennedy', 'AScalia', 'BRWhite', 'DHSouter', 'EKagan', 'JGRoberts', 'JPStevens', 'LFPowell', 'NMGorsuch', 'RBGinsburg', 'SAAlito', 'SDOConnor', 'SGBreyer', 'SSotomayor', 'TMarshall', 'WHRehnquist']


## Load pre-trained word embedding model.

In [69]:
# Load Google's pre-trained Word2Vec model.
w = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

2019-05-10 22:06:14,164 : INFO : loading projection weights from GoogleNews-vectors-negative300.bin.gz
2019-05-10 22:09:21,423 : INFO : loaded (3000000, 300) matrix from GoogleNews-vectors-negative300.bin.gz


In [73]:
w.save("pretrained_word2vec_model")

2019-05-10 22:10:53,272 : INFO : saving Word2VecKeyedVectors object under pretrained_word2vec_model, separately None
2019-05-10 22:10:53,273 : INFO : storing np array 'vectors' to pretrained_word2vec_model.vectors.npy
2019-05-10 22:11:33,382 : INFO : not storing attribute vectors_norm
2019-05-10 22:12:00,895 : INFO : saved pretrained_word2vec_model


## Process documents.

In [98]:
# source: https://github.com/sdimi/average-word2vec/blob/master/avg_word2vec_from_documents.py
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()] #restricts string to alphabetic characters only
    return doc

def get_corpus(files):
    corpus = []
    for f in files:
        with open(f, 'r') as file:
            data = file.read().replace('\n', '')
        data = preprocess(data)
        corpus.append(data)
    return corpus

In [100]:
corpus = get_corpus(new_files)
print(corpus[0])

['chairman', 'would', 'like', 'make', 'one', 'two', 'corrections', 'biography', 'chamman', 'right', 'received', 'college', 'education', 'junior', 'city', 'chicago', 'public', 'institution', 'calledcrane', 'junior', 'college', 'city', 'college', 'city', 'attended', 'college', 'also', 'moonlighted', 'addition', 'attendance', 'juniorcollege', 'took', 'additional', 'course', 'could', 'get', 'law', 'school', 'withthe', 'proper', 'credits', 'depaul', 'university', 'university', 'located', 'degrees', 'northwestern', 'university', 'law', 'school', 'bachelor', 'science', 'law', 'instead', 'received', 'jurisdoctor', 'inducted', 'army', 'inducted', 'major', 'inducted', 'captain', 'promoted', 'rank', 'army', 'career', 'listed', 'captain', 'army', 'promoted', 'major', 'never', 'promoted', 'colonel', 'iwas', 'put', 'reserves', 'much', 'later', 'recently', 'corrected', 'biographical', 'sketch', 'goldberg', 'follows', 'arthur', 'goldbergborn', 'august', 'chicago', 'crane', 'junior', 'college', 'depaul

## Get mean embeddings.

In [101]:
# source: http://yaronvazana.com/2018/09/20/average-word-vectors-generate-document-paragraph-sentence-embeddings/
def get_mean_vector(word2vec_model, words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in word2vec_model.vocab]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return []

In [84]:
vectors = []
for doc in corpus:
    vec = get_mean_vector(w, doc)
    vectors.append(vec)
print(len(vectors))
np.save("mean_embeddings.npy", vectors)

17
