# word2vec Text Analysis of Obergefell v. Hodges

In [18]:
import pandas as pd
import re

import nltk
# nltk.download()
from nltk.corpus import stopwords

In [19]:
# Read the full text of the majority opinion. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open('raw_text/majority_opinion.txt', 'r') as file:
    majority_opinion = file.read()

In [32]:
def to_wordlist(raw, remove_stopwords=False):
    """Turn a document into a list of lowercase, letters-only words.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lowercased, and the result is split on whitespace.

    Args:
        raw: The text to tokenize.
        remove_stopwords: When true, words present in NLTK's English stop
            word list are dropped. Defaults to False.

    Returns:
        A list of words in their original order.
    """
    tokens = re.sub("[^a-zA-Z]", " ", raw).lower().split()

    # Nothing to filter: hand back every token.
    if not remove_stopwords:
        return tokens

    # A set gives constant-time membership checks while filtering.
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

In [66]:
def to_sentences(raw, tokenizer, remove_stopwords=False):
    """Split a document into sentences, each parsed into a list of words.

    Args:
        raw: The document text. Byte strings are decoded as UTF-8; text that
            is already unicode is used as-is.
        tokenizer: An object whose ``tokenize(text)`` method returns the
            sentences of ``text`` (for example NLTK's punkt tokenizer).
        remove_stopwords: Forwarded to ``to_wordlist`` for every sentence.
            Defaults to False.

    Returns:
        A list with one entry per non-empty sentence; each entry is the list
        of words produced by ``to_wordlist`` for that sentence.
    """
    # Only decode byte strings: calling .decode() on text that is already
    # unicode fails on Python 3 and on non-ASCII unicode under Python 2.
    text = raw.decode('utf-8') if isinstance(raw, bytes) else raw

    # Use the tokenizer to split the document into sentences
    raw_sentences = tokenizer.tokenize(text.strip())

    # Skip empty sentences and honour the caller's stop word setting
    # (it used to be hard-coded to False and silently ignored).
    return [
        to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [67]:
# Load the English punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): the resource is a pickle file, so the NLTK data directory must be trusted.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Parse the majority opinion into a list of sentences, each a list of words.
# remove_stopwords is left at its default (False), so stop words are kept.
sentences = to_sentences(majority_opinion, tokenizer)

In [68]:
# Sanity check: show the first parsed sentence and the total sentence count.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(sentences[0])
print(len(sentences))

[u'the', u'constitution', u'promises', u'liberty', u'to', u'all', u'within', u'its', u'reach', u'a', u'liberty', u'that', u'includes', u'certain', u'specific', u'rights', u'that', u'allow', u'persons', u'within', u'a', u'lawful', u'realm', u'to', u'define', u'and', u'express', u'their', u'identity']
423


In [69]:
# Import the built-in logging module and configure it so that Word2Vec creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Word vector dimensionality
num_features = 300
# Minimum word count
min_word_count = 5
# Number of threads to run in parallel
num_workers = 4
# Context window size
context = 10
# Downsample setting for frequent words
downsampling = 1e-3

# Initialize and train the model
from gensim.models import word2vec

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling)

# Call init_sims to make the model more memory-efficient (if no further training required)
model.init_sims(replace=True)

# Save the model for later use. Can be loaded later using Word2Vec.load()
# NOTE(review): the name says "40minwords" but min_word_count above is 5. The
# path is left unchanged because other code may load the model by this exact
# name; rename it (and update any loaders) once that is confirmed safe.
model_name = "majority_opinion_300features_40minwords_10context"
model.save(model_name)
print("Done")

Training model...
Done


In [71]:
# Show the words whose vectors are closest to "love" in the trained model.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(model.most_similar("love"))

[(u'of', 0.9975085258483887), (u'same', 0.9973676204681396), (u'it', 0.9973558187484741), (u'sex', 0.9973122477531433), (u'in', 0.997212290763855), (u'marry', 0.9971086978912354), (u'state', 0.9970664978027344), (u'as', 0.9970414638519287), (u'the', 0.997032642364502), (u'right', 0.997018039226532)]
