In [1]:
import os
import re
import nltk
import logging
import urllib2
import pandas as pd
import graphlab as gl
from gensim.models import word2vec
from utils import get_Sukarno_Bandung_speech, convert_pdf_to_txt, speech_to_wordlist, speech_to_sentences

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1474431676.log


This non-commercial license of GraphLab Create for academic use is assigned to kpolimis@u.washington.edu and will expire on July 29, 2017.


In [2]:
# Make the data directory the working directory so the bare file names used
# in later cells resolve there. The path is relative, so this cell is meant
# to run once per fresh kernel.
DATA_DIR = '../data/'
os.chdir(DATA_DIR)

## Get President Sukarno's speeches
* 18th April 1955  
  Bandung, Indonesia
 
 
* 19th December 1961  
  Jogjakarta, Indonesia

In [3]:
# Obtain the Bandung speech PDF through the project helper from utils.
# The recorded output shows the helper returning a (filename, status) tuple
# and reporting 'already exists' when the PDF is already on disk --
# presumably it skips the download in that case; confirm in utils.
get_Sukarno_Bandung_speech()

('sukarno_bandung_speech.pdf', 'already exists')


* Convert the .pdf of speech to text

In [4]:
# Extract the text of the speech PDF (looked up relative to the current
# working directory) with the project's convert_pdf_to_txt helper.
bandung_pdf_name = 'sukarno_bandung_speech.pdf'
sukarno_bandung_speech = convert_pdf_to_txt(bandung_pdf_name)

* Slice off the citation material in the document heading so that only the text of the speech remains

In [5]:
# Number of leading characters of citation material that precede the speech
# proper in the converted PDF text.
CITATION_HEADER_CHARS = 1167
# Opening words of the speech itself, as seen in this cell's recorded output.
SPEECH_OPENING = 'Your Excellencies'

# Trim only while the citation header is still present. The previous
# unconditional self-assignment removed a further 1167 characters every time
# the cell was re-run, silently eating into the speech.
if not sukarno_bandung_speech.startswith(SPEECH_OPENING):
    sukarno_bandung_speech = sukarno_bandung_speech[CITATION_HEADER_CHARS:]

# Preview the first 829 characters to confirm the header is gone.
sukarno_bandung_speech[0:829]

'Your Excellencies,Ladies and Gentlemen, Sisters and Brothers.It is my great honour and privilege on this historic day to bid you welcome to Indonesia. On behalf of the people and government of Indonesia - your hosts - I beg your understanding and forbearance if some circumstances in our country do not meet your expectation. We have, I assure you, done our best to make your stay amongst us memorable for both our guests and your hosts. We hope that the warmth of our welcome will compensate for whatever material shortcomings there may be.As I survey this hall and the distinguished guests gathered here, my heart is filled with emotion. This is the first intercontinental conference of coloured peoples in the history of mankind! I am proud that my country is your host. I am happy that you were able to accept the invitations'

## Word2Vec models
* [Word2vec](https://code.google.com/p/word2vec/), published by Google in 2013, is a neural network implementation that learns [distributed representations](http://www.cs.toronto.edu/~bonner/courses/2014s/csc321/lectures/lec5.pdf) for words. Other deep or recurrent neural network architectures had been proposed for learning word representations prior to this, but the major problem with these was the long time required to train the models. Word2vec learns quickly relative to other models.
* Clean the speech and convert it to a sequence of words

In [6]:
# Turn the whole speech into one flat list of words using the project helper,
# then preview the first ten entries (the recorded output shows lower-cased
# tokens with punctuation removed).
sukarno_bandung_speech_wordlist = speech_to_wordlist(sukarno_bandung_speech)
print(sukarno_bandung_speech_wordlist[:10])

['your', 'excellencies', 'ladies', 'and', 'gentlemen', 'sisters', 'and', 'brothers', 'it', 'is']


In [7]:
# Load NLTK's English punkt tokenizer resource; the resulting object is
# handed to speech_to_sentences in a later cell.
punkt_resource = 'tokenizers/punkt/english.pickle'
tokenizer = nltk.data.load(punkt_resource)

In [8]:
print("Parsing sentences from bandung speech")

# Start from an empty list and append every parsed sentence to it; each
# sentence is itself a list of words (see the first sentence printed in the
# next cell).
sukarno_bandung_speech_sentences = []
sukarno_bandung_speech_sentences.extend(
    speech_to_sentences(sukarno_bandung_speech, tokenizer))

Parsing sentences from bandung speech


In [9]:
# Show the first parsed sentence as a sanity check on the tokenisation.
first_sentence = sukarno_bandung_speech_sentences[0]
print(first_sentence)

['your', 'excellencies', 'ladies', 'and', 'gentlemen', 'sisters', 'and', 'brothers', 'it', 'is', 'my', 'great', 'honour', 'and', 'privilege', 'on', 'this', 'historic', 'day', 'to', 'bid', 'you', 'welcome', 'to', 'indonesia']


In [10]:
# Report how many sentences will be used as training input for the model.
sentence_count = len(sukarno_bandung_speech_sentences)
print("sentences in bandung speech: %d" % sentence_count)

sentences in bandung speech: 204


In [11]:
# Train a Word2Vec model on the parsed Bandung speech sentences and save it.

# Emit INFO-level log records with timestamps while the cell runs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 10   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
print("Training model...")
model = word2vec.Word2Vec(sukarno_bandung_speech_sentences, workers=num_workers,
            size=num_features, min_count=min_word_count,
            window=context, sample=downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
# The name is built from the parameters above so it cannot drift out of sync
# with them; the previous hard-coded name said "40minwords" while
# min_word_count was 10. With the current settings this yields
# "300features_10minwords_10context".
model_name = "%dfeatures_%dminwords_%dcontext" % (
    num_features, min_word_count, context)

# The save location is resolved against the current working directory (an
# earlier cell changed it with os.chdir('../data/')), and this call changes
# the working directory again for every cell that runs afterwards.
os.chdir('../text_analysis/data/')
model.save(model_name)

Training model...


In [12]:
# Print the model's one-line summary (vocabulary size, vector size, alpha).
print(model)

Word2Vec(vocab=83, size=300, alpha=0.025)


In [13]:
# Peek at ten of the words that made it into the model's vocabulary.
vocab_words = model.vocab.keys()
vocab_words[:10]

['and', 'all', 'own', 'give', 'is', 'africa', 'life', 'one', 'as', 'been']

In [14]:
# Ask the model for the words whose vectors are closest to "colonialism".
# NOTE(review): in the recorded output all ten scores are ~0.995 and most
# neighbours are function words ('the', 'and', 'of', 'that'). This suggests
# the vectors are barely differentiated -- plausibly because the corpus is
# only 204 sentences with an 83-word vocabulary -- so treat these neighbours
# as uninformative until the model is trained on more text.
model.most_similar("colonialism")

[('each', 0.9954915642738342),
 ('africa', 0.9954866170883179),
 ('the', 0.9954788684844971),
 ('give', 0.9954684972763062),
 ('world', 0.9954482316970825),
 ('let', 0.9954192638397217),
 ('and', 0.9954022169113159),
 ('of', 0.9953722953796387),
 ('that', 0.9953322410583496),
 ('other', 0.9953094720840454)]