In [1]:
import os
import re
import pdfkit
import urllib2
import pandas as pd
import graphlab as gl
import requests
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
from bps_utils import get_Sukarno_Bandung_speech, convert_pdf_to_txt

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1470679441.log


This non-commercial license of GraphLab Create for academic use is assigned to kpolimis@u.washington.edu and will expire on July 29, 2017.


In [2]:
# Switch the working directory to the sibling data folder so the relative file
# names used in later cells resolve there. The path is relative to wherever the
# kernel currently is, so running this cell a second time moves again.
os.chdir('../data/')

## Get President Sukarno's Jogjakarta Speech
* [19th December 1961](http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html)  
  Jogjakarta, Indonesia

* Chris Albon tutorial on [beautiful soup](http://chrisalbon.com/python/beautiful_soup_html_basics.html)

In [3]:
# Download the speech page and reduce it to a single-line, ASCII-only string.
sukarno_jogyakarta_url = 'http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html'
sukarno_jogyakarta_url_requests = requests.get(sukarno_jogyakarta_url)
sukarno_jogyakarta_url_html_content = sukarno_jogyakarta_url_requests.text
# Parse the HTML with the lxml backend, keep only the text, encode it as UTF-8
# and drop every pair of consecutive newlines.
sukarno_jogyakarta_url_soup = BeautifulSoup(sukarno_jogyakarta_url_html_content, 'lxml')
sukarno_jogyakarta_speech_text = sukarno_jogyakarta_url_soup.get_text().encode('utf8').replace('\n\n','')
# Hand the bytes to UnicodeDammit with ["ascii"] as its second argument
# (presumably the encodings to try first -- confirm against the bs4 docs),
# then discard every character that cannot be encoded as ASCII.
sukarno_jogyakarta_speech_unicode = UnicodeDammit(sukarno_jogyakarta_speech_text, ["ascii"])
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech_unicode.unicode_markup.encode('ascii', 'ignore').decode('ascii')
# NOTE(review): the remaining line breaks are removed without inserting a space,
# which appears to fuse words that were split across lines (e.g. "yearssince"
# in the preview printed by the next cell). Substituting " " instead would fix
# that, but it would also shift the hard-coded slice offsets used in the next
# cell, so both would have to change together.
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech.replace("\n","").replace("\r", "")

* subset the speech to remove citation material in document heading and ending

In [4]:
# Cut away the citation header (first 693 characters) and footer (last 353
# characters). The offsets are tied to this page's cleaned text, and because
# the result is stored under the same name, running the cell again trims again.
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech[693:-353]
# Preview the first 100 characters of what is left.
sukarno_jogyakarta_speech[:100]

u'As was said by the Sultan just now, today, it is exactly 15 yearssince the day on which the city of '

* save the speech to a text file 

In [5]:
# Persist the cleaned speech as UTF-8 text in the current working directory.
# The context manager closes the handle even if the write raises, which the
# previous open()/write()/close() sequence did not guarantee.
with open('sukarno_jogyakarta_speech.txt', 'w') as f:
    f.write(sukarno_jogyakarta_speech.encode('utf8'))

In [6]:
# Optional: also archive the source page as a PDF. Left disabled because the
# pdfkit call below needs wkhtmltox to be installed in order to work.
#pdfkit.from_url('http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html', 'sukarno_jogjakarta_speech.pdf')

* Convert the text to a pandas dataframe 
* easy to transform pandas dataframe to graphlab sframe
* for more info on text analysis with turi sframes see this [documentation](https://turi.com/learn/userguide/text/intro.html)

In [7]:
# One-row frame: the whole speech is a single document stored in the 'speech'
# column, with 1 as its row label.
index = [1]
speech_data = {'speech': sukarno_jogyakarta_speech}
speech_df = pd.DataFrame(speech_data, index=index)

In [8]:
# Directory the SFrame built later in the notebook is saved under. The leading
# "~" is expanded here, once, so every consumer receives an already-resolved
# path and does not have to perform the home-directory expansion itself.
BASE_DIR = os.path.expanduser("~/repos/statistics-indonesia-python/text_analysis/data")

## Bag-of-words

* each document is represented by a map where the words are keys and the values are the number of occurrences.
* use pandas data frame to create graphlab sframe
* save sframe 
* transform sframe to bag-of-words (bow) model

In [9]:
# Wrap the one-row pandas frame in a GraphLab SFrame and save it under BASE_DIR.
sf = gl.SFrame(data=speech_df)
sf.save("%s/sukarno_jogyakarta.sframe" % BASE_DIR)

In [10]:
# Count word occurrences for each row of the 'speech' column; keep the result
# both as a new 'bow' column on the SFrame and as a standalone variable.
sf['bow'] = bow = gl.text_analytics.count_words(sf['speech'])
# Display the per-document word -> count dictionaries.
bow

dtype: dict
Rows: 1
[{'wasplayed': 1, 'ofall': 1, 'unfurl': 6, 'shallgive': 1, 'up,': 4, 'colonisation,': 1, 'colonisation.': 1, "mook's": 1, 'under': 11, 'theindonesian': 5, 'now,that': 1, 'every': 1, 'asia,': 1, 'today,': 2, 'strangleholdof': 1, 'merauke,': 1, 'completed.of': 1, 'therepresentatives': 1, 'politicalparties,': 1, 'enjoy': 1, 'resolution.in': 1, 'leaders': 3, 'miseries,although': 1, 'me,': 1, 'me.': 1, 'whichthose': 1, 'second': 3, 'pledgesince': 1, 'others.': 1, 'air': 1, 'even': 6, 'colonialism': 1, 'religious': 1, 'dr.': 3, 'new': 4, 'officially': 1, 'admiral': 1, 'told': 2, 'sukarno': 4, '"at': 1, 'here': 5, 'hundreds': 5, 'met': 1, 'active': 1, 'police.': 1, 'dressthemselves': 1, 'military': 9, 'settled': 1, 'deceived,': 2, 'ofour': 1, 'brought': 1, 'total': 2, 'would': 8, 'army': 1, 'old.yes,': 1, 'republic.these': 1, 'receivedmy': 1, 'call': 2, 'therefore': 1, '?even': 1, 'right,': 1, 'until': 2, 'hereby': 2, 'hold': 2, 'must': 2, 'me': 5, 'pursue': 2, 'yearsago':

## TF-IDF

* Another useful representation for text data is called TF-IDF (term frequency - inverse document frequency).
* This is a modification of the bag-of-words format where the counts are transformed into scores: 
    * words that are common across the document corpus are given low scores,  
    rare words occurring often in a document are given high scores

* $\text{TF-IDF}(w, d) = N(w, d) \cdot \log\left(1 / \sum_{d'} N(w, d')\right)$
    * where N(w, d) is the number of times word w occurs in document d

In [11]:
# Turn the bag-of-words counts into TF-IDF scores and store them as a new column.
# NOTE(review): every score in the displayed output is 0.0. The SFrame has a
# single row, so presumably the inverse-document-frequency factor vanishes when
# there is only one document -- confirm against the tf_idf documentation.
sf['tfidf'] = gl.text_analytics.tf_idf(sf['bow'])
sf['tfidf']

dtype: dict
Rows: 1
[{'wasplayed': 0.0, 'ofall': 0.0, 'unfurl': 0.0, 'shallgive': 0.0, 'up,': 0.0, 'colonisation,': 0.0, 'colonisation.': 0.0, "mook's": 0.0, 'under': 0.0, 'theindonesian': 0.0, 'now,that': 0.0, 'every': 0.0, 'asia,': 0.0, 'today,': 0.0, 'strangleholdof': 0.0, 'merauke,': 0.0, 'completed.of': 0.0, 'therepresentatives': 0.0, 'politicalparties,': 0.0, 'enjoy': 0.0, 'resolution.in': 0.0, 'leaders': 0.0, 'miseries,although': 0.0, 'me,': 0.0, 'me.': 0.0, 'whichthose': 0.0, 'second': 0.0, 'pledgesince': 0.0, 'others.': 0.0, 'air': 0.0, 'even': 0.0, 'colonialism': 0.0, 'religious': 0.0, 'dr.': 0.0, 'new': 0.0, 'officially': 0.0, 'admiral': 0.0, 'told': 0.0, 'sukarno': 0.0, '"at': 0.0, 'here': 0.0, 'hundreds': 0.0, 'met': 0.0, 'active': 0.0, 'police.': 0.0, 'dressthemselves': 0.0, 'military': 0.0, 'settled': 0.0, 'deceived,': 0.0, 'ofour': 0.0, 'brought': 0.0, 'total': 0.0, 'would': 0.0, 'army': 0.0, 'old.yes,': 0.0, 'republic.these': 0.0, 'receivedmy': 0.0, 'call': 0.0, 'there

## Text cleaning

* remove all words that do not occur at least twice in each document using `SArray.dict_trim_by_values`

In [12]:
# Keep only the words whose count is at least 2 (the bound passed here); words
# with a count of exactly 2 still appear in the output shown further down.
docs = sf['bow'].dict_trim_by_values(2)

* GraphLab Create also contains a helper function called stopwords that returns a list of common words.
We can use `SArray.dict_trim_by_keys` to remove these words from the documents as a preprocessing step.
NB: Currently only English words are available.

In [13]:
# Remove the stop words returned by GraphLab's helper from each word-count
# dictionary; exclude=True drops the listed keys instead of keeping them.
docs = docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)

In [14]:
# Inspect the trimmed word counts of the first document.
docs[0]

{'"papua': 4,
 '"state': 5,
 '-': 17,
 '16': 2,
 '17th': 5,
 '1945': 3,
 '1947,': 3,
 '1949,': 4,
 '19th': 4,
 '21st': 3,
 '27th': 4,
 ':': 3,
 '?': 2,
 'act.': 2,
 'action': 7,
 'actions': 2,
 'ago': 2,
 'all,': 2,
 'almighty': 3,
 'ambassadors': 2,
 'anthem".': 2,
 'archipelago': 2,
 'arenot': 2,
 'argument': 2,
 'armed': 10,
 'association': 2,
 'attention': 2,
 'august': 4,
 'authority': 9,
 'beenreturned': 2,
 'before.': 2,
 'began': 2,
 'begun': 2,
 'bothered': 2,
 'bring': 8,
 'call': 2,
 'called': 6,
 'carried': 2,
 'carrying': 5,
 'change': 2,
 'chief': 2,
 'city': 2,
 'command': 13,
 'command,': 2,
 'command?': 2,
 'commander': 3,
 'common': 3,
 'completely': 2,
 'conference': 2,
 'conference,': 3,
 'confirm': 3,
 'confront': 5,
 'confrontation': 3,
 'continuing': 9,
 'continuingin': 2,
 'council': 3,
 'countries,': 2,
 'de': 2,
 'deal': 3,
 'deceived,': 2,
 'december': 4,
 'december,': 3,
 'deeper,': 3,
 'defeat': 7,
 'defeated': 2,
 'defence': 2,
 'defend': 2,
 'diplomacy,':

* the tokenizer transforms each row into an ordered list of strings that represents a simpler version of the Penn-Tree-Bank-style (PTB-style) tokenization of that row's document. 
* The representation of a document provided by PTB-style tokenization is essential for sequence-tagging, parsing, bag-of-words treatment, and any text analytics task that requires word-level granularity. For a description of this style of tokenization, see this [tokenization example](https://www.cis.upenn.edu/~treebank/tokenization.html).

In [15]:
# Tokenize each row of the 'speech' column and hold the result in a fresh
# one-column SFrame named 'tokens'.
tokenized_speech = gl.SFrame({
    'tokens': gl.text_analytics.tokenize(sf['speech'])
})
tokenized_speech

tokens
"[As, was, said, by, the, Sultan, just, now,, ..."


## Part of Speech Extraction

* Highlight unique nouns in your text, identify adjectives with the high sentiment scores, or pull out nouns to generate candidate entities. 
The extract_parts_of_speech method parses the text in each element and extracts the words that are a given part of speech. 
For instance, to find all instances of adjectives:

In [16]:
# Extract the words tagged as adjectives from each row of the 'speech' column
# and hold them in a fresh one-column SFrame named 'adjectives'.
parts_of_speech = gl.SFrame({
    'adjectives': gl.text_analytics.extract_parts_of_speech(
        sf['speech'],
        chosen_pos=[gl.text_analytics.PartOfSpeech.ADJ]
    )
})
parts_of_speech

adjectives
"{'ADJ': {'all': 4, 'true': 2, 'done?Oh': 1, ..."


## Sentence Splitting

* The sentence splitter splits by sentence and outputs a list of sentences. 
This aids in analysis at the sentence level.
For example, you may want a sentiment score for each sentence in a document.
The following command accomplishes this for you:

In [17]:
# Split each row of the 'speech' column into a list of sentences and hold the
# result in a fresh one-column SFrame named 'sent'.
sentences = gl.SFrame({
    'sent': gl.text_analytics.split_by_sentence(sf['speech'])
})
sentences

sent
"[As was said by the Sultan just now, today, ..."


## Create a topic model

In [18]:
# Rebuild `docs` from the raw speech for the topic model: count the words, then
# drop the stop words. This replaces the `docs` built in the text-cleaning
# section, so the minimum-count filter applied there is not in effect here.
docs = (
    gl.text_analytics.count_words(sf['speech'])
    .dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)
)

* Learn topic model

In [19]:
# Fit a topic model on the word counts with the library's default settings
# (the model summary displayed later reports 10 topics and 10 iterations).
# NOTE(review): no seed is set in this notebook, so the learned topics may
# differ between runs -- confirm whether training is randomized.
model = gl.topic_model.create(docs)

In [20]:
# Show the highest-scoring words per topic. The call is parenthesised so the
# line is valid under both Python 2 and Python 3; the printed text is the same.
print(model.get_topics())

+-------+------------+-----------------+
| topic |    word    |      score      |
+-------+------------+-----------------+
|   0   |   irian    | 0.0590542601874 |
|   0   |    red     | 0.0220091523208 |
|   0   | indonesia, | 0.0198300283286 |
|   0   |   white    | 0.0198300283286 |
|   0   |  nations   | 0.0176509043365 |
|   1   |   state    | 0.0557286258221 |
|   1   |   nation   | 0.0280373831776 |
|   1   |   years    | 0.0176531671859 |
|   1   |   puppet   | 0.0176531671859 |
|   1   |  papua",   | 0.0176531671859 |
+-------+------------+-----------------+
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [21]:
# Show each topic as a list of its top words. The call is parenthesised so the
# line is valid under both Python 2 and Python 3; the printed text is the same.
print(model.get_topics(output_type='topic_words'))

+-------------------------------+
|             words             |
+-------------------------------+
| [irian, red, indonesia,, w... |
| [state, nation, puppet, ye... |
| [dutch, -, united, now,, a... |
| [indonesia, friends,, nati... |
| [spirit, indonesia., armed... |
| [irian,, flying, military,... |
| [people, flag, struggle, e... |
| [west, round, people,, set... |
| [indonesian, command, yes,... |
| [republic, territory, cont... |
+-------------------------------+
[10 rows x 1 columns]



In [22]:
# Display the model summary (vocabulary size, settings, training time).
model

Class                          : TopicModel

Schema
------
Vocabulary Size                : 1309

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 1.0142
Verbose                        : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

* To predict the topic of a given document, one can get an SArray of integers containing the most probable topic ids:


In [23]:
# Predict the most probable topic id for each document in `docs`.
pred = model.predict(docs)

* Combining the above method with standard SFrame capabilities, one can use predict to find documents related to a particular topic


In [24]:
# Boolean-mask `docs` down to the documents whose predicted topic id is 0.
# predict() is called again here rather than reusing `pred` from the cell above.
docs_in_topic_0 = docs[model.predict(docs) == 0]

In [25]:
# Request per-topic probabilities instead of a single topic id. This rebinds
# `pred`, replacing the topic ids stored under that name by an earlier cell.
pred = model.predict(docs, output_type='probability')
pred

dtype: array
Rows: 1
[array('d', [0.13122362869198312, 0.07130801687763713, 0.14261603375527426, 0.10253164556962026, 0.07974683544303797, 0.08270042194092828, 0.09873417721518987, 0.10717299578059072, 0.09451476793248945, 0.08945147679324894])]

In [26]:
# Display the SArray of words in the model's vocabulary.
model['vocabulary']

dtype: str
Rows: 1309
['nation.may', 'land.3.', 'make.2.', 'toexecute', 'land', 'colonialism', 'uponcontinuing', 'haveinstructed', 'context', 'command.[15]we,', 'read', 'indonesianpeople.the', 'executed', 'sign', 'imperialists.i', 'carry', 'letus', 'now,defeat', 'peace,', 'cemeteries', 'continuingin', "hero's", 'lie', 'bymachine-gun', 'killed', 'monginsidi', 'bringthe', 'tjik', 'bondjol.', 'thespirit', 'power', "coen's", 'jan', 'hanjokrokusumo,', 'djelantik.', 'agung', 'i,', 'fight', 'hearts', 'continuing', 'indonesia?i', 'isyour', 'liberation', 'forcesof', 'commandto', "sukarno's", 'support', 'desire', 'expression', 'supreme', 'giving', 'groups', 'ourfighters', 'workers,', 'groups.', 'ofall', 'me.', 'mouthpieceof', 'theauthority', 'urges', 'briefly,', 'die.', 'speculating,oh', 'prayed', 'sukarnodies.', "let's", 'ill', 'and,when', 'already.', 'wereno', 'sukarno.', "that'sonly", 'no!some', 'djokountung', 'speaking.', 'onward!and,', 'helping', 'africa,', 'saidrepeatedly,', 'but,', 'indon

In [27]:
# Display the SFrame of topics: one row per vocabulary word together with its
# topic probabilities.
model['topics']

topic_probabilities,vocabulary
"[0.000217912399216, 0.000346140533056, ...",nation.may
"[0.000217912399216, 0.00380754586362, ...",land.3.
"[0.000217912399216, 0.000346140533056, ...",make.2.
"[0.000217912399216, 0.000346140533056, ...",toexecute
"[0.000217912399216, 0.000346140533056, ...",land
"[0.00239703639137, 0.000346140533056, ...",colonialism
"[0.000217912399216, 0.000346140533056, ...",uponcontinuing
"[0.000217912399216, 0.000346140533056, ...",haveinstructed
"[0.00239703639137, 0.000346140533056, ...",context
"[0.00239703639137, 0.000346140533056, ...","command.[15]we,"


* save and load models

In [28]:
# Move to the text_analysis data folder before saving the model. The path is
# relative to the current working directory (changed to '../data/' near the top
# of the notebook), so running this cell a second time resolves it from the new
# location instead.
os.chdir("../text_analysis/data/")

In [29]:
# Save the trained topic model into the current working directory, then load it
# back from the same path under a separate name.
model.save('sukarno_jogyakarta_model')
sukarno_model = gl.load_model('sukarno_jogyakarta_model')