In [1]:
import os
import re
import urllib2
import pandas as pd
import graphlab as gl
import requests
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
from bps_utils import get_Sukarno_Bandung_speech, convert_pdf_to_txt

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1470423137.log


This non-commercial license of GraphLab Create for academic use is assigned to kpolimis@u.washington.edu and will expire on July 29, 2017.


In [2]:
# Make ../data/ (resolved against the current working directory) the working
# directory, so the bare file names used in the following cells resolve there.
os.chdir('../data/')

# Compare  Indonesian political speeches

## Get President Sukarno's speeches
* Bandung Conference  
  18th April 1955  
  Bandung, Indonesia
  
  
* International Meeting  
  19th December 1961    
  Jogjakarta, Indonesia
  

### Bandung speech

In [3]:
# Helper imported from bps_utils; the tuple it returns (shown below) names
# 'sukarno_bandung_speech.pdf'.
# NOTE(review): presumably downloads the PDF when it is missing - confirm in bps_utils.
get_Sukarno_Bandung_speech()

('sukarno_bandung_speech.pdf', 'already exists')


* Convert the .pdf of speech to text

In [4]:
# Extract the text of the Bandung speech PDF with the bps_utils helper; the
# result is sliced as a string in the next cell.
sukarno_bandung_speech = convert_pdf_to_txt('sukarno_bandung_speech.pdf')

* subset the speech to remove citation material in document heading

In [5]:
# Drop the first 1167 characters (the citation header of the extracted text).
# NOTE: the variable is overwritten with its own slice, so running this cell
# a second time trims another 1167 characters.
sukarno_bandung_speech = sukarno_bandung_speech[1167:]
# Preview the first 829 characters of the trimmed speech.
sukarno_bandung_speech[:829]

'Your Excellencies,Ladies and Gentlemen, Sisters and Brothers.It is my great honour and privilege on this historic day to bid you welcome to Indonesia. On behalf of the people and government of Indonesia - your hosts - I beg your understanding and forbearance if some circumstances in our country do not meet your expectation. We have, I assure you, done our best to make your stay amongst us memorable for both our guests and your hosts. We hope that the warmth of our welcome will compensate for whatever material shortcomings there may be.As I survey this hall and the distinguished guests gathered here, my heart is filled with emotion. This is the first intercontinental conference of coloured peoples in the history of mankind! I am proud that my country is your host. I am happy that you were able to accept the invitations'

### Jogjakarta speech 

In [6]:
# Download the Jogjakarta speech page and reduce it to a single-line ASCII string.
sukarno_jogyakarta_url = 'http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html'
# A timeout keeps the notebook from hanging forever on an unresponsive server.
sukarno_jogyakarta_url_requests = requests.get(sukarno_jogyakarta_url, timeout=30)
# Fail loudly on an HTTP error: otherwise the error page would be parsed and
# sliced by the fixed offsets in the next cell as if it were the speech.
sukarno_jogyakarta_url_requests.raise_for_status()
sukarno_jogyakarta_url_html_content = sukarno_jogyakarta_url_requests.text
sukarno_jogyakarta_url_soup = BeautifulSoup(sukarno_jogyakarta_url_html_content, 'lxml')
# Strip the markup, then remove blank lines (double newlines) from the UTF-8 bytes.
sukarno_jogyakarta_speech_text = sukarno_jogyakarta_url_soup.get_text().encode('utf8').replace('\n\n','')
sukarno_jogyakarta_speech_unicode = UnicodeDammit(sukarno_jogyakarta_speech_text, ["ascii"])
# Discard every non-ASCII character.
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech_unicode.unicode_markup.encode('ascii', 'ignore').decode('ascii')
# Remove the remaining line breaks (both LF and CR) without inserting spaces.
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech.replace("\n","").replace("\r", "")


* subset the speech to remove citation material in document heading and ending

In [7]:
# Keep only the speech body: skip the first 693 characters and the last 58.
# NOTE: the variable is overwritten with its own slice, so running this cell
# a second time trims the text again.
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech[693:-58]
# Preview the first 100 characters.
sukarno_jogyakarta_speech[:100]

u'As was said by the Sultan just now, today, it is exactly 15 yearssince the day on which the city of '

## President Jokowi speeches

* Inauguration  
  20th October 2014  
  Jakarta, Indonesia

In [8]:
# Download the inauguration speech page and reduce it to a single-line ASCII string.
jokowi_inauguration_url = 'http://kbriseoul.kr/kbriseoul/index.php/en/2013-01-13-22-22-09/embassy-news/308-jokowi-inauguration-speech.io'
# A timeout keeps the notebook from hanging forever on an unresponsive server.
jokowi_inauguration_request = requests.get(jokowi_inauguration_url, timeout=30)
# Fail loudly on an HTTP error: otherwise the error page would be parsed and
# sliced by the fixed offsets in the next cell as if it were the speech.
jokowi_inauguration_request.raise_for_status()
jokowi_inauguration_html_content = jokowi_inauguration_request.content
jokowi_inauguration_url_soup = BeautifulSoup(jokowi_inauguration_html_content, 'lxml')
# Strip the markup, then remove blank lines (double newlines) from the UTF-8 bytes.
jokowi_inauguration_speech_text = jokowi_inauguration_url_soup.get_text().encode('utf8').replace('\n\n','')
jokowi_inauguration_speech_unicode = UnicodeDammit(jokowi_inauguration_speech_text, ["ascii"])
# Discard every non-ASCII character.
jokowi_inauguration_speech = jokowi_inauguration_speech_unicode.unicode_markup.encode('ascii', 'ignore').decode('ascii')
# Remove the remaining newlines without inserting spaces.
jokowi_inauguration_speech = jokowi_inauguration_speech.replace("\n","")


* subset the speech to remove citation material in document heading and ending

In [9]:
# Keep only the speech body: skip the first 1312 characters and the last 30.
# NOTE: the variable is overwritten with its own slice, so running this cell
# a second time trims the text again.
jokowi_inauguration_speech = jokowi_inauguration_speech[1312:-30]
# Preview the first 100 characters.
jokowi_inauguration_speech[:100]

u'Assalamualaikum Warahmatullahi Wabarakatuh,Peace be upon us.Om Swastiastu, Namo Buddhaya  Honorable '

* Convert the text to a pandas dataframe 
* easy to transform pandas dataframe to graphlab sframe
* for more info on text analysis with turi sframes see this [documentation](https://turi.com/learn/userguide/text/intro.html)

In [10]:
# Collect the three speeches into a single-column DataFrame, one row per
# speech, labelled 0..2 in the order Bandung, Jogjakarta, inauguration.
index = [0, 1, 2]
speech_data = {
    'speech': [
        sukarno_bandung_speech,
        sukarno_jogyakarta_speech,
        jokowi_inauguration_speech,
    ],
}
speech_df = pd.DataFrame(speech_data, index=index)

In [11]:
# Display the DataFrame of speeches (one row per speech).
speech_df

Unnamed: 0,speech
0,"Your Excellencies,Ladies and Gentlemen, Sister..."
1,"As was said by the Sultan just now, today, it ..."
2,"Assalamualaikum Warahmatullahi Wabarakatuh,Pea..."


In [12]:
# Directory under which the SFrame is saved in the cell below.
# NOTE(review): the path is hard-coded relative to the author's home directory;
# adjust it when running from a different checkout location.
BASE_DIR = "~/repos/statistics-indonesia-python/text_analysis/data" 

## Bag-of-words

* each document is represented by a map where the words are keys and the values are the number of occurrences.
* use pandas data frame to create graphlab sframe
* save sframe 
* transform sframe to bag-of-words (bow) model

In [13]:
# Convert the pandas frame into a GraphLab SFrame and persist it under BASE_DIR.
sf = gl.SFrame(data=speech_df)
sframe_path = "%s/indonesian_speeches.sframe" % BASE_DIR
sf.save(sframe_path)

In [14]:
# Count word occurrences per speech and keep the counts as a new 'bow' column.
speech_column = sf['speech']
bow = gl.text_analytics.count_words(speech_column)
sf['bow'] = bow
# Show the bag-of-words column.
bow

dtype: dict
Rows: 3

## TF-IDF

* Another useful representation for text data is called TF-IDF (term frequency - inverse document frequency).
* This is a modification of the bag-of-words format where the counts are transformed into scores: 
    * words that are common across the document corpus are given low scores,  
    rare words occurring often in a document are given high scores

* $\text{TF-IDF}(w, d) = N(w, d) \cdot \log\left(\dfrac{1}{\sum_{d'} N(w, d')}\right)$
    * where N(w, d) is the number of times word w occurs in document d

In [15]:
# Turn the raw word counts into TF-IDF scores, stored as a new 'tfidf' column.
bow_column = sf['bow']
sf['tfidf'] = gl.text_analytics.tf_idf(bow_column)
# Show the TF-IDF column.
sf['tfidf']

dtype: dict
Rows: 3

## Text cleaning

* remove all words that do not occur at least twice in each document using `dict_trim_by_values`

In [16]:
# Per document, keep only the words whose count is at least 2.
docs = sf['bow'].dict_trim_by_values(lower=2)

* GraphLab Create also contains a helper function called `stopwords` that returns a list of common words.
We can use `SArray.dict_trim_by_keys` to remove these words from the documents as a preprocessing step.
NB: Currently only English words are available.

In [17]:
# Remove GraphLab's built-in stopwords from every document's word counts;
# exclude=True drops the listed keys instead of keeping them.
english_stopwords = gl.text_analytics.stopwords()
docs = docs.dict_trim_by_keys(english_stopwords, exclude=True)

In [18]:
# Inspect the trimmed word counts of the first document (row 0, the Bandung speech).
docs[0]

{'"live': 4,
 '-': 29,
 '/': 6,
 '21/09/2015': 6,
 '8': 7,
 'achieved': 2,
 'act': 4,
 'affairs': 3,
 'africa': 13,
 'africa,': 3,
 'african': 6,
 'ago': 5,
 'ago,': 2,
 'aims': 2,
 'alien': 2,
 'allowed': 2,
 'and,': 2,
 'answer': 2,
 'anti-colonial': 2,
 'appreciation': 2,
 'asia': 21,
 'asian': 10,
 'asian-african': 7,
 'assembled': 2,
 'attained': 2,
 'battle': 3,
 'bear': 3,
 'beg': 3,
 'behalf': 2,
 'beliefs,': 3,
 'bid': 2,
 'birthplaces': 2,
 'bitter': 2,
 'blessing': 2,
 'bombs,': 2,
 'bonds': 4,
 'brothers,': 6,
 'called': 4,
 'cannot,': 2,
 'causes,': 2,
 'changing': 2,
 'children': 2,
 'classic': 2,
 'clear': 3,
 'code': 2,
 'colonialism': 8,
 'colonialism.': 2,
 'comfort': 3,
 'common': 9,
 'completely': 3,
 'concern': 2,
 'conference': 22,
 'conference,': 3,
 'conflict': 2,
 'content': 3,
 'continents': 5,
 'continents.': 3,
 'control': 4,
 'control,': 2,
 'countries': 11,
 'countries.': 2,
 'country': 5,
 'courage': 3,
 'cultural': 2,
 'danger': 2,
 'day': 2,
 'days': 2,

* The tokenizer transforms each row into an ordered list of strings that represents a simpler version of the Penn-Tree-Bank-style (PTB-style) tokenization of that row's document. 
* The representation of a document provided by PTB-style tokenization is essential for sequence-tagging, parsing, bag-of-words treatment, and any text analytics task that requires word-level granularity. For a description of this style of tokenization, see this [tokenization example](https://www.cis.upenn.edu/~treebank/tokenization.html).

In [19]:
# Tokenize every speech and hold the token lists in a one-column SFrame.
tokenized_speech = gl.SFrame({'tokens': gl.text_analytics.tokenize(sf['speech'])})
tokenized_speech

tokens
"[Your, Excellencies,Ladies, ..."
"[As, was, said, by, the, Sultan, just, now,, ..."
"[Assalamualaikum, Warahmatullahi, ..."


## Part of Speech Extraction

* Highlight unique nouns in your text, identify adjectives with the high sentiment scores, or pull out nouns to generate candidate entities. 
The extract_parts_of_speech method parses the text in each element and extracts the words that are a given part of speech. 
For instance, to find all instances of adjectives:

In [20]:
# Extract the adjectives of every speech into a one-column SFrame.
adjective_tag = gl.text_analytics.PartOfSpeech.ADJ
adjectives = gl.text_analytics.extract_parts_of_speech(sf['speech'], chosen_pos=[adjective_tag])
parts_of_speech = gl.SFrame({'adjectives': adjectives})
parts_of_speech

adjectives
"{'ADJ': {'exclusive': 1, 'all': 7, 'manifold': 1, ..."
"{'ADJ': {'all': 4, 'true': 2, 'done?Oh': 1, ..."
"{'ADJ': {'spiritual': 1, 'own': 2, ..."


## Sentence Splitting

* The sentence splitter splits by sentence and outputs a list of sentences. 
This aids in analysis at the sentence level.
For example, you may want a sentiment score for each sentence in a document.
The following command accomplishes this for you:

In [21]:
# Split every speech into sentences and hold the lists in a one-column SFrame.
sentences = gl.SFrame({'sent': gl.text_analytics.split_by_sentence(sf['speech'])})
sentences

sent
"[Your Excellencies,Ladies and Gentlemen, Sisters ..."
"[As was said by the Sultan just now, today, ..."
[Assalamualaikum Warahmatullahi ...


## Create a topic model

In [22]:
# Rebuild `docs` for topic modelling: full word counts per speech with the
# stopwords removed. This replaces the `docs` produced in the text-cleaning
# section, so the minimum-count trim applied there is not carried over.
word_counts = gl.text_analytics.count_words(sf['speech'])
docs = word_counts.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)

* Learn topic model

In [23]:
# Train a topic model on the stopword-filtered word counts. No settings are
# passed, so the library defaults apply (the model summary printed further
# down reports the values used).
model = gl.topic_model.create(docs)

In [24]:
# Print the table of most probable words per topic (topic, word, score).
print(model.get_topics())

+-------+-----------+-----------------+
| topic |    word   |      score      |
+-------+-----------+-----------------+
|   0   |  struggle | 0.0256988277728 |
|   0   |    made   | 0.0196874060715 |
|   0   |   asian   | 0.0151788397956 |
|   0   |     8     | 0.0106702735197 |
|   0   |  foreign  | 0.0106702735197 |
|   1   |  republic | 0.0495191057967 |
|   1   |  command  |  0.018325968287 |
|   1   |   armed   | 0.0157265401612 |
|   1   | president | 0.0157265401612 |
|   1   |   nation  | 0.0144268260983 |
+-------+-----------+-----------------+
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [25]:
# Print the same topics as one word list per topic.
print(model.get_topics(output_type='topic_words'))

+-------------------------------+
|             words             |
+-------------------------------+
| [struggle, made, asian, 8,... |
| [republic, command, presid... |
| [-, indonesia,, peoples, a... |
| [independence, hope, parts... |
| [people, dutch, nations, i... |
| [spirit, national, common,... |
| [conference, world, africa... |
| [west, irian, indonesian, ... |
| [great, flag, colonialism,... |
| [united, yes,, red, contin... |
+-------------------------------+
[10 rows x 1 columns]



In [26]:
# Display the model summary (vocabulary size, settings, accessible fields).
model

Class                          : TopicModel

Schema
------
Vocabulary Size                : 2674

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 1.0203
Verbose                        : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

* To predict the topic of a given document, one can get an SArray of integers containing the most probable topic ids:


In [27]:
# Most probable topic id for each document.
# NOTE: `pred` is reassigned to per-topic probabilities in a later cell.
pred = model.predict(docs)

* Combining the above method with standard SFrame capabilities, one can use predict to find documents related to a particular topic


In [28]:
# Select the documents whose predicted topic id is 0. The prediction is
# computed afresh here rather than taken from `pred`.
predicted_topic_ids = model.predict(docs)
docs_in_topic_0 = docs[predicted_topic_ids == 0]

In [29]:
# Request per-topic probabilities instead of a single topic id per document.
# NOTE: this overwrites the topic-id `pred` assigned in an earlier cell.
pred = model.predict(docs, output_type='probability')
# Show one probability array per document.
pred

dtype: array
Rows: 3
[array('d', [0.12631578947368421, 0.05308924485125858, 0.13135011441647598, 0.140045766590389, 0.08787185354691075, 0.08466819221967964, 0.15606407322654461, 0.0517162471395881, 0.12631578947368421, 0.042562929061784896]), array('d', [0.04678362573099415, 0.13032581453634084, 0.08897243107769423, 0.04218880534670008, 0.13909774436090225, 0.05889724310776942, 0.0948203842940685, 0.18671679197994986, 0.05639097744360902, 0.1558061821219716]), array('d', [0.05421686746987952, 0.20481927710843373, 0.09437751004016064, 0.03413654618473896, 0.1144578313253012, 0.21285140562248997, 0.07228915662650602, 0.11244979919678715, 0.07429718875502007, 0.02610441767068273])]

In [30]:
# SArray of the words in the model's vocabulary.
model['vocabulary']

dtype: str
Rows: 2674
['!', 'speed', 'nations!bismillah', 'opened,', 'declare', 'million', 'president', 'africans', 'asians', 'majority.and', "humanity's", 'degradation,', 'liberation', 'solong', 'diminished', 'blessing', 'remember', 'future.', 'circumstances.let', 'flints', 'strike', 'deliberations', 'hard.', 'easy.', '"to', 'sons:', 'attain', 'safeguarded', 'evidence', 'afternoon-tea', 'abroad:', 'lie', 'falsify', 'achieve.', 'worthwhile,', 'happen,', 'problems', "other's", 'understand', 'making', 'other.if', "others'", 'profit', 'roots.', 'experience,', 'neighbours.', 'confidence.', 'warm', 'unfriendly', 'born', 'looked', 'world.failure', 'presence', 'likelihood', 'pillar', 'welfare', 'effect', 'harmony,', 'way,in', 'hold', 'discussion,ways', 'friendly,', 'brings', 'diversity"', 'nation.so,', 'motto', 'god,', 'toradjas,', 'bhinneka', 'madurese,', 'bataks,', 'achenese,', 'units,', 'ethnic', 'moreover,', 'christians,', 'wehave', 'acting', 'lives,', 'large', 'hardwon', 'bulwark', 'sour

In [31]:
# SFrame containing the topics: per-word topic probabilities alongside the vocabulary.
model['topics']

topic_probabilities,vocabulary
"[0.000150285542531, 0.0014296854692, ...",!
"[0.000150285542531, 0.000129971406291, ...",speed
"[0.00165314096784, 0.000129971406291, ...",nations!bismillah
"[0.000150285542531, 0.000129971406291, ...","opened,"
"[0.000150285542531, 0.000129971406291, ...",declare
"[0.000150285542531, 0.000129971406291, ...",million
"[0.000150285542531, 0.0157265401612, ...",president
"[0.000150285542531, 0.000129971406291, ...",africans
"[0.000150285542531, 0.000129971406291, ...",asians
"[0.000150285542531, 0.000129971406291, ...",majority.and


* save and load models

In [32]:
# Move to ../text_analysis/data/ (resolved against the current working
# directory, which an earlier cell set to ../data/), so the model below is
# saved there.
# NOTE(review): presumably the same directory as BASE_DIR - confirm.
os.chdir("../text_analysis/data/")

In [33]:
# Persist the trained topic model in the current working directory, then
# load it back from the same path.
model_path = 'indonesian_speeches_model'
model.save(model_path)
sukarno_model = gl.load_model(model_path)