In [1]:
import os
import re
import pdfkit
import urllib2
import pandas as pd
import graphlab as gl
import requests
from bs4 import BeautifulSoup
from bps_utils import get_Sukarno_Bandung_speech, convert_pdf_to_txt

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1470084091.log


This non-commercial license of GraphLab Create for academic use is assigned to kpolimis@u.washington.edu and will expire on July 29, 2017.


In [2]:
os.chdir('../data/')

## Get President Sukarno's Jogjakarta Speech
* [19th December 1961](http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html)  
  Jogjakarta, Indonesia

* Chris Albon tutorial on [beautiful soup](http://chrisalbon.com/python/beautiful_soup_html_basics.html)

In [3]:
# Create a variable with the url
url = 'http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html'

# Use requests to get the contents
r = requests.get(url)

# Get the text of the contents
html_content = r.text

# Convert the html content into a beautiful soup object
soup = BeautifulSoup(html_content, 'lxml')

In [4]:
sukarno_jogyakarta_speech = soup.get_text().replace('\n\n','')
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech.replace('\r\n', " ")

* subset the speech to remove citation material in document heading and ending

In [5]:
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech[708:len(sukarno_jogyakarta_speech)-66]

* save the speech to a text file 

In [6]:
f = open('sukarno_jogjakarta_speech.txt', 'w')
f.write(sukarno_jogyakarta_speech.encode('utf8'))
f.close()

In [7]:
#need wkhtmltox to work
#pdfkit.from_url('http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html', 'sukarno_jogjakarta_speech.pdf')

* Convert the text to a pandas dataframe 
* easy to transform pandas dataframe to graphlab sframe
* for more info on text analysis with turi sframes see this [documentation](https://turi.com/learn/userguide/text/intro.html)

In [8]:
speech_data = {'speech': sukarno_jogyakarta_speech}
index= [1]
speech_df = pd.DataFrame(data = speech_data, index = index)

In [9]:
BASE_DIR = "~/repos/statistics-indonesia-python/text_analysis/data" 

## Bag-of-words

* each document is represented by a map where the words are keys and the values are the number of occurrences.
* use pandas data frame to create graphlab sframe
* save sframe 
* transform sframe to bag-of-words (bow) model

In [10]:
sf = gl.SFrame(data=speech_df)
sf.save("%s/sukarno_jogyakarta.sframe" % BASE_DIR)

In [11]:
bow = gl.text_analytics.count_words(sf['speech'])
sf['bow'] = bow
bow

dtype: dict
Rows: 1
[{'all': 28, 'ever.': 1, 'arrested.': 1, 'people': 44, 'resistance': 1, 'unfurl': 6, 'colonising.': 2, 'fields.': 1, 'ambassadors,': 1, 'go': 1, 'religious': 1, 'tricks,': 1, '21st': 4, '92': 1, 'graves': 1, 'imperialism.': 6, '[11]': 1, 'colonisation,': 1, 'including': 2, '1945': 4, 'proposal.': 2, "mook's": 1, 'colonisation".': 1, 'those': 8, 'unfurled': 1, 'under': 14, 'jogjakarta,': 3, 'sovereignty': 3, 'pleasing': 1, 'friendly': 2, 'carried': 2, 'decision.': 1, 'repre-': 1, 'returned': 3, 'and,': 5, 'far': 1, 'came,': 1, 'dutch': 39, '"tour': 1, 'responsibility,': 1, 'me': 6, 'cemeteries': 1, 'putting': 3, 'accuse': 1, 'asia,': 2, 'attractive.': 1, 'yourselves': 1, 'more.': 1, 'today,': 3, 'islands.': 1, 'decoio': 1, 'established,': 1, 'merauke.': 1, 'entire': 8, 'merauke,': 1, 'successful.': 1, '17th': 6, '[9]': 1, 'called': 9, 'did': 5, 'die': 1, 'proposals': 1, 'make': 2, 'bay.': 1, 'these': 6, 'archipelago': 2, 'hague': 1, 'independence': 6, 'arrested': 1, 

## TF-IDF

* Another useful representation for text data is called TF-IDF (term frequency - inverse document frequency).
* This is a modification of the bag-of-words format where the counts are transformed into scores: 
    * words that are common across the document corpus are given low scores,  
    rare words occurring often in a document are given high scores

* TF-IDF(word,document)=N(word,document)∗log(1/∑dN(word,d)))
    * where N(w, d) is the number of times word w occurs in document d

In [12]:
sf['tfidf'] = gl.text_analytics.tf_idf(sf['bow'])
sf['tfidf']

dtype: dict
Rows: 1
[{'all': 0.0, 'ever.': 0.0, 'arrested.': 0.0, 'people': 0.0, 'resistance': 0.0, 'unfurl': 0.0, 'colonising.': 0.0, 'fields.': 0.0, 'ambassadors,': 0.0, 'go': 0.0, 'religious': 0.0, 'tricks,': 0.0, '21st': 0.0, '92': 0.0, 'graves': 0.0, 'imperialism.': 0.0, '[11]': 0.0, 'colonisation,': 0.0, 'including': 0.0, '1945': 0.0, 'proposal.': 0.0, "mook's": 0.0, 'colonisation".': 0.0, 'those': 0.0, 'unfurled': 0.0, 'under': 0.0, 'jogjakarta,': 0.0, 'sovereignty': 0.0, 'pleasing': 0.0, 'friendly': 0.0, 'carried': 0.0, 'decision.': 0.0, 'repre-': 0.0, 'returned': 0.0, 'and,': 0.0, 'far': 0.0, 'came,': 0.0, 'dutch': 0.0, '"tour': 0.0, 'responsibility,': 0.0, 'me': 0.0, 'cemeteries': 0.0, 'putting': 0.0, 'accuse': 0.0, 'asia,': 0.0, 'attractive.': 0.0, 'yourselves': 0.0, 'more.': 0.0, 'today,': 0.0, 'islands.': 0.0, 'decoio': 0.0, 'established,': 0.0, 'merauke.': 0.0, 'entire': 0.0, 'merauke,': 0.0, 'successful.': 0.0, '17th': 0.0, '[9]': 0.0, 'called': 0.0, 'did': 0.0, 'die': 0

## Text cleaning

* remove all words do not occur at least twice in each document using

In [13]:
docs = sf['bow'].dict_trim_by_values(2)

* GraphLab Create also contains a helper function called stopwords that returns a list of common words.
We can use SArray.docs.dict_trim_by_keys to remove these words from the documents as a preprocessing step.
NB: Currently only English words are available.

In [14]:
docs = docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)

In [15]:
docs[0]

{'"papua': 5,
 '"state': 7,
 '(the': 2,
 '-': 17,
 '16': 2,
 '17th': 6,
 '1945': 4,
 '1947,': 3,
 '1948,': 2,
 '1948.': 2,
 '1949,': 4,
 '1961.': 2,
 '19th': 4,
 '21st': 4,
 '27th': 6,
 ':': 3,
 '?': 3,
 'academy,': 2,
 'accompanied': 2,
 'act.': 2,
 'action': 7,
 'actions': 2,
 'again.': 2,
 'ago': 3,
 'agreed': 2,
 'agung': 2,
 'all,': 2,
 'almighty': 3,
 'alone.': 2,
 'ambassadors': 2,
 'and,': 5,
 'anthem".': 3,
 'archipelago': 2,
 'argument': 2,
 'armed': 11,
 'army': 2,
 'asia,': 2,
 'association': 2,
 'attention': 2,
 'august': 5,
 'authority': 10,
 'authority.': 3,
 'before.': 3,
 'began': 2,
 'beginning': 2,
 'begun': 2,
 'bothered': 3,
 'bring': 10,
 'but,': 3,
 'call': 3,
 'called': 9,
 'carried': 2,
 'carry': 2,
 'carrying': 6,
 'change': 2,
 'chief': 3,
 'chiefs': 2,
 'city': 2,
 'colonisation.': 3,
 'colonising.': 2,
 'command': 15,
 'command,': 2,
 'command.': 3,
 'command?': 2,
 'commander': 5,
 'common': 3,
 'completely': 2,
 'conference': 3,
 'conference,': 3,
 'confi

* the tokenizer transforms each row into an ordered list of strings that represents the a simpler version of the Penn-Tree-Bank-style (PTB-style) tokenization of that row's document. 
* The representation of a document provided by PTB-style of tokenization is essential for sequence-tagging, parsing, bag-of-words treatment, and any text analytics task that requires word-level granularity. For a description of this style of tokenization, see [tokenization example](https://www.cis.upenn.edu/~treebank/tokenization.html.)

In [16]:
tokenized_speech = gl.SFrame()
tokenized_speech['tokens'] = gl.text_analytics.tokenize(sf['speech'])
tokenized_speech

tokens
"[Friends, As, was, said, by, the, Sultan, just, ..."


## Part of Speech Extraction

* Highlight unique nouns in your text, identify adjectives with the high sentiment scores, or pull out nouns to generate candidate entities. 
The extract_parts_of_speech method parses the text in each element and extracts the words that are a given part of speech. 
For instance, to find all instances of adjectives:

In [17]:
parts_of_speech = gl.SFrame()
parts_of_speech['adjectives'] = gl.text_analytics.extract_parts_of_speech(sf['speech'],
                                                  chosen_pos=[gl.text_analytics.PartOfSpeech.ADJ])
parts_of_speech

adjectives
"{'ADJ': {'all': 5, 'heartfelt': 1, 'human': ..."


## Sentence Splitting

* The sentence splitter splits by sentence and outputs a list of sentences. 
This aids in anlysis at the sentence level.
For example, you may want a sentiment score for each sentence in a document.
The following command accomplishes this for you:

In [18]:
sentences = gl.SFrame()
sentences['sent'] = gl.text_analytics.split_by_sentence(sf['speech'])
sentences

sent
"[Friends As was said by the Sultan just now, ..."


## Create a topic model

In [19]:
docs = gl.text_analytics.count_words(sf['speech'])
docs = docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)

* Learn topic model

In [20]:
model = gl.topic_model.create(docs)

In [21]:
print model.get_topics()

+-------+----------+-----------------+
| topic |   word   |      score      |
+-------+----------+-----------------+
|   0   | command  | 0.0513430805848 |
|   0   | friends, | 0.0513430805848 |
|   0   | struggle | 0.0411424685481 |
|   0   |  irian.  | 0.0275416524991 |
|   0   |   fact   | 0.0275416524991 |
|   1   |    -     | 0.0508777149658 |
|   1   |  spirit  |  0.047902409997 |
|   1   |  called  | 0.0270752752157 |
|   1   |  said:   | 0.0181493603094 |
|   1   | people.  | 0.0181493603094 |
+-------+----------+-----------------+
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [22]:
print model.get_topics(output_type='topic_words')

+-------------------------------+
|             words             |
+-------------------------------+
| [command, friends,, strugg... |
| [-, spirit, called, people... |
| [order, fire, imperialism.... |
| [republic, irian, indonesi... |
| [indonesian, armed, white,... |
| [indonesia., united, bring... |
| [dutch, now,, indonesia,, ... |
| [west, people, territory, ... |
| [socialist, people,, state... |
| [yes,, flying, un,, nation... |
+-------------------------------+
[10 rows x 1 columns]



In [23]:
model

Class                          : TopicModel

Schema
------
Vocabulary Size                : 1051

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 1.0156
Verbose                        : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

* To predict the topic of a given document, one can get an SArray of integers containing the most probable topic ids:


In [24]:
pred = model.predict(docs)

* Combining the above method with standard SFrame capabilities, one can use predict to find documents related to a particular topic


In [25]:
docs_in_topic_0 = docs[model.predict(docs) == 0]

In [26]:
pred = model.predict(docs, output_type='probability')
pred

dtype: array
Rows: 1
[array('d', [0.06084115965700286, 0.10085749285422621, 0.0726827276439363, 0.13679052674561046, 0.10208248264597795, 0.08125765618619844, 0.13352388730093917, 0.1416904859126174, 0.07186606778276848, 0.09840751327072274])]

In [27]:
model['vocabulary']

dtype: str
Rows: 1051
['3.', 'colonialism', 'perform', 'context', '[15]', 'read', 'executed', 'sign', 'prepare', 'cemeteries', 'adorn', "hero's", 'lie', 'killed', 'monginsidi', 'movement.', 'hasannudin.', 'untung', 'djoko', 'teuku', 'bondjol.', 'imam', 'power', "coen's", 'hanjokrokusumo,', 'i,', 'fight', 'loves', 'hearts', 'continuing', '[14]', 'commander', "sukarno's", 'support', 'word', 'desire', 'expression', 'supreme', 'giving', 'groups', 'workers,', 'groups.', 'religious', 'land', 'khaki', 'me.', 'republic?', '[sic]', 'urges', 'enjoy', 'actions,', 'briefly,', 'die.', 'speculating,', 'prayed', 'hope', 'already.', 'sukarno.', 'speaking.', 'onward!', '2,000', 'helping', 'africa,', 'peoples,', 'million', 'indonesia!', 'repeatedly,', 'onwards.', '[13]', 'make.', 'friends.', 'oppose', 'demanding', 'carry', 'trip', 'unitary', 'establish', 'dies.', 'ed.)', 'ditiro.', 'imperialists,', 'vindicate', 'they,', 'side?', 'thankful', 'wrong', 'intent', 'neutral', 'policy.', 'middle,', '1.', 'imag

In [28]:
model['topics']

topic_probabilities,vocabulary
"[0.000340020401224, 0.000297530496876, ...",3.
"[0.000340020401224, 0.000297530496876, ...",colonialism
"[0.000340020401224, 0.000297530496876, ...",perform
"[0.000340020401224, 0.000297530496876, ...",context
"[0.000340020401224, 0.000297530496876, ...",[15]
"[0.000340020401224, 0.000297530496876, ...",read
"[0.000340020401224, 0.000297530496876, ...",executed
"[0.000340020401224, 0.000297530496876, ...",sign
"[0.000340020401224, 0.000297530496876, ...",prepare
"[0.000340020401224, 0.000297530496876, ...",cemeteries


* save and load models

In [29]:
os.chdir("../text_analysis/data/")

In [30]:
model.save('sukarno_jogyakarta_model')
sukarno_model = gl.load_model('sukarno_jogyakarta_model')