In [1]:
import os
import re
import pdfkit
import urllib2
import pandas as pd
import graphlab as gl
import requests
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
from utils import get_Sukarno_Bandung_speech, convert_pdf_to_txt

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1471495354.log


This non-commercial license of GraphLab Create for academic use is assigned to kpolimis@u.washington.edu and will expire on July 29, 2017.


In [2]:
# Move the working directory to ../data/ so the relative file names used by
# later cells (e.g. the speech text file) resolve there.
# NOTE(review): the target is relative to whatever the working directory is when
# this cell runs, so the result depends on where the kernel was started.
os.chdir('../data/')

## Get President Suharto's Resignation Speech
* [21 May 1998](http://partners.nytimes.com/library/world/asia/052198indonesia-suharto-text.html)  
  Merdeka Palace
  Jakarta, Indonesia
  

* Chris Albon tutorial on [beautiful soup](http://chrisalbon.com/python/beautiful_soup_html_basics.html)

In [3]:
# Download the New York Times transcript of the resignation speech and reduce it
# to a single-line, ASCII-only unicode string.
suharto_resignation_url = 'http://partners.nytimes.com/library/world/asia/052198indonesia-suharto-text.html'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
suharto_resignation_url_requests = requests.get(suharto_resignation_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as the speech.
suharto_resignation_url_requests.raise_for_status()
suharto_resignation_url_html_content = suharto_resignation_url_requests.text
# Parse the HTML and keep only the text content of the page.
suharto_resignation_url_soup = BeautifulSoup(suharto_resignation_url_html_content, 'lxml')
# Encode to UTF-8 bytes and drop blank-line separators (pairs of newlines).
suharto_resignation_speech_text = suharto_resignation_url_soup.get_text().encode('utf8').replace('\n\n','')
# Decode back to unicode via UnicodeDammit, offering "ascii" as a candidate encoding.
suharto_resignation_speech_unicode = UnicodeDammit(suharto_resignation_speech_text, ["ascii"])
# Discard every non-ASCII character ('ignore'), leaving a plain-ASCII unicode string.
suharto_resignation_speech = suharto_resignation_speech_unicode.unicode_markup.encode('ascii', 'ignore').decode('ascii')
# Remove the remaining line breaks so the speech is one continuous line.
suharto_resignation_speech = suharto_resignation_speech.replace("\n","").replace("\r", "")

* subset the speech to remove citation material in document heading and ending

In [4]:
# Number of leading / trailing characters of citation material to discard.
header_chars = 247
footer_chars = 400
# NOTE: this rebinds the same name, so re-running the cell trims the text again;
# re-run the download cell first if this cell has to be repeated.
speech_end = len(suharto_resignation_speech) - footer_chars
suharto_resignation_speech = suharto_resignation_speech[header_chars:speech_end]
# Preview the first 100 characters to check that the trim starts at the speech.
suharto_resignation_speech[:100]

u'In the name of God the All-Mighty,   Fellow members of the nation and the motherland,   Assalamualai'

* save the speech to a text file 

In [5]:
# Write the trimmed speech to a text file in the current working directory.
# The context manager closes the file even if the write raises.
with open('suharto_resignation_speech.txt', 'w') as speech_file:
    speech_file.write(suharto_resignation_speech.encode('utf8'))

In [6]:
# Disabled: pdfkit needs the external wkhtmltox (wkhtmltopdf) tool to be installed
# before this call can work.
#pdfkit.from_url('http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html', 'sukarno_jogjakarta_speech.pdf')

* Convert the text to a pandas dataframe 
* easy to transform pandas dataframe to graphlab sframe
* for more info on text analysis with turi sframes see this [documentation](https://turi.com/learn/userguide/text/intro.html)

In [7]:
# One-row DataFrame (row label 1) whose single 'speech' column holds the full text.
speech_df = pd.DataFrame(
    data={'speech': suharto_resignation_speech},
    index=[1],
)

In [8]:
# Directory where the SFrame is saved. The leading "~" is expanded once here so
# the path also works with APIs that do not expand it themselves.
BASE_DIR = os.path.expanduser("~/repos/statistics-indonesia-python/text_analysis/data")

## Bag-of-words

* each document is represented by a map where the words are keys and the values are the number of occurrences.
* use pandas data frame to create graphlab sframe
* save sframe 
* transform sframe to bag-of-words (bow) model

In [9]:
# Convert the pandas frame into a GraphLab SFrame and persist it under BASE_DIR.
sf = gl.SFrame(data=speech_df)
sframe_path = "%s/suharto_resignation.sframe" % BASE_DIR
sf.save(sframe_path)

In [10]:
# Count word occurrences per document: one dict (word -> count) per row.
bow = gl.text_analytics.count_words(sf['speech'])
# Keep the counts next to the raw text as a new 'bow' column.
sf['bow'] = bow
# Display the counts. Punctuation stays attached to tokens in the output
# (e.g. 'indonesia,' and 'development.' appear as separate keys).
bow

dtype: dict
Rows: 1
[{'indonesia,': 3, 'development,': 2, 'all': 1, 'development.': 1, 'vacuum': 1, 'representatives': 2, 'not': 1, 'mistakes': 1, 'thanks': 1, 'stepping': 1, 'carefully': 1, '21,': 1, 'before': 1, 'but,': 1, 'based': 1, 'also': 1, 'with': 4, 'possible,': 1, 'day,': 1, 'should': 1, 'to': 12, 'maintaining': 1, '8': 2, 'earnestly': 1, 'has': 1, 'into': 1, 'committee': 2, 'continuity': 1, 'pancasila': 1, 'it,': 1, 'his': 1, 'views': 1, 'read': 1, 'express': 2, 'professor,': 1, 'cannot': 1, 'meeting.': 1, 'front': 1, 'during': 2, 'now': 1, 'day': 1, 'presidential': 1, '1998-2003.': 1, 'name': 1, 'ceased': 1, 'materialized': 1, 'this': 4, 'indonesian': 1, 'situation,': 1, 'wish': 1, 'b.j.': 1, 'prevent': 1, 'declared': 1, 'because': 2, 'prompted': 1, 'people': 2, 'national': 2, 'constitution.': 1, 'therefore,': 1, 'decided': 1, 'are': 2, 'our': 2, 'term,': 1, 'constitution,': 1, 'best': 1, 'leaders': 2, 'shown': 1, 'necessary.': 1, 'constitution': 1, 'for': 8, 'reform': 2, '

## TF-IDF

* Another useful representation for text data is called TF-IDF (term frequency - inverse document frequency).
* This is a modification of the bag-of-words format where the counts are transformed into scores: 
    * words that are common across the document corpus are given low scores,  
    rare words occurring often in a document are given high scores

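* $\text{TF-IDF}(w, d) = N(w, d) \cdot \log\left(\dfrac{D}{f(w)}\right)$
    * where $N(w, d)$ is the number of times word $w$ occurs in document $d$, $f(w)$ is the number of documents that contain $w$, and $D$ is the total number of documents
    * with a single document, $D = f(w) = 1$ for every word, so every score is $\log(1) = 0$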

In [11]:
# Turn the bag-of-words counts into TF-IDF scores and store them as a new column.
sf['tfidf'] = gl.text_analytics.tf_idf(sf['bow'])
# Display the scores.
# NOTE(review): with this one-row corpus every score shown is 0.0 — presumably
# because each word occurs in every document; confirm against the tf_idf docs.
sf['tfidf']

dtype: dict
Rows: 1
[{'indonesia,': 0.0, 'development,': 0.0, 'all': 0.0, 'development.': 0.0, 'vacuum': 0.0, 'representatives': 0.0, 'not': 0.0, 'mistakes': 0.0, 'thanks': 0.0, 'stepping': 0.0, 'carefully': 0.0, '21,': 0.0, 'before': 0.0, 'but,': 0.0, 'based': 0.0, 'also': 0.0, 'with': 0.0, 'possible,': 0.0, 'day,': 0.0, 'should': 0.0, 'to': 0.0, 'maintaining': 0.0, '8': 0.0, 'earnestly': 0.0, 'has': 0.0, 'into': 0.0, 'committee': 0.0, 'continuity': 0.0, 'pancasila': 0.0, 'it,': 0.0, 'his': 0.0, 'views': 0.0, 'read': 0.0, 'express': 0.0, 'professor,': 0.0, 'cannot': 0.0, 'meeting.': 0.0, 'front': 0.0, 'during': 0.0, 'now': 0.0, 'day': 0.0, 'presidential': 0.0, '1998-2003.': 0.0, 'name': 0.0, 'ceased': 0.0, 'materialized': 0.0, 'this': 0.0, 'indonesian': 0.0, 'situation,': 0.0, 'wish': 0.0, 'b.j.': 0.0, 'prevent': 0.0, 'declared': 0.0, 'because': 0.0, 'prompted': 0.0, 'people': 0.0, 'national': 0.0, 'constitution.': 0.0, 'therefore,': 0.0, 'decided': 0.0, 'are': 0.0, 'our': 0.0, 'term,

## Text cleaning

* remove all words that do not occur at least twice in a document using `dict_trim_by_values`

In [12]:
# Keep only the words whose count is at least 2 in the document.
docs = sf['bow'].dict_trim_by_values(2)

* GraphLab Create also contains a helper function called stopwords that returns a list of common words.
We can use `SArray.dict_trim_by_keys` to remove these words from the documents as a preprocessing step.
NB: Currently only English words are available.

In [13]:
# Drop the stop words: exclude=True removes the listed keys instead of keeping them.
# NOTE: this rebinds `docs`, so the cell depends on the previous trimming cell.
docs = docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)

In [14]:
# Inspect the cleaned word counts of the first (and only) document.
docs[0]

{'1945': 3,
 '8': 2,
 'article': 2,
 'aspirations': 2,
 'cabinet': 2,
 'committee': 2,
 'composition': 2,
 'council': 2,
 'development': 4,
 'development,': 2,
 'express': 2,
 'form': 2,
 'governing': 2,
 'implementing': 2,
 'indonesia': 2,
 'indonesia,': 3,
 'leaders': 2,
 'leadership': 3,
 'line': 2,
 'manner': 2,
 'nation': 4,
 'national': 2,
 'oath': 2,
 'people': 2,
 "people's": 4,
 'plan': 2,
 'president': 4,
 'reform': 2,
 'reforms': 3,
 'representatives': 2,
 'republic': 4,
 'seventh': 3,
 'state': 2,
 'taking': 2,
 'vice': 2}

* The tokenizer transforms each row into an ordered list of strings that represents a simpler version of the Penn-Tree-Bank-style (PTB-style) tokenization of that row's document. 
* The representation of a document provided by PTB-style tokenization is essential for sequence-tagging, parsing, bag-of-words treatment, and any text analytics task that requires word-level granularity. For a description of this style of tokenization, see this [tokenization example](https://www.cis.upenn.edu/~treebank/tokenization.html).

In [15]:
# Tokenize the speech and hold the token lists in a single-column SFrame.
speech_tokens = gl.text_analytics.tokenize(sf['speech'])
tokenized_speech = gl.SFrame({'tokens': speech_tokens})
tokenized_speech

tokens
"[In, the, name, of, God, the, All-Mighty,, Fel ..."


## Part of Speech Extraction

* Highlight unique nouns in your text, identify adjectives with the high sentiment scores, or pull out nouns to generate candidate entities. 
The extract_parts_of_speech method parses the text in each element and extracts the words that are a given part of speech. 
For instance, to find all instances of adjectives:

In [16]:
# Extract only the adjectives from the speech and keep the per-document counts
# in a single-column SFrame.
adjective_tag = gl.text_analytics.PartOfSpeech.ADJ
adjective_counts = gl.text_analytics.extract_parts_of_speech(sf['speech'], chosen_pos=[adjective_tag])
parts_of_speech = gl.SFrame({'adjectives': adjective_counts})
parts_of_speech

adjectives
"{'ADJ': {'national': 2, 'deep': 1, 'good': 1, ..."


## Sentence Splitting

* The sentence splitter splits by sentence and outputs a list of sentences. 
This aids in analysis at the sentence level.
For example, you may want a sentiment score for each sentence in a document.
The following command accomplishes this for you:

In [17]:
# Split the speech into sentences and hold the sentence lists in a single-column SFrame.
speech_sentences = gl.text_analytics.split_by_sentence(sf['speech'])
sentences = gl.SFrame({'sent': speech_sentences})
sentences

sent
"[In the name of God the All-Mighty, Fellow ..."


## Create a topic model

In [18]:
# Rebuild the word counts from the raw speech for the topic model.
# NOTE: this replaces the earlier `docs`, so the "count >= 2" trimming from the
# text-cleaning section is not applied here; only stop words are removed.
docs = gl.text_analytics.count_words(sf['speech'])
docs = docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)

* Learn topic model

In [19]:
# Fit a topic model with the library defaults (the summary printed further down
# reports 10 topics and 10 iterations).
# NOTE(review): no random seed is set here, so topics may differ between runs —
# TODO confirm whether topic_model.create can be seeded.
model = gl.topic_model.create(docs)

In [20]:
# Print the most probable words per topic with their scores (Python 2 print statement).
print model.get_topics()

+-------+------------+-----------------+
| topic |    word    |      score      |
+-------+------------+-----------------+
|   0   |    line    | 0.0677419354839 |
|   0   |  conveyed  | 0.0354838709677 |
|   0   |  council,  | 0.0354838709677 |
|   0   |   state.   | 0.0354838709677 |
|   0   |   state,   | 0.0354838709677 |
|   1   | leadership | 0.0939393939394 |
|   1   |  national  | 0.0636363636364 |
|   1   |    vice    | 0.0636363636364 |
|   1   |  declare   | 0.0333333333333 |
|   1   |  adequate  | 0.0333333333333 |
+-------+------------+-----------------+
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [21]:
# Print the same topics as one list of words per topic (one row per topic).
print model.get_topics(output_type='topic_words')

+-------------------------------+
|             words             |
+-------------------------------+
| [line, conveyed, state., s... |
| [leadership, national, vic... |
| [people's, reforms, tradit... |
| [president, nation, aspira... |
| [development, republic, 19... |
| [governing, prevent, assis... |
| [seventh, 8, state, leader... |
| [oath, article, cabinet, (... |
| [indonesia,, form, indones... |
| [composition, god, post, d... |
+-------------------------------+
[10 rows x 1 columns]



In [22]:
# Display the model summary: vocabulary size, settings and accessible fields.
model

Class                          : TopicModel

Schema
------
Vocabulary Size                : 160

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 1.0089
Verbose                        : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

* To predict the topic of a given document, one can get an SArray of integers containing the most probable topic ids:


In [23]:
# Most probable topic id for each document.
# NOTE: `pred` is rebound to probability vectors in a later cell.
pred = model.predict(docs)

* Combining the above method with standard SFrame capabilities, one can use predict to find documents related to a particular topic


In [24]:
# Select the documents whose predicted topic id is 0 using a boolean mask.
docs_in_topic_0 = docs[model.predict(docs) == 0]

In [25]:
# Per-document topic probabilities: one array per row, one entry per topic.
# NOTE: this overwrites the topic-id `pred` computed earlier.
pred = model.predict(docs, output_type='probability')
pred

dtype: array
Rows: 1
[array('d', [0.1076923076923077, 0.10384615384615385, 0.08461538461538462, 0.1346153846153846, 0.16923076923076924, 0.07307692307692308, 0.08846153846153847, 0.08461538461538462, 0.07692307692307693, 0.07692307692307693])]

In [26]:
# Display the words in the model's vocabulary.
model['vocabulary']

dtype: str
Rows: 160
['court', 'prevent', 'council,', 'front', 'conditions', 'thanks.', 'ministers', 'outgoing', 'too,', 'pancasila', 'remain', 'shortcomings.', 'plan', 'national', 'forgiveness', 'led', 'support', 'representative', 'mpr,', 'holder', 'term,', 'recent', 'consideration', 'engineer', '1998-2003.', 'doctor,', 'professor,', 'assembly,', 'consultative', 'you,', 'indonesian', 'remainder', '1998.', '21,', 'opinion', 'presidential', 'statement,', 'republic', 'thursday,', 'day', 'read', 'time', 'indonesia', 'president', 'ceased', 'it,', "people's", 'views', 'earnestly', 'continuity', 'constitution', 'line', 'state', 'constitution,', 'indonesia,', 'taking', 'duties', 'good', 'implement', 'cabinet.', 'cabinet', 'difficult', 'meeting.', 'leadership', 'representatives', 'prompted', 'longer', 'manner,', 'reality', 'life', 'necessary.', 'habibie', 'vacuum', 'aspirations', 'council', 'forming', 'faced', 'implementing', 'stepping', 'materialized', 'impossibility', 'shown', 'supreme', 'gr

In [27]:
# Display the topics SFrame: per-word topic probabilities alongside the vocabulary word.
model['topics']

topic_probabilities,vocabulary
"[0.00322580645161, 0.0030303030303, ...",court
"[0.00322580645161, 0.0030303030303, ...",prevent
"[0.0354838709677, 0.0030303030303, ...","council,"
"[0.00322580645161, 0.0030303030303, ...",front
"[0.00322580645161, 0.0030303030303, ...",conditions
"[0.00322580645161, 0.0030303030303, ...",thanks.
"[0.0354838709677, 0.0030303030303, ...",ministers
"[0.00322580645161, 0.0030303030303, ...",outgoing
"[0.00322580645161, 0.0333333333333, ...","too,"
"[0.00322580645161, 0.0030303030303, ...",pancasila


* save and load models

In [28]:
# Change the working directory again; the model below is saved relative to it.
# NOTE(review): the path is relative to the directory set by the earlier
# os.chdir call — presumably it resolves to the same folder as BASE_DIR; confirm.
os.chdir("../text_analysis/data/")

In [29]:
# Persist the trained topic model in the current working directory, then load it
# back to confirm the round trip works.
model.save('suharto_resignation_model')
suharto_model = gl.load_model('suharto_resignation_model')
# The reloaded object was originally bound to `sukarno_model` although it is the
# Suharto model; the old name is kept as an alias so existing references still work.
sukarno_model = suharto_model