In [1]:
import os
import re
import spacy
import urllib2
import requests
import pandas as pd
import graphlab as gl
from bs4 import BeautifulSoup
from utils import get_Sukarno_Bandung_speech, convert_pdf_to_txt

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1503275429.log


This non-commercial license of GraphLab Create for academic use is assigned to kivan.polimis@gmail.com and will expire on August 20, 2018.


In [2]:
# Switch the working directory to the sibling data folder (path is relative to the current directory).
os.chdir('../data/')

## Get President Sukarno's speeches
* Bandung Conference  
  18th April 1955  
  Bandung, Indonesia
  
  
* International Meeting  
  19th December 1961    
  Jogjakarta, Indonesia
  

### Bandung speech

In [3]:
# Obtain the Bandung speech PDF through the project helper imported from utils.
# The recorded output below shows it reporting that 'sukarno_bandung_speech.pdf' already exists.
get_Sukarno_Bandung_speech()

('sukarno_bandung_speech.pdf', 'already exists')


* Convert the .pdf of speech to text

In [4]:
# Extract the text of the Bandung speech PDF with the project helper imported from utils.
sukarno_bandung_speech = convert_pdf_to_txt('sukarno_bandung_speech.pdf')

* subset the speech to remove citation material in document heading

In [5]:
# Skip the citation material in the document heading: keep everything from offset 1167 on.
sukarno_bandung_speech = sukarno_bandung_speech[1167:]
# Preview the first 829 characters of the trimmed speech.
sukarno_bandung_speech[:829]

'Your Excellencies,Ladies and Gentlemen, Sisters and Brothers.It is my great honour and privilege on this historic day to bid you welcome to Indonesia. On behalf of the people and government of Indonesia - your hosts - I beg your understanding and forbearance if some circumstances in our country do not meet your expectation. We have, I assure you, done our best to make your stay amongst us memorable for both our guests and your hosts. We hope that the warmth of our welcome will compensate for whatever material shortcomings there may be.As I survey this hall and the distinguished guests gathered here, my heart is filled with emotion. This is the first intercontinental conference of coloured peoples in the history of mankind! I am proud that my country is your host. I am happy that you were able to accept the invitations'

### Jogyakarta speech 

In [6]:
# Download the Jogjakarta speech transcript and reduce the HTML page to plain text.
url = 'http://papuaweb.org/goi/pidato/1961-12-jogyakarta.html'
# A timeout keeps "Run All" from hanging indefinitely when the host does not respond.
r = requests.get(url, timeout=30)
# Fail loudly on HTTP errors so an error page is never parsed (and later sliced) as the speech.
r.raise_for_status()
html_content = r.text
soup = BeautifulSoup(html_content, 'lxml')
# Remove doubled newlines first, then replace the remaining CRLF line breaks with spaces.
sukarno_jogyakarta_speech = soup.get_text().replace('\n\n', '')
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech.replace('\r\n', " ")

* subset the speech to remove citation material in document heading and ending

In [7]:
# Keep only the speech body: drop the first 708 characters (heading) and the last 66 (ending).
sukarno_jogyakarta_speech = sukarno_jogyakarta_speech[708:-66]

* Convert the text to a pandas dataframe 
* easy to transform pandas dataframe to graphlab sframe
* for more info on text analysis with turi sframes see this [documentation](https://turi.com/learn/userguide/text/intro.html)

In [8]:
# One row per speech: row 0 holds the Bandung text, row 1 the Jogjakarta text.
speech_data = dict(speech=[sukarno_bandung_speech, sukarno_jogyakarta_speech])
index = list(range(2))
speech_df = pd.DataFrame(speech_data, index=index)

In [9]:
# Display the DataFrame to confirm that both speeches were loaded.
speech_df

Unnamed: 0,speech
0,"Your Excellencies,Ladies and Gentlemen, Sister..."
1,"Friends As was said by the Sultan just now, t..."


In [10]:
# Directory (under the user's home) where the SFrame built from the speeches is saved.
BASE_DIR = '~/repos/statistics-indonesia-python/text_analysis/data'

## Bag-of-words

* each document is represented by a map where the words are keys and the values are the number of occurrences.
* use pandas data frame to create graphlab sframe
* save sframe 
* transform sframe to bag-of-words (bow) model

In [11]:
# Convert the pandas DataFrame into a GraphLab SFrame and persist it under BASE_DIR.
sf = gl.SFrame(speech_df)
sframe_path = "{}/sukarno.sframe".format(BASE_DIR)
sf.save(sframe_path)

In [12]:
# Count word occurrences per speech; keep the result both as a column of sf and as `bow`.
sf['bow'] = bow = gl.text_analytics.count_words(sf['speech'])
bow

dtype: dict
Rows: 2

## TF-IDF

* Another useful representation for text data is called TF-IDF (term frequency - inverse document frequency).
* This is a modification of the bag-of-words format where the counts are transformed into scores: 
    * words that are common across the document corpus are given low scores,  
    rare words occurring often in a document are given high scores

* $\text{TF-IDF}(w, d) = N(w, d) \cdot \log\left(1 \,/\, \sum_{d'} N(w, d')\right)$
    * where N(w, d) is the number of times word w occurs in document d

In [13]:
# Turn the raw word counts into TF-IDF scores and store them as a new column.
tfidf_scores = gl.text_analytics.tf_idf(sf['bow'])
sf['tfidf'] = tfidf_scores
sf['tfidf']

dtype: dict
Rows: 2

## Text cleaning

* Remove all words that do not occur at least twice within a document, using `dict_trim_by_values`

In [14]:
# Keep only the words whose count within a speech is at least 2.
bow_counts = sf['bow']
docs = bow_counts.dict_trim_by_values(lower=2)

* GraphLab Create also contains a helper function called stopwords that returns a list of common words.
We can use `SArray.dict_trim_by_keys` to remove these words from the documents as a preprocessing step.
NB: Currently only English words are available.

In [15]:
# Remove GraphLab's built-in stop words from every document's word counts.
english_stopwords = gl.text_analytics.stopwords()
docs = docs.dict_trim_by_keys(english_stopwords, exclude=True)

In [16]:
# Inspect the trimmed word counts of the first document (the Bandung speech).
docs[0]

{'"live': 4,
 '-': 29,
 '/': 6,
 '21/09/2015': 6,
 '8': 7,
 'achieved': 2,
 'act': 4,
 'affairs': 3,
 'africa': 13,
 'africa,': 3,
 'african': 6,
 'ago': 5,
 'ago,': 2,
 'aims': 2,
 'alien': 2,
 'allowed': 2,
 'and,': 2,
 'answer': 2,
 'anti-colonial': 2,
 'appreciation': 2,
 'asia': 21,
 'asian': 10,
 'asian-african': 7,
 'assembled': 2,
 'attained': 2,
 'battle': 3,
 'bear': 3,
 'beg': 3,
 'behalf': 2,
 'beliefs,': 3,
 'bid': 2,
 'birthplaces': 2,
 'bitter': 2,
 'blessing': 2,
 'bombs,': 2,
 'bonds': 4,
 'brothers,': 6,
 'called': 4,
 'cannot,': 2,
 'causes,': 2,
 'changing': 2,
 'children': 2,
 'classic': 2,
 'clear': 3,
 'code': 2,
 'colonialism': 8,
 'colonialism.': 2,
 'comfort': 3,
 'common': 9,
 'completely': 3,
 'concern': 2,
 'conference': 22,
 'conference,': 3,
 'conflict': 2,
 'content': 3,
 'continents': 5,
 'continents.': 3,
 'control': 4,
 'control,': 2,
 'countries': 11,
 'countries.': 2,
 'country': 5,
 'courage': 3,
 'cultural': 2,
 'danger': 2,
 'day': 2,
 'days': 2,

* The tokenizer transforms each row into an ordered list of strings that represents a simpler version of the Penn-Tree-Bank-style (PTB-style) tokenization of that row's document. 
* The representation of a document provided by PTB-style tokenization is essential for sequence-tagging, parsing, bag-of-words treatment, and any text analytics task that requires word-level granularity. For a description of this style of tokenization, see this [tokenization example](https://www.cis.upenn.edu/~treebank/tokenization.html).

In [17]:
# Tokenize each speech into an ordered list of tokens and wrap the result in its own SFrame.
speech_tokens = gl.text_analytics.tokenize(sf['speech'])
tokenized_speech = gl.SFrame({'tokens': speech_tokens})
tokenized_speech

tokens
"[Your, Excellencies,Ladies, ..."
"[Friends, As, was, said, by, the, Sultan, just, ..."


## Part of Speech Extraction

* Highlight unique nouns in your text, identify adjectives with the high sentiment scores, or pull out nouns to generate candidate entities. 
The extract_parts_of_speech method parses the text in each element and extracts the words that are a given part of speech. 
For instance, to find all instances of adjectives:

In [18]:
# Load spaCy's English model; the returned object is not kept.
# NOTE(review): presumably this only checks that the model is available before GraphLab needs it - confirm.
spacy.load('en')
# Extract the adjectives found in each speech, with their counts.
adjective_tag = gl.text_analytics.PartOfSpeech.ADJ
adjective_counts = gl.text_analytics.extract_parts_of_speech(sf['speech'],
                                                             chosen_pos=[adjective_tag])
parts_of_speech = gl.SFrame({'adjectives': adjective_counts})
parts_of_speech



adjectives
"{'ADJ': {'exclusive': 1, 'all': 7, 'manifold': 1, ..."
"{'ADJ': {'all': 6, 'heartfelt': 1, 'human': ..."


## Sentence Splitting

* The sentence splitter splits by sentence and outputs a list of sentences. 
This aids in analysis at the sentence level.
For example, you may want a sentiment score for each sentence in a document.
The following command accomplishes this for you:

In [19]:
# Split each speech into a list of sentences and wrap the result in its own SFrame.
speech_sentences = gl.text_analytics.split_by_sentence(sf['speech'])
sentences = gl.SFrame({'sent': speech_sentences})
sentences

sent
"[Your Excellencies,Ladies and Gentlemen, Sisters ..."
"[Friends As was said by the Sultan just now, ..."


## Create a topic model

In [20]:
# Rebuild the word counts from the raw speeches and drop stop words in one pass.
# Unlike the text-cleaning step, no minimum-count trimming is applied here.
docs = (
    gl.text_analytics.count_words(sf['speech'])
    .dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)
)

* Learn topic model

In [21]:
# Fit a topic model on the stop-word-filtered word counts; no hyperparameters are passed,
# so the library defaults apply (see the model summary printed further down).
model = gl.topic_model.create(docs)

In [22]:
# Print the most probable words per topic with their scores.
print(model.get_topics())

+-------+---------------+-----------------+
| topic |      word     |      score      |
+-------+---------------+-----------------+
|   0   |   conference  | 0.0409662151134 |
|   0   |      now,     |  0.024645013873 |
|   0   |     africa    |  0.023012893749 |
|   0   |     defeat    | 0.0164844132528 |
|   0   |  colonialism  | 0.0148522931288 |
|   1   |    nations    | 0.0267872982083 |
|   1   |     called    | 0.0161433386553 |
|   1   |     result    | 0.0143693453965 |
|   1   | asian-african | 0.0125953521377 |
|   1   |    strength   | 0.0108213588788 |
+-------+---------------+-----------------+
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [23]:
# Print the same topics with each topic's top words gathered into one list per row.
print(model.get_topics(output_type='topic_words'))

+-------------------------------+
|             words             |
+-------------------------------+
| [conference, now,, africa,... |
| [nations, called, result, ... |
| [struggle, spirit, indepen... |
| [territory, irian,, author... |
| [people, indonesian, give,... |
| [asia, state, friends,, to... |
| [irian, white, countries, ... |
| [united, years, great, pol... |
| [dutch, world, peoples, pa... |
| [west, -, republic, indone... |
+-------------------------------+
[10 rows x 1 columns]



In [24]:
# Display the model summary: vocabulary size, settings and accessible fields.
model

Class                          : TopicModel

Schema
------
Vocabulary Size                : 2187

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 1.0177
Verbose                        : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

* To predict the topic of a given document, one can get an SArray of integers containing the most probable topic ids:


In [25]:
# Predict the most probable topic id for each document.
pred = model.predict(docs)

* Combining the above method with standard SFrame capabilities, one can use predict to find documents related to a particular topic


In [26]:
# Build a mask of the documents whose predicted topic id is 0, then select them.
in_topic_0 = model.predict(docs) == 0
docs_in_topic_0 = docs[in_topic_0]

In [27]:
# Request a probability for every topic per document instead of a single topic id.
# This rebinds `pred`, replacing the topic ids computed earlier.
pred = model.predict(docs, output_type='probability')
pred

dtype: array
Rows: 2
[array('d', [0.11121281464530892, 0.12723112128146452, 0.09565217391304348, 0.0471395881006865, 0.06453089244851258, 0.14096109839816934, 0.0782608695652174, 0.10709382151029748, 0.16704805491990846, 0.06086956521739131]), array('d', [0.06737443854634545, 0.03879134340547162, 0.09963250306247448, 0.1175990200081666, 0.18538178848509596, 0.06941608819926501, 0.07145773785218457, 0.0865659452837893, 0.08370763576970192, 0.1800734993875051])]

In [28]:
# Show the words in the model's vocabulary.
model['vocabulary']

dtype: str
Rows: 2187
['!', 'speed', 'nations!bismillah', 'opened,', 'declare', 'million', 'president', 'africans', 'asians', 'majority.and', "humanity's", 'degradation,', 'liberation', 'solong', 'diminished', 'blessing', 'remember', 'future.', 'circumstances.let', 'flints', 'strike', 'deliberations', 'hard.', 'easy.', '"to', 'sons:', 'attain', 'safeguarded', 'evidence', 'afternoon-tea', 'abroad:', 'lie', 'falsify', 'achieve.', 'worthwhile,', 'happen,', 'problems', "other's", 'understand', 'making', 'other.if', "others'", 'profit', 'roots.', 'experience,', 'neighbours.', 'confidence.', 'warm', 'unfriendly', 'born', 'looked', 'world.failure', 'presence', 'likelihood', 'pillar', 'welfare', 'effect', 'harmony,', 'way,in', 'hold', 'discussion,ways', 'friendly,', 'brings', 'diversity"', 'nation.so,', 'motto', 'god,', 'toradjas,', 'bhinneka', 'madurese,', 'bataks,', 'achenese,', 'units,', 'ethnic', 'moreover,', 'christians,', 'wehave', 'acting', 'lives,', 'large', 'hardwon', 'bulwark', 'sour

In [29]:
# Show the topics SFrame: one row per vocabulary word with its per-topic probabilities.
model['topics']

topic_probabilities,vocabulary
"[0.000163212012404, 0.00195139258471, ...",!
"[0.000163212012404, 0.000177399325883, ...",speed
"[0.000163212012404, 0.000177399325883, ...",nations!bismillah
"[0.00179533213645, 0.000177399325883, ...","opened,"
"[0.00179533213645, 0.000177399325883, ...",declare
"[0.000163212012404, 0.000177399325883, ...",million
"[0.000163212012404, 0.000177399325883, ...",president
"[0.00179533213645, 0.000177399325883, ...",africans
"[0.000163212012404, 0.000177399325883, ...",asians
"[0.00179533213645, 0.000177399325883, ...",majority.and


* save and load models

In [30]:
# Change to the directory where the trained model is saved by the next cell.
# NOTE(review): this relative path is resolved from the directory set by the earlier
# os.chdir('../data/') - confirm it points where intended.
os.chdir("../text_analysis/data/")

In [31]:
# Persist the trained topic model in the working directory, then load it back from disk.
saved_model_name = 'sukarno_model'
model.save(saved_model_name)
sukarno_model = gl.load_model(saved_model_name)