In [1]:
import os
import re
import urllib2
import pandas as pd
import graphlab as gl
from bps_utils import get_Sukarno_speech, convert_pdf_to_txt

In [2]:
# Switch the working directory to the sibling `data` folder; the relative file
# names used in later cells ('sukarno_speech.pdf', 'my_model') resolve against it.
os.chdir('../data/')

In [3]:
# Make sure the speech PDF is available, using the project helper from bps_utils.
# NOTE(review): presumably this skips the work when the file is already present —
# the recorded output reports ('sukarno_speech.pdf', 'already exists'); confirm in bps_utils.
get_Sukarno_speech()

('sukarno_speech.pdf', 'already exists')


In [4]:
# Extract the text of the PDF with the bps_utils helper; the result is sliced
# like a string in the next cell.
sukarno_speech = convert_pdf_to_txt('sukarno_speech.pdf')

In [5]:
# Preview an excerpt of the extracted text (characters 1167 up to, not including, 2009).
sukarno_speech[1167:2009]

'Your Excellencies,Ladies and Gentlemen, Sisters and Brothers.It is my great honour and privilege on this historic day to bid you welcome to Indonesia. On behalf of the people and government of Indonesia - your hosts - I beg your understanding and forbearance if some circumstances in our country do not meet your expectation. We have, I assure you, done our best to make your stay amongst us memorable for both our guests and your hosts. We hope that the warmth of our welcome will compensate for whatever material shortcomings there may be.As I survey this hall and the distinguished guests gathered here, my heart is filled with emotion. This is the first intercontinental conference of coloured peoples in the history of mankind! I am proud that my country is your host. I am happy that you were able to accept the invitations extended by '

In [6]:
# One-row DataFrame (row label 1) whose 'col1' cell holds the full speech text.
index = [1]
d = dict(col1=sukarno_speech)
df = pd.DataFrame(d, index=index)

In [7]:
# Directory the SFrame is saved into by the next code cell.
# NOTE(review): hard-coded, home-relative path — adjust it to match your checkout.
BASE_DIR = "~/repos/statistics-indonesia-python/sentiment_analysis/data" 

For more information on text analysis with Turi SFrames, see this [documentation](https://turi.com/learn/userguide/text/intro.html).

In [8]:
# Convert the pandas frame into an SFrame and persist it under BASE_DIR.
sframe_path = "%s/sukarno.sframe" % BASE_DIR
sf = gl.SFrame(df)
sf.save(sframe_path)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1469837069.log


This non-commercial license of GraphLab Create for academic use is assigned to kpolimis@u.washington.edu and will expire on July 29, 2017.


In [9]:
# Bag-of-words view of the text column: the result is an SArray of dicts, one per row.
speech_text = sf['col1']
bow = gl.text_analytics.count_words(speech_text)
bow

dtype: dict
Rows: 1

In [10]:
# Tokenize the speech and hold the token lists in a single-column SFrame.
tokenized_speech = gl.SFrame({'tokens': gl.text_analytics.tokenize(sf['col1'])})
tokenized_speech

tokens
"[Address, given, by, Sukarno, (Bandung,, 18, ..."


## Part of Speech Extraction

In [11]:
# Keep only the adjectives: restrict part-of-speech extraction to the ADJ tag
# and store the resulting counts in a single-column SFrame.
adjective_tag = gl.text_analytics.PartOfSpeech.ADJ
parts_of_speech = gl.SFrame({
    'adjectives': gl.text_analytics.extract_parts_of_speech(sf['col1'], chosen_pos=[adjective_tag]),
})
parts_of_speech

adjectives
"{'ADJ': {'exclusive': 2, 'all': 8, 'manifold': 1, ..."


## Sentence Splitting

In [12]:
# Break the speech into sentences and hold them in a single-column SFrame.
sentences = gl.SFrame({'sent': gl.text_analytics.split_by_sentence(sf['col1'])})
sentences

sent
"[Address given by Sukarno (Bandung, 18 April 19 ..."


## Create a topic model

In [13]:
# Word counts for the speech with GraphLab's built-in stopword list removed.
# NOTE(review): the recorded vocabulary contains tokens with punctuation still
# attached (e.g. 'independence,' and 'world,'), so the counts look split on
# whitespace only — consider stripping punctuation before modelling.
stopword_set = gl.text_analytics.stopwords()
docs = gl.text_analytics.count_words(sf['col1']).dict_trim_by_keys(stopword_set, exclude=True)

# Fit a topic model on the trimmed counts, leaving every setting at its default.
model = gl.topic_model.create(docs)

In [14]:
# Show the top words per topic with their scores. Calling print(...) with a
# single argument emits the same text under Python 2 and also parses under
# Python 3, unlike the bare print statement.
print(model.get_topics())

+-------+---------------+-----------------+
| topic |      word     |      score      |
+-------+---------------+-----------------+
|   0   |      part     | 0.0317310713164 |
|   0   | independence, | 0.0160226201697 |
|   0   |    sisters    | 0.0160226201697 |
|   0   |     world,    | 0.0128809299403 |
|   0   |    highest    | 0.0128809299403 |
|   1   |   countries   | 0.0313228061092 |
|   1   |     years     | 0.0183794977996 |
|   1   |      made     | 0.0183794977996 |
|   1   |      hope     | 0.0183794977996 |
|   1   |    foreign    | 0.0132021744758 |
+-------+---------------+-----------------+
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [15]:
# Show the topics again, this time with each topic's top words gathered into one
# list per row. print(...) with a single argument behaves identically under
# Python 2 and stays valid syntax under Python 3.
print(model.get_topics(output_type='topic_words'))

+-------------------------------+
|             words             |
+-------------------------------+
| [part, sisters, independen... |
| [countries, years, made, h... |
| [8, african, learned, stre... |
| [-, conference, independen... |
| [nations, africa, long, po... |
| [asia, man, give, 21/09/20... |
| [great, common, parts, gat... |
| [peoples, world, struggle,... |
| [today, asian-african, man... |
| [asian, /, understanding, ... |
+-------------------------------+
[10 rows x 1 columns]



In [16]:
# Display the fitted model's summary (vocabulary size, settings, accessible fields).
model

Class                          : TopicModel

Schema
------
Vocabulary Size                : 1413

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 1.0136
Verbose                        : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

In [17]:
# Most probable topic id for each document, returned as an SArray of integers.
pred = model.predict(docs)

# Combine the prediction with ordinary SArray filtering to pull out the
# documents assigned to a particular topic. Reuse `pred` rather than calling
# predict() a second time: the extra call repeats the inference for no benefit,
# and if the inference is sampling-based the two calls are not guaranteed to agree.
docs_in_topic_0 = docs[pred == 0]

In [18]:
# Ask for a per-topic probability vector for each document instead of a single topic id.
# Note: this rebinds `pred`, which held the integer topic ids from the previous cell.
pred = model.predict(docs, output_type='probability')
pred

dtype: array
Rows: 1
[array('d', [0.07321350284962735, 0.11223147742218326, 0.09907935116177115, 0.1310828583954406, 0.10214818062253397, 0.12099956159579132, 0.08198158702323542, 0.0933800964489259, 0.07496711968434897, 0.11091626479614204])]

In [19]:
# Words in the fitted model's vocabulary (an SArray of strings).
model['vocabulary']

dtype: str
Rows: 1413
['!', 'speed', '!god', 'nations!bismillah', 'profitable', 'declare', 'united.as', 'majority.and', "humanity's", 'development', 'stunted', 'physical,', 'degradation,', 'bonds', 'liberation', 'solong', 'diminished', 'liberty.', 'sweet', 'remember', 'firmly', 'flints', 'strike', 'deliberations', 'ah,', 'hard.', 'easy.', '"to', 'sons:', 'rather,', 'subject', 'attain', 'safeguarded', 'yes,', 'evidence', 'afternoon-tea', 'abroad:', 'lie', 'falsify', 'achieve.', 'worthwhile,', 'happen,', 'problems', "other's", 'understand', "indonesia's", 'making', 'balinese,', 'other.if', 'consideration.', "others'", 'profit', 'roots.', 'experience,', 'neighbours.', 'confidence.', 'warm', 'unfriendly', 'born', 'looked', 'world.failure', 'price', 'presence', 'likelihood', 'pillar', 'welfare', 'effect', 'miles', 'harmony,', 'way,in', 'hold', 'discussion,ways', 'recently', 'friendly,', 'condition.yes,', 'brings', 'motives.how', 'diversity"', 'nation.so,', 'motto', 'god,', 'toradjas,', 'bhi

In [20]:
# SFrame pairing each vocabulary word with its list of topic probabilities.
model['topics']

topic_probabilities,vocabulary
"[0.00345585925228, 0.000258866166192, ...",!
"[0.000314169022934, 0.000258866166192, ...",speed
"[0.000314169022934, 0.00284752782811, ...",!god
"[0.000314169022934, 0.000258866166192, ...",nations!bismillah
"[0.000314169022934, 0.000258866166192, ...",profitable
"[0.000314169022934, 0.000258866166192, ...",declare
"[0.000314169022934, 0.000258866166192, ...",united.as
"[0.00345585925228, 0.000258866166192, ...",majority.and
"[0.000314169022934, 0.000258866166192, ...",humanity's
"[0.000314169022934, 0.000258866166192, ...",development


In [21]:
# Persist the fitted topic model to disk, then read it back to show the round trip.
model_path = 'my_model'

model.save(model_path)
new_model = gl.load_model(model_path)